In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as sch

In [None]:
# Load the dataset (the relative path resolves against the current working directory)
data = pd.read_csv('../data/renttherunway.csv')

# Display basic information
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

In [None]:
# Clean the loaded frame: de-duplicate, drop identifier/free-text columns,
# convert the string-encoded 'weight' and 'height' columns to numbers, and
# impute missing numeric values.

# Check for duplicate rows and drop them
data = data.drop_duplicates()

# Drop unnecessary columns
data = data.drop(['user_id', 'item_id', 'review_text', 'review_date'], axis=1)

# Clean 'weight' column: strip the literal 'lbs' suffix, then cast to float
data['weight'] = data['weight'].str.replace('lbs', '', regex=False).astype(float)

# Group 'party: cocktail' with 'party' in 'rented for'
data['rented for'] = data['rented for'].replace({'party: cocktail': 'party'})

# Convert 'height' to inches. Feet and inches are parsed separately: reading
# the text as a decimal number of feet is arithmetically wrong (5' 10" is 70
# inches, not 5.10 * 12) and cannot be cast to float when an inch mark follows.
# NOTE(review): assumes values look like 5' 8" (feet, apostrophe, optional
# inches) -- confirm against the CSV.
height_parts = data['height'].str.extract(r"(?P<feet>\d+)\s*'\s*(?P<inches>\d+)?")
feet = pd.to_numeric(height_parts['feet'], errors='coerce')
# A height with no inches component contributes zero extra inches
inches = pd.to_numeric(height_parts['inches'], errors='coerce').fillna(0)
# Missing or unparseable heights stay NaN here and are imputed just below
data['height'] = feet * 12 + inches

# Handle missing values: impute each numeric column with its own mean.
# Restricting to numeric columns avoids the TypeError that DataFrame.mean()
# raises on string columns in pandas >= 2.0. Non-numeric columns are left as-is.
numeric_columns = data.select_dtypes(include='number').columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Check statistical summary
print(data.describe())

In [None]:
# Treat outliers in 'age' by capping them at the 1.5 * IQR fences: values
# outside the fences are pulled back to the nearest fence, not dropped.
age_quartiles = np.percentile(data['age'], [25, 75])
q1 = age_quartiles[0]
q3 = age_quartiles[1]
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
data['age'] = data['age'].clip(lower_bound, upper_bound)

In [None]:
# Plot distribution of 'rented for': one bar per category, bar height = row count.
# The column is passed by keyword (x=..., data=...) because countplot's first
# positional parameter is `data`, not `x`, in current seaborn releases, so a
# bare positional Series is no longer interpreted as the variable to count.
sns.countplot(x='rented for', data=data)
plt.title('Distribution of Rented For')
plt.show()

In [None]:
# Encode categorical variables as integer codes. Each column gets its own
# fitted LabelEncoder, kept in `encoders`, so codes can be mapped back to the
# original labels later (a single shared encoder only remembers the last column).
# After the loop `encoder` is the encoder fitted on the last column.
encoders = {}
for column in ['rented for', 'body type', 'category', 'fit']:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    encoders[column] = encoder

# Standardize the data. StandardScaler cannot convert free-text values, so only
# numeric columns are scaled; when every column is numeric this is the whole frame.
numeric_data = data.select_dtypes(include='number')
skipped_columns = data.columns.difference(numeric_data.columns)
if len(skipped_columns):
    # Surface the exclusion instead of dropping columns silently
    print(f"Skipping non-numeric columns when scaling: {list(skipped_columns)}")

scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)

In [None]:
# Perform PCA to reduce dimensions. A fractional n_components asks PCA to keep
# however many leading components are needed to reach that share of variance.
pca = PCA(n_components=0.95)
pca_data = pca.fit_transform(scaled_data)

# Explained variance: total share retained by the components that were kept
retained_variance = sum(pca.explained_variance_ratio_)
print(f"Explained variance by PCA components: {retained_variance}")

In [None]:
# Elbow plot to determine optimal K: fit KMeans for K = 1..10 and record the
# within-cluster sum of squares (inertia) of each fit.
# n_init is pinned explicitly because its default differs between scikit-learn
# releases (10 vs 'auto'); pinning keeps results identical across versions and
# avoids the FutureWarning emitted while the default was changing.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(pca_data)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('Elbow Plot')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

# Build KMeans model with optimal K
# NOTE(review): K=4 is hard-coded, presumably read off the elbow plot above --
# re-check it whenever the preprocessing or data changes.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(pca_data)
# Labels are assigned positionally (plain array), one per row of pca_data
data['KMeans_Cluster'] = kmeans.labels_

# Silhouette score (mean over all samples, computed in the PCA space)
silhouette_avg = silhouette_score(pca_data, kmeans.labels_)
print(f"Silhouette Score for KMeans: {silhouette_avg}")

In [None]:
# Dendrogram to determine optimal K. Only the first 1000 rows of pca_data are
# linked, which keeps the tree small enough to draw.
dendrogram = sch.dendrogram(sch.linkage(pca_data[:1000], method='ward'))
plt.title('Dendrogram')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()

# Build Agglomerative model
# The distance keyword is omitted on purpose: Ward linkage only works with
# Euclidean distance, which is already the default, and the `affinity` keyword
# was renamed to `metric` and then removed in newer scikit-learn releases, so
# passing it fails there.
# NOTE(review): unlike the dendrogram, this fits on all of pca_data; hierarchical
# clustering scales poorly with row count -- confirm the dataset size is feasible.
agglom = AgglomerativeClustering(n_clusters=4, linkage='ward')
data['Agglomerative_Cluster'] = agglom.fit_predict(pca_data)

# Silhouette score (mean over all samples, computed in the PCA space)
silhouette_avg = silhouette_score(pca_data, agglom.labels_)
print(f"Silhouette Score for Agglomerative Clustering: {silhouette_avg}")

In [None]:
# Bivariate analysis for clusters: one box of 'age' values per KMeans cluster
ax = sns.boxplot(data=data, x='KMeans_Cluster', y='age')
ax.set_title('Age Distribution Across KMeans Clusters')
plt.show()