## 데이터 전처리

In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re


# Load the raw CSV once; `df` stays untouched and all preprocessing
# below operates on the independent copy `data`.
RAW_CSV_PATH = '../renttherunway_data.csv'
df = pd.read_csv(RAW_CSV_PATH)
data = df.copy(deep=True)
data.head(5)

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116.0,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"


In [107]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,user_id,item_id,rating,size,age
count,192544.0,192544.0,192462.0,192544.0,191584.0
mean,499494.100149,1045684.0,9.092371,12.245175,33.871017
std,289059.719328,805314.8,1.430044,8.494877,8.058083
min,9.0,123373.0,2.0,0.0,0.0
25%,250654.25,195076.0,8.0,8.0,29.0
50%,499419.0,948396.0,10.0,12.0,32.0
75%,750974.0,1678888.0,10.0,16.0,37.0
max,999997.0,2966087.0,10.0,58.0,117.0


In [95]:
# Count the missing values in each column.
data.isna().sum()

fit                   0
user_id               0
bust size         18411
item_id               0
weight            29982
rating               82
rented for           10
review_text          68
body type         14637
review_summary      347
category              0
height              677
size                  0
age                 960
review_date           0
dtype: int64

#### bust size, rented for, review_text, review_summary, review_date 컬럼 삭제

In [108]:
# Drop the listed columns from the working frame.
columns_to_drop = [
    'bust size',
    'rented for',
    'review_text',
    'review_summary',
    'review_date',
]
data = data.drop(columns_to_drop, axis=1)

data.head(5)

Unnamed: 0,fit,user_id,item_id,weight,rating,body type,category,height,size,age
0,fit,420272,2260466,137lbs,10.0,hourglass,romper,"5' 8""",14,28.0
1,fit,273551,153475,132lbs,10.0,straight & narrow,gown,"5' 6""",12,36.0
2,fit,360448,1063761,,10.0,,sheath,"5' 4""",4,116.0
3,fit,909926,126335,135lbs,8.0,pear,dress,"5' 5""",8,34.0
4,fit,151944,616682,145lbs,10.0,athletic,gown,"5' 9""",12,27.0


#### 상품 id 분석

In [45]:
# Count how many rows reference each item_id (ordered by item_id first).
item_counts = data['item_id'].value_counts().sort_index()

# Turn the counts into a two-column frame: item_id, count.
item_counts_df = item_counts.rename_axis('item_id').reset_index(name='count')

# Rank items from most to least frequent and keep the ten most frequent.
item_counts_sorted = item_counts_df.sort_values(
    by='count', ascending=False, ignore_index=True
)
top_10_items = item_counts_sorted.iloc[:10]

print(top_10_items)


   item_id  count
0   126335   2241
1   174086   1724
2   123793   1714
3   132738   1582
4   145906   1478
5   127865   1393
6   136110   1197
7   137585   1100
8   131533   1091
9   172027    984


#### `weight` 컬럼: 단위 변환 후 결측치 처리

In [109]:
# Pull the first run of digits out of strings such as "137lbs" and
# convert the pound value to kilograms.
LBS_TO_KG = 0.453592
weight_lbs = data['weight'].str.extract(r'(\d+)', expand=False).astype(float)
data['weight'] = weight_lbs * LBS_TO_KG

# 1. Keep only the rows whose weight is known.
df_weight_dropna = data[data['weight'].notna()]

In [110]:
# 2. Impute missing weights with the mean of the observed weights.
mean_weight = df_weight_dropna['weight'].mean()
data['weight'] = data['weight'].where(data['weight'].notna(), mean_weight)

data['weight'].describe().to_frame()

Unnamed: 0,weight
count,192544.0
mean,62.31978
std,9.127528
min,22.6796
25%,56.699
50%,62.31978
75%,65.77084
max,136.0776


In [None]:
# 3. Alternative: impute missing weights with the median of the observed weights.
# NOTE(review): `fillna` only touches values that are still NaN, so if the
# mean-imputation cell above has already run, this cell changes nothing.
# Confirm which imputation strategy is intended before running it.
median_weight = df_weight_dropna['weight'].median()
data['weight'] = data['weight'].fillna(median_weight)

data[['weight']].describe()

In [None]:
# 4. Alternative: impute missing weights with the most frequent observed weight
# (`mode()` may return several values; the first one is used).
# NOTE(review): `fillna` only touches values that are still NaN, so if an
# earlier imputation cell has already run, this cell changes nothing.
# Confirm which imputation strategy is intended before running it.
mode_weight = df_weight_dropna['weight'].mode().iloc[0]
data['weight'] = data['weight'].fillna(mode_weight)

data[['weight']].describe()

#### `height` 컬럼: 단위 변환 후 결측치 처리

In [111]:
# Feet-and-inches heights such as 5' 8", 5'8" or " 5'  8": optional
# surrounding/internal whitespace, feet, an apostrophe, then inches.
_HEIGHT_PATTERN = re.compile(r"\s*(\d+)'\s*(\d+)")


def height_to_cm(h):
    """Convert a feet-and-inches height string to centimetres.

    Args:
        h: A height such as ``5' 8"``. The whitespace between the feet and
            inches parts (and before the feet part) is optional, so
            ``5'8"`` is parsed the same way. Any non-string value
            (e.g. NaN or None) is treated as missing.

    Returns:
        The height in centimetres as a float, or ``np.nan`` when ``h`` is
        not a string or does not start with a feet-and-inches pattern.
    """
    if not isinstance(h, str):
        return np.nan

    match = _HEIGHT_PATTERN.match(h)
    if match is None:
        return np.nan

    feet, inches = (int(part) for part in match.groups())
    total_inches = feet * 12 + inches
    return total_inches * 2.54

# Convert every height entry to centimetres; entries that are missing or
# cannot be parsed come back as NaN.
data['height'] = data['height'].apply(height_to_cm)

# Drop the rows whose height is still unknown.
data = data.dropna(subset=['height'])

# Print the summary explicitly: only the last expression of a cell is
# rendered, so a bare `describe()` at this position would be discarded.
print(data[['height']].describe())
data.isnull().sum()

fit              0
user_id          0
item_id          0
weight           0
rating          82
body type    14508
category         0
height           0
size             0
age            952
dtype: int64

#### `age`: 이상치 및 결측치 제거

In [112]:
# Blank out ages outside the open interval (19, 100), then drop every row
# that has no age left (originally missing or just blanked).
age_in_range = data['age'].gt(19) & data['age'].lt(100)
data['age'] = data['age'].mask(~age_in_range)
data = data.dropna(subset=['age'])

data['age'].describe().to_frame()

Unnamed: 0,age
count,189672.0
mean,33.950388
std,7.825581
min,20.0
25%,29.0
50%,32.0
75%,37.0
max,99.0


#### `rating`: 2, 4, 6, 8, 10 -> 1, 2, 3, 4, 5 & 결측치 제거

In [113]:
# Halve the rating with floor division (2, 4, 6, 8, 10 -> 1, 2, 3, 4, 5),
# then drop the rows that have no rating.
data['rating'] = data['rating'].floordiv(2)
data = data.dropna(subset=['rating'])

data['rating'].describe().to_frame()

Unnamed: 0,rating
count,189591.0
mean,4.545548
std,0.715359
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


#### `body type`: 원-핫 인코딩 (결측치는 `body_type_nan` 컬럼으로 유지)

In [114]:
# One-hot encode `body type`; dummy_na=True adds an indicator column for
# rows where the body type is missing.
body_type_encoded = pd.get_dummies(
    data['body type'],
    dummy_na=True,
    prefix='body_type',
)

# Show the names of the generated indicator columns.
encoded_columns = list(body_type_encoded.columns)
print(encoded_columns)

# Replace the raw column with its indicator columns.
data = pd.concat(
    [data.drop(columns=['body type']), body_type_encoded],
    axis=1,
)

['body_type_apple', 'body_type_athletic', 'body_type_full bust', 'body_type_hourglass', 'body_type_pear', 'body_type_petite', 'body_type_straight & narrow', 'body_type_nan']


In [91]:
# Preview the first rows of the working frame.
data.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,review_summary,category,...,age,review_date,body_type_apple,body_type_athletic,body_type_full bust,body_type_hourglass,body_type_pear,body_type_petite,body_type_straight & narrow,body_type_nan
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,So many compliments!,romper,...,28.0,"April 20, 2016",False,False,False,True,False,False,False,False
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,I felt so glamourous!!!,gown,...,36.0,"June 18, 2013",False,False,False,False,False,False,True,False
2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,It was a great time to celebrate the (almost) ...,sheath,...,116.0,"December 14, 2015",False,False,False,False,False,False,False,True
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,Dress arrived on time and in perfect condition.,dress,...,34.0,"February 12, 2014",False,False,False,False,True,False,False,False
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,Was in love with this dress !!!,gown,...,27.0,"September 26, 2016",False,True,False,False,False,False,False,False


#### `category`: 범주형 인코딩 or 레이블 인코딩

In [104]:
#category_encoded = pd.get_dummies(data['category'], prefix='category')

#### `size`: 정규화 or 표준화

In [119]:
# sns.histplot(df['size'], kde=True)
# plt.title('Size Distribution')
# plt.show()

In [115]:
# Re-check the per-column missing-value counts after the preprocessing steps.
data.isna().sum()

fit                            0
user_id                        0
item_id                        0
weight                         0
rating                         0
category                       0
height                         0
size                           0
age                            0
body_type_apple                0
body_type_athletic             0
body_type_full bust            0
body_type_hourglass            0
body_type_pear                 0
body_type_petite               0
body_type_straight & narrow    0
body_type_nan                  0
dtype: int64

### K-means 클러스터링 모델 적용

In [118]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Feature selection: the numeric columns plus every one-hot `body_type_*` column.
numeric_cols = ['age', 'rating', 'weight', 'height']
encoded_cols = [col for col in data.columns if col.startswith('body_type_')]

feature_cols = numeric_cols + encoded_cols
data_kmeans = data[feature_cols].dropna()

# Standardize every feature to zero mean / unit variance before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data_kmeans)

# Sweep k = 3..14 and record the silhouette score of each clustering.
# n_init is pinned explicitly so the result does not depend on the installed
# scikit-learn's default (which changed from 10 to 'auto' in version 1.4).
# NOTE: silhouette_score works on pairwise distances over all rows; if this
# is too slow, pass sample_size=... together with random_state=... to
# score a random subset instead.
silhouette_by_k = {}
for k in range(3, 15):
    model = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = model.fit_predict(X_scaled)
    sil_score = silhouette_score(X_scaled, labels)
    silhouette_by_k[k] = sil_score
    print(f"{k} clusters 완료")
    print(f"Silhouette Score: {sil_score}")

# Report the k with the highest silhouette score instead of reading it off
# the printed list by eye.
best_k = max(silhouette_by_k, key=silhouette_by_k.get)
print(f"Best k by silhouette: {best_k} ({silhouette_by_k[best_k]:.4f})")

# To fit a single final model, reuse the same settings with the chosen k,
# e.g. KMeans(n_clusters=best_k, random_state=42, n_init=10).

3 clusters 완료
Silhouette Score: 0.20030759099193604
4 clusters 완료
Silhouette Score: 0.26279471310749974
5 clusters 완료
Silhouette Score: 0.32272287608244843
6 clusters 완료
Silhouette Score: 0.37643432648103414
7 clusters 완료
Silhouette Score: 0.3705706615497489
8 clusters 완료
Silhouette Score: 0.4724605214846014
9 clusters 완료
Silhouette Score: 0.4196708643912723
10 clusters 완료
Silhouette Score: 0.37117081596737633
11 clusters 완료
Silhouette Score: 0.37103196464049826
12 clusters 완료
Silhouette Score: 0.34567790834590945
13 clusters 완료
Silhouette Score: 0.3202354470878256
14 clusters 완료
Silhouette Score: 0.32453102650287635
