In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import json
import random

In [2]:
# 데이터 크기 확인 함수
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

## 타입별 평균 크기 확인 함수
def type_memory(data) :
    for dtype in ['float','int','object']:
        selected_dtype = data.select_dtypes(include=[dtype])
        mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
        mean_usage_mb = mean_usage_b / 1024 ** 2
        print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

## 이산형 데이터 사이즈 축소 함소
def int_memory_reduce(data) :
    data_int = data.select_dtypes(include=['int'])
    converted_int = data_int.apply(pd.to_numeric,downcast='unsigned')
    print(f"Before : {mem_usage(data_int)} -> After : {mem_usage(converted_int)}")
    data[converted_int.columns] = converted_int
    return data

## 연속형 데이터 사이즈 축소 함소
def float_memory_reduce(data) :
    data_float = data.select_dtypes(include=['float'])
    converted_float = data_float.apply(pd.to_numeric,downcast='float')
    print(f"Before : {mem_usage(data_float)} -> After : {mem_usage(converted_float)}")
    data[converted_float.columns] = converted_float
    return data

## 문자형 데이터 사이즈 축소 함소
def object_memory_reduce(data) :
    gl_obj = data.select_dtypes(include=['object']).copy()
    converted_obj = pd.DataFrame()
    for col in gl_obj.columns:
        num_unique_values = len(gl_obj[col].unique())
        num_total_values = len(gl_obj[col])
        if num_unique_values / num_total_values < 0.5:
            converted_obj.loc[:,col] = gl_obj[col].astype('category')
        else:
            converted_obj.loc[:,col] = gl_obj[col]
    print(f"Before : {mem_usage(gl_obj)} -> After : {mem_usage(converted_obj)}")
    data[converted_obj.columns] = converted_obj
    return data

In [3]:
kdf = pd.read_excel("패션 선호도조사 결과_일부data(1108-1).xlsx", sheet_name='rawdata_1차(1108)', engine='openpyxl')
#kdf = kdf.rename(columns=kdf.iloc[0])
#kdf = kdf.drop(kdf.index[0])
print(kdf.shape)
kdf.head(3)

(38284, 40)


Unnamed: 0,E_id,imgName,era,style,gender,Q1,Q2,Q3,Q411,Q412,...,r_gender,age,mar,job,income,r_style1,r_style2,r_style3,r_style4,r_style5
0,1,W_15268_50_ivy_M.jpg,1950,ivy,M,4,1,1,3,2,...,1,4,2,4,1,2,6,2,2,1
1,2,W_16543_50_ivy_M.jpg,1950,ivy,M,2,1,2,2,2,...,1,4,2,4,1,2,6,2,2,1
2,3,W_17697_50_ivy_M.jpg,1950,ivy,M,3,1,1,2,2,...,1,4,2,4,1,2,6,2,2,1


In [4]:
kdf.isna().sum()

E_id        0
imgName     0
era         0
style       0
gender      0
Q1          0
Q2          0
Q3          0
Q411        0
Q412        0
Q413        0
Q414        0
Q4201       0
Q4202       0
Q4203       0
Q4204       0
Q4205       0
Q4206       0
Q4207       0
Q4208       0
Q4209       0
Q4210       0
Q4211       0
Q4212       0
Q4213       0
Q4214       0
Q4215       0
Q4216       0
Q5          0
R_id        0
r_gender    0
age         0
mar         0
job         0
income      0
r_style1    0
r_style2    0
r_style3    0
r_style4    0
r_style5    0
dtype: int64

# 1. User Data Preprocessing

In [5]:
user_data = kdf[["R_id", "r_gender", "age", "mar", "job", "income", 
                 "r_style1", "r_style2", "r_style3", "r_style4", "r_style5"]]
user_data = user_data.drop_duplicates(['R_id']).reset_index(drop=True)
user_data["user"] = user_data.reset_index()["index"]
user_data = user_data[["user", "R_id", "r_gender", "age", "mar", "job", "income", 
                 "r_style1", "r_style2", "r_style3", "r_style4", "r_style5"]]
print(user_data.shape)
user_data

(822, 12)


Unnamed: 0,user,R_id,r_gender,age,mar,job,income,r_style1,r_style2,r_style3,r_style4,r_style5
0,0,27,1,4,2,4,1,2,6,2,2,1
1,1,133,1,2,1,6,2,1,1,2,2,2
2,2,179,2,3,1,4,2,1,5,1,2,2
3,3,289,2,2,2,1,2,2,3,1,2,1
4,4,1022,2,2,1,6,2,2,1,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
817,817,63970,1,3,1,2,2,1,2,2,2,1
818,818,63976,2,1,1,4,1,1,2,2,2,1
819,819,63986,1,3,2,4,2,1,3,1,1,1
820,820,63989,1,1,1,2,2,2,3,1,1,2


In [6]:
user_data.columns = ["user", "user_name", "r_gender", "age", "mar", "job", "income",
                     "r_style1", "r_style2", "r_style3", "r_style4", "r_style5"]
user_data = user_data[["user", "user_name", "r_gender", "age", "mar", "job", "income",
                     "r_style1", "r_style2", "r_style3", "r_style4", "r_style5"]]
user_data = int_memory_reduce(user_data)
user_data = float_memory_reduce(user_data)
user_data = object_memory_reduce(user_data)

Before : 0.08 MB -> After : 0.01 MB
Before : 0.00 MB -> After : 0.00 MB
Before : 0.00 MB -> After : 0.00 MB


In [7]:
user_data

Unnamed: 0,user,user_name,r_gender,age,mar,job,income,r_style1,r_style2,r_style3,r_style4,r_style5
0,0,27,1,4,2,4,1,2,6,2,2,1
1,1,133,1,2,1,6,2,1,1,2,2,2
2,2,179,2,3,1,4,2,1,5,1,2,2
3,3,289,2,2,2,1,2,2,3,1,2,1
4,4,1022,2,2,1,6,2,2,1,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
817,817,63970,1,3,1,2,2,1,2,2,2,1
818,818,63976,2,1,1,4,1,1,2,2,2,1
819,819,63986,1,3,2,4,2,1,3,1,1,1
820,820,63989,1,1,1,2,2,2,3,1,1,2


In [8]:
user_data.to_csv("../KData/user_data.csv")
user_data

Unnamed: 0,user,user_name,r_gender,age,mar,job,income,r_style1,r_style2,r_style3,r_style4,r_style5
0,0,27,1,4,2,4,1,2,6,2,2,1
1,1,133,1,2,1,6,2,1,1,2,2,2
2,2,179,2,3,1,4,2,1,5,1,2,2
3,3,289,2,2,2,1,2,2,3,1,2,1
4,4,1022,2,2,1,6,2,2,1,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
817,817,63970,1,3,1,2,2,1,2,2,2,1
818,818,63976,2,1,1,4,1,1,2,2,2,1
819,819,63986,1,3,2,4,2,1,3,1,1,1
820,820,63989,1,1,1,2,2,2,3,1,1,2


# 2. Item Data Preprocessing

In [9]:
# Q2
q2_dict = {"1" : "spring fall", "2" : "summer", "3" : "winter"}

# Q3
q3_dict = {"1" : "attendance",
           "2" : "date",
           "3" : "event",
           "4" : "social gathering",
           "5" : "daily",
           "6" : "leisure sports",
           "7" : "trip vacation",
           "8" : "etc"}

# Q411
q411_dict = {"1":"loose",
             "2":"appropriate",
             "3":"tight"}

# Q412
q412_dict = {"1":"dark",
             "2":"bright"}

# Q413
q413_dict = {"1":"cold",
             "2":"warm"}

# Q414
q414_dict = {"1":"heavy",
             "2":"light"}



# Q4201
q4201_dict = {"0":"no",
              "1":"nice"}

# Q4202
q4202_dict = {"0":"no",
              "2":"urban"}

# Q4203
q4203_dict = {"0":"no",
              "3":"trendy"}

# Q4204
q4204_dict = {"0":"no",
              "4":"sophisticated"}

# Q4205
q4205_dict = {"0":"no",
              "5":"clean"}

# Q4206
q4206_dict = {"0":"no",
              "6":"magnificent"}

# Q4207
q4207_dict = {"0":"no",
              "7":"unique"}

# Q4208
q4208_dict = {"0":"no",
              "8":"easy"}

# Q4209
q4209_dict = {"0":"no",
              "9":"open mined"}

# Q4210
q4210_dict = {"0":"no",
              "10":"practical"}

# Q4211
q4211_dict = {"0":"no",
              "11":"activity"}

# Q4212
q4212_dict = {"0":"no",
              "12":"comfortable"}

# Q4213
q4213_dict = {"0":"no",
              "13":"bubbly"}

# Q4214
q4214_dict = {"0":"no",
              "14":"feminine"}

# Q4215
q4215_dict = {"0":"no",
              "15":"manly"}

# Q4216
q4216_dict = {"0":"no",
              "16":"soft"}


In [10]:
item_data = kdf[['imgName','era','style','gender', 
                 'Q1','Q2','Q3','Q411','Q412','Q413','Q414','Q4201','Q4202','Q4203','Q4204','Q4205',
                 'Q4206','Q4207','Q4208','Q4209','Q4210','Q4211','Q4212','Q4213','Q4214','Q4215','Q4216','Q5']]
item_data

Unnamed: 0,imgName,era,style,gender,Q1,Q2,Q3,Q411,Q412,Q413,...,Q4208,Q4209,Q4210,Q4211,Q4212,Q4213,Q4214,Q4215,Q4216,Q5
0,W_15268_50_ivy_M.jpg,1950,ivy,M,4,1,1,3,2,2,...,0,0,10,0,0,0,0,15,0,2
1,W_16543_50_ivy_M.jpg,1950,ivy,M,2,1,2,2,2,2,...,0,0,0,0,12,0,0,0,0,1
2,W_17697_50_ivy_M.jpg,1950,ivy,M,3,1,1,2,2,2,...,0,0,0,0,0,0,0,15,0,2
3,W_00485_60_mods_M.jpg,1960,mods,M,4,1,1,2,2,2,...,0,0,0,0,12,0,0,0,0,2
4,W_06723_60_mods_M.jpg,1960,mods,M,3,1,5,3,1,1,...,8,0,10,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38279,W_17696_10_sportivecasual_M.jpg,2010,sportivecasual,M,2,3,5,2,1,1,...,8,0,0,0,0,0,0,0,0,1
38280,W_07174_19_normcore_M.jpg,2019,normcore,M,3,2,7,1,2,2,...,0,9,10,11,0,0,0,0,16,2
38281,W_07358_19_normcore_M.jpg,2019,normcore,M,3,1,5,2,2,2,...,8,9,0,11,0,0,0,0,0,2
38282,W_16898_19_normcore_M.jpg,2019,normcore,M,2,3,5,2,1,1,...,0,0,0,0,0,0,0,15,0,1


In [11]:
for col_i in ['Q2','Q3','Q411','Q412','Q413','Q414','Q4201','Q4202','Q4203','Q4204','Q4205',
          'Q4206','Q4207','Q4208','Q4209','Q4210','Q4211','Q4212','Q4213','Q4214','Q4215','Q4216']:
    item_data.loc[:, col_i] = item_data.loc[:, col_i].astype("str")
item_data.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38284 entries, 0 to 38283
Data columns (total 28 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   imgName  38284 non-null  object
 1   era      38284 non-null  int64 
 2   style    38284 non-null  object
 3   gender   38284 non-null  object
 4   Q1       38284 non-null  int64 
 5   Q2       38284 non-null  object
 6   Q3       38284 non-null  object
 7   Q411     38284 non-null  object
 8   Q412     38284 non-null  object
 9   Q413     38284 non-null  object
 10  Q414     38284 non-null  object
 11  Q4201    38284 non-null  object
 12  Q4202    38284 non-null  object
 13  Q4203    38284 non-null  object
 14  Q4204    38284 non-null  object
 15  Q4205    38284 non-null  object
 16  Q4206    38284 non-null  object
 17  Q4207    38284 non-null  object
 18  Q4208    38284 non-null  object
 19  Q4209    38284 non-null  object
 20  Q4210    38284 non-null  object
 21  Q4211    38284 non-null  object
 22

In [12]:
item_data["season"] = [q2_dict[str(i)] for i in item_data["Q2"].values.tolist()]
item_data["tpo"] = [q3_dict[str(i)] for i in item_data["Q3"].values.tolist()]
item_data["fit"] = [q411_dict[str(i)] for i in item_data["Q411"].values.tolist()]
item_data["brightness"] = [q412_dict[str(i)] for i in item_data["Q412"].values.tolist()]
item_data["temperature"] = [q413_dict[str(i)] for i in item_data["Q413"].values.tolist()]
item_data["weight"] = [q414_dict[str(i)] for i in item_data["Q414"].values.tolist()]

item_data["nice"] = [q4201_dict[str(i)] for i in item_data["Q4201"].values.tolist()]
item_data["urban"] = [q4202_dict[str(i)] for i in item_data["Q4202"].values.tolist()]
item_data["trendy"] = [q4203_dict[str(i)] for i in item_data["Q4203"].values.tolist()]
item_data["sophisticated"] = [q4204_dict[str(i)] for i in item_data["Q4204"].values.tolist()]
item_data["clean"] = [q4205_dict[str(i)] for i in item_data["Q4205"].values.tolist()]
item_data["magnificent"] = [q4206_dict[str(i)] for i in item_data["Q4206"].values.tolist()]
item_data["unique"] = [q4207_dict[str(i)] for i in item_data["Q4207"].values.tolist()]
item_data["easy"] = [q4208_dict[str(i)] for i in item_data["Q4208"].values.tolist()]
item_data["open"] = [q4209_dict[str(i)] for i in item_data["Q4209"].values.tolist()]
item_data["practical"] = [q4210_dict[str(i)] for i in item_data["Q4210"].values.tolist()]
item_data["activity"] = [q4211_dict[str(i)] for i in item_data["Q4211"].values.tolist()]
item_data["comfortable"] = [q4212_dict[str(i)] for i in item_data["Q4212"].values.tolist()]
item_data["bubbly"] = [q4213_dict[str(i)] for i in item_data["Q4213"].values.tolist()]
item_data["feminine"] = [q4214_dict[str(i)] for i in item_data["Q4214"].values.tolist()]
item_data["manly"] = [q4215_dict[str(i)] for i in item_data["Q4215"].values.tolist()]
item_data["soft"] = [q4216_dict[str(i)] for i in item_data["Q4216"].values.tolist()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_data["season"] = [q2_dict[str(i)] for i in item_data["Q2"].values.tolist()]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_data["tpo"] = [q3_dict[str(i)] for i in item_data["Q3"].values.tolist()]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_data["fit"] = [q411_dict[str(i)] for i in

In [13]:
item_data = item_data[["imgName", "era", "style", "gender", 'season', 'tpo', 'fit', 'brightness',
                       'temperature', 'weight', 'nice', 'urban', 'trendy', 'sophisticated',
                       'clean', 'magnificent', 'unique', 'easy', 'open', 'practical',
                       'activity', 'comfortable', 'bubbly', 'feminine', 'manly', 'soft', "Q1", "Q5"]]

In [14]:
for col_l in item_data.columns:
    unq_l = len(item_data[col_l].unique())
    print(f"{unq_l}:{item_data[col_l].unique()}")

16454:['W_15268_50_ivy_M.jpg' 'W_16543_50_ivy_M.jpg' 'W_17697_50_ivy_M.jpg' ...
 'W_16811_19_normcore_M.jpg' 'W_07174_19_normcore_M.jpg'
 'W_17028_19_normcore_M.jpg']
8:[1950 1960 1970 1980 1990 2000 2010 2019]
27:['ivy' 'mods' 'hippie' 'bold' 'hiphop' 'metrosexual' 'sportivecasual'
 'normcore' 'feminine' 'classic' 'minimal' 'punk' 'bodyconscious'
 'powersuit' 'kitsch' 'oriental' 'cityglam' 'athleisure' 'popart' 'disco'
 'grunge' 'ecology' 'military' 'lingerie' 'lounge' 'genderless' 'space']
2:['M' 'W']
3:['spring fall' 'summer' 'winter']
8:['attendance' 'date' 'daily' 'social gathering' 'etc' 'trip vacation'
 'leisure sports' 'event']
3:['tight' 'appropriate' 'loose']
2:['bright' 'dark']
2:['warm' 'cold']
2:['light' 'heavy']
2:['no' 'nice']
2:['urban' 'no']
2:['trendy' 'no']
2:['no' 'sophisticated']
2:['clean' 'no']
2:['no' 'magnificent']
2:['no' 'unique']
2:['no' 'easy']
2:['no' 'open mined']
2:['practical' 'no']
2:['no' 'activity']
2:['no' 'comfortable']
2:['no' 'bubbly']
2:['no' 'f

In [15]:
item_data = pd.get_dummies(item_data, columns = ['season', "tpo", 'fit', 'brightness',
                                                 'temperature', 'weight', 'nice', 'urban', 'trendy', 'sophisticated',
                                                 'clean', 'magnificent', 'unique', 'easy', 'open', 'practical',
                                                 'activity', 'comfortable', 'bubbly', 'feminine', 'manly', 'soft'])
item_data = item_data.groupby(["imgName", "era", "style", "gender"]).sum().reset_index()
item_data

Unnamed: 0,imgName,era,style,gender,Q1,Q5,season_spring fall,season_summer,season_winter,tpo_attendance,...,comfortable_comfortable,comfortable_no,bubbly_bubbly,bubbly_no,feminine_feminine,feminine_no,manly_manly,manly_no,soft_no,soft_soft
0,W_00002_60_mods_M.jpg,1960,mods,M,3,2,1,0,0,1,...,0,1,0,1,0,1,1,0,1,0
1,W_00003_50_ivy_M.jpg,1950,ivy,M,13,8,3,0,1,2,...,1,3,0,4,0,4,4,0,4,0
2,W_00004_50_ivy_M.jpg,1950,ivy,M,4,2,1,0,0,1,...,0,1,0,1,0,1,0,1,1,0
3,W_00005_60_mods_M.jpg,1960,mods,M,3,2,1,0,1,0,...,0,2,0,2,0,2,0,2,2,0
4,W_00007_60_mods_M.jpg,1960,mods,M,13,8,4,0,0,1,...,0,4,1,3,0,4,2,2,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16449,W_19996_90_kitsch_W.jpg,1990,kitsch,W,4,3,2,0,0,0,...,1,1,2,0,1,1,0,2,2,0
16450,W_19997_00_oriental_W.jpg,2000,oriental,W,5,4,2,0,2,0,...,0,4,0,4,0,4,0,4,4,0
16451,W_19998_50_feminine_W.jpg,1950,feminine,W,2,2,1,0,1,0,...,0,2,0,2,0,2,0,2,2,0
16452,W_19999_00_oriental_W.jpg,2000,oriental,W,7,4,3,1,0,0,...,0,4,0,4,1,3,0,4,4,0


In [16]:
item_data.columns = ["item_name", "era", "style", "gender"] + item_data.columns.tolist()[4:]
item_data = int_memory_reduce(item_data)
item_data = float_memory_reduce(item_data)
item_data = object_memory_reduce(item_data)
item_data

Before : 0.38 MB -> After : 0.06 MB
Before : 0.00 MB -> After : 0.00 MB
Before : 3.21 MB -> After : 1.32 MB


Unnamed: 0,item_name,era,style,gender,Q1,Q5,season_spring fall,season_summer,season_winter,tpo_attendance,...,comfortable_comfortable,comfortable_no,bubbly_bubbly,bubbly_no,feminine_feminine,feminine_no,manly_manly,manly_no,soft_no,soft_soft
0,W_00002_60_mods_M.jpg,1960,mods,M,3,2,1,0,0,1,...,0,1,0,1,0,1,1,0,1,0
1,W_00003_50_ivy_M.jpg,1950,ivy,M,13,8,3,0,1,2,...,1,3,0,4,0,4,4,0,4,0
2,W_00004_50_ivy_M.jpg,1950,ivy,M,4,2,1,0,0,1,...,0,1,0,1,0,1,0,1,1,0
3,W_00005_60_mods_M.jpg,1960,mods,M,3,2,1,0,1,0,...,0,2,0,2,0,2,0,2,2,0
4,W_00007_60_mods_M.jpg,1960,mods,M,13,8,4,0,0,1,...,0,4,1,3,0,4,2,2,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16449,W_19996_90_kitsch_W.jpg,1990,kitsch,W,4,3,2,0,0,0,...,1,1,2,0,1,1,0,2,2,0
16450,W_19997_00_oriental_W.jpg,2000,oriental,W,5,4,2,0,2,0,...,0,4,0,4,0,4,0,4,4,0
16451,W_19998_50_feminine_W.jpg,1950,feminine,W,2,2,1,0,1,0,...,0,2,0,2,0,2,0,2,2,0
16452,W_19999_00_oriental_W.jpg,2000,oriental,W,7,4,3,1,0,0,...,0,4,0,4,1,3,0,4,4,0


In [17]:
# season
spring_and_fall = item_data[item_data["season_spring fall"]>=1].index
summer = item_data[item_data["season_summer"]>=1].index
winter = item_data[item_data["season_winter"]>=1].index

# tpo
attend = item_data[item_data["tpo_attendance"]>=1].index
daily = item_data[item_data["tpo_daily"]>=1].index
date = item_data[item_data["tpo_date"]>=1].index
etc = item_data[item_data["tpo_etc"]>=1].index
event = item_data[item_data["tpo_event"]>=1].index
sports = item_data[item_data["tpo_leisure sports"]>=1].index
social = item_data[item_data["tpo_social gathering"]>=1].index
trip = item_data[item_data["tpo_trip vacation"]>=1].index

# fit
appro = item_data[item_data["fit_appropriate"]>=1].index
loo = item_data[item_data["fit_loose"]>=1].index
tig = item_data[item_data["fit_tight"]>=1].index

# brightness
bright = item_data[item_data["brightness_bright"]>=1].index
dark = item_data[item_data["brightness_dark"]>=1].index

# temperature
cold = item_data[item_data["temperature_cold"]>=1].index
warm = item_data[item_data["temperature_warm"]>=1].index

# weight
heavy = item_data[item_data["weight_heavy"]>=1].index
light = item_data[item_data["weight_light"]>=1].index



nice_no = item_data[item_data["nice_no"]>=1].index
nice_nice = item_data[item_data["nice_nice"]>=1].index

urban_no = item_data[item_data["urban_no"]>=1].index
urban_urban = item_data[item_data["urban_urban"]>=1].index

trendy_no = item_data[item_data["trendy_no"]>=1].index
trendy_trendy = item_data[item_data["trendy_trendy"]>=1].index

sophisticated_no = item_data[item_data["sophisticated_no"]>=1].index
sophisticated_sohp = item_data[item_data["sophisticated_sophisticated"]>=1].index

clean_no = item_data[item_data["clean_no"]>=1].index
clean_clean = item_data[item_data["clean_clean"]>=1].index

magnificent_no = item_data[item_data["magnificent_no"]>=1].index
magnificent_magnificent = item_data[item_data["magnificent_magnificent"]>=1].index

unique_no = item_data[item_data["unique_no"]>=1].index
unique_unique = item_data[item_data["unique_unique"]>=1].index

easy_no = item_data[item_data["easy_no"]>=1].index
easy_easy = item_data[item_data["easy_easy"]>=1].index

open_no = item_data[item_data["open_no"]>=1].index
open_open = item_data[item_data["open_open mined"]>=1].index

practical_no = item_data[item_data["practical_no"]>=1].index
practical_practical = item_data[item_data["practical_practical"]>=1].index

activity_no = item_data[item_data["activity_no"]>=1].index
activity_activity = item_data[item_data["activity_activity"]>=1].index

comfortable_no = item_data[item_data["comfortable_no"]>=1].index
comfortable_comfortable = item_data[item_data["comfortable_comfortable"]>=1].index

bubbly_no = item_data[item_data["bubbly_no"]>=1].index
bubbly_bubbly = item_data[item_data["bubbly_bubbly"]>=1].index

feminine_no = item_data[item_data["feminine_no"]>=1].index
feminine_feminine = item_data[item_data["feminine_feminine"]>=1].index

manly_no = item_data[item_data["manly_no"]>=1].index
manly_manly = item_data[item_data["manly_manly"]>=1].index

soft_no = item_data[item_data["soft_no"]>=1].index
soft_soft = item_data[item_data["soft_soft"]>=1].index

In [18]:
item_data["season"] = "" 
item_data["tpo"] = ""
item_data["fit"] = ""
item_data["brightness"] = ""
item_data["temperature"] = ""
item_data["weight"] = ""
# item_data["nice"] = ""
# item_data["urban"] = ""
# item_data["trendy"] = ""
# item_data["sophisticated"] = ""
# item_data["clean"] = ""
# item_data["magnificent"] = ""
# item_data["unique"] = ""
# item_data["easy"] = ""
# item_data["open"] = ""
# item_data["practical"] = ""
# item_data["activity"] = ""
# item_data["comfortable"] = ""
# item_data["bubbly"] = ""
# item_data["feminine"] = ""
# item_data["manly"] = ""
# item_data["soft"] = ""

In [19]:
item_data.iloc[spring_and_fall, -6] = "spring fall"
item_data.iloc[summer, -6] = item_data.iloc[summer, -6] + " summer"
item_data.iloc[winter, -6] = item_data.iloc[winter, -6] + " winter"
item_data.head()

Unnamed: 0,item_name,era,style,gender,Q1,Q5,season_spring fall,season_summer,season_winter,tpo_attendance,...,manly_manly,manly_no,soft_no,soft_soft,season,tpo,fit,brightness,temperature,weight
0,W_00002_60_mods_M.jpg,1960,mods,M,3,2,1,0,0,1,...,1,0,1,0,spring fall,,,,,
1,W_00003_50_ivy_M.jpg,1950,ivy,M,13,8,3,0,1,2,...,4,0,4,0,spring fall winter,,,,,
2,W_00004_50_ivy_M.jpg,1950,ivy,M,4,2,1,0,0,1,...,0,1,1,0,spring fall,,,,,
3,W_00005_60_mods_M.jpg,1960,mods,M,3,2,1,0,1,0,...,0,2,2,0,spring fall winter,,,,,
4,W_00007_60_mods_M.jpg,1960,mods,M,13,8,4,0,0,1,...,2,2,4,0,spring fall,,,,,


In [20]:
item_data.iloc[attend, -5] = "attendance"
item_data.iloc[daily, -5] = item_data.iloc[daily, -5] + " daily"
item_data.iloc[date, -5] = item_data.iloc[date, -5] + " date"
item_data.iloc[etc, -5] = item_data.iloc[etc, -5] + " etc"
item_data.iloc[event, -5] = item_data.iloc[event, -5] + " event"
item_data.iloc[sports, -5] = item_data.iloc[sports, -5] + " leisure sports"
item_data.iloc[social, -5] = item_data.iloc[social, -5] + " social gathering"
item_data.iloc[trip, -5] = item_data.iloc[trip, -5] + " social trip vacation"
item_data.head()

Unnamed: 0,item_name,era,style,gender,Q1,Q5,season_spring fall,season_summer,season_winter,tpo_attendance,...,manly_manly,manly_no,soft_no,soft_soft,season,tpo,fit,brightness,temperature,weight
0,W_00002_60_mods_M.jpg,1960,mods,M,3,2,1,0,0,1,...,1,0,1,0,spring fall,attendance,,,,
1,W_00003_50_ivy_M.jpg,1950,ivy,M,13,8,3,0,1,2,...,4,0,4,0,spring fall winter,attendance event,,,,
2,W_00004_50_ivy_M.jpg,1950,ivy,M,4,2,1,0,0,1,...,0,1,1,0,spring fall,attendance,,,,
3,W_00005_60_mods_M.jpg,1960,mods,M,3,2,1,0,1,0,...,0,2,2,0,spring fall winter,social gathering,,,,
4,W_00007_60_mods_M.jpg,1960,mods,M,13,8,4,0,0,1,...,2,2,4,0,spring fall,attendance event,,,,


In [21]:
item_data.iloc[appro, -4] = "appropriate"
item_data.iloc[loo, -4] = item_data.iloc[loo, -4] + " loose"
item_data.iloc[tig, -4] = item_data.iloc[tig, -4] + " tight"

item_data.iloc[bright, -3] = item_data.iloc[bright, -3] + " bright"
item_data.iloc[dark, -3] = item_data.iloc[dark, -3] + " dark"

item_data.iloc[cold, -2] = item_data.iloc[cold, -2] + " cold"
item_data.iloc[warm, -2] = item_data.iloc[warm, -2] + " warm"

item_data.iloc[heavy, -1] = item_data.iloc[heavy, -1] + " heavy"
item_data.iloc[light, -1] = item_data.iloc[light, -1] + " light"
item_data.head()

Unnamed: 0,item_name,era,style,gender,Q1,Q5,season_spring fall,season_summer,season_winter,tpo_attendance,...,manly_manly,manly_no,soft_no,soft_soft,season,tpo,fit,brightness,temperature,weight
0,W_00002_60_mods_M.jpg,1960,mods,M,3,2,1,0,0,1,...,1,0,1,0,spring fall,attendance,tight,bright,warm,light
1,W_00003_50_ivy_M.jpg,1950,ivy,M,13,8,3,0,1,2,...,4,0,4,0,spring fall winter,attendance event,appropriate loose,dark,cold warm,heavy light
2,W_00004_50_ivy_M.jpg,1950,ivy,M,4,2,1,0,0,1,...,0,1,1,0,spring fall,attendance,appropriate,bright,warm,light
3,W_00005_60_mods_M.jpg,1960,mods,M,3,2,1,0,1,0,...,0,2,2,0,spring fall winter,social gathering,appropriate loose,bright dark,cold warm,heavy light
4,W_00007_60_mods_M.jpg,1960,mods,M,13,8,4,0,0,1,...,2,2,4,0,spring fall,attendance event,appropriate tight,bright dark,cold warm,heavy light


In [22]:
for col_lll in item_data.columns.tolist()[-6:]:
    print(item_data[col_lll].unique())

['spring fall' 'spring fall winter' ' winter' 'spring fall summer'
 ' summer' 'spring fall summer winter' ' summer winter']
['attendance' 'attendance event' ' social gathering' ' daily'
 'attendance daily' ' event' 'attendance daily date event'
 ' event social trip vacation' 'attendance date' ' social trip vacation'
 ' daily leisure sports' 'attendance daily social gathering'
 'attendance event social gathering' ' daily etc leisure sports'
 'attendance date event' ' daily social gathering' ' date'
 ' date social gathering social trip vacation' ' daily date'
 ' daily social trip vacation' ' leisure sports' ' etc'
 ' event social gathering' ' daily date event social gathering'
 ' date social gathering' ' date event social gathering'
 ' daily date social trip vacation'
 'attendance daily event social gathering' ' date event'
 'attendance daily date' ' daily event' ' etc social trip vacation'
 ' date event social trip vacation' ' daily date social gathering'
 'attendance social gathering' 

In [23]:
item_data["item"] = item_data.reset_index()["index"]
item_data = \
item_data[['item','item_name', 'era', 'style', 'gender',
           'season', 'tpo', 'fit', 'brightness', 'temperature', 'weight',
           'nice_nice', 'nice_no', 'urban_no', 'urban_urban', 'trendy_no',
           'trendy_trendy', 'sophisticated_no', 'sophisticated_sophisticated',
           'clean_clean', 'clean_no', 'magnificent_magnificent', 'magnificent_no',
           'unique_no', 'unique_unique', 'easy_easy', 'easy_no', 'open_no',
           'open_open mined', 'practical_no', 'practical_practical',
           'activity_activity', 'activity_no', 'comfortable_comfortable',
           'comfortable_no', 'bubbly_bubbly', 'bubbly_no', 'feminine_feminine',
           'feminine_no', 'manly_manly', 'manly_no', 'soft_no', 'soft_soft']]

In [24]:
item_data['season'] = item_data['season'].str.strip()
item_data['tpo'] = item_data['tpo'].str.strip()
item_data['fit'] = item_data['fit'].str.strip()
item_data['brightness'] = item_data['brightness'].str.strip()
item_data['temperature'] = item_data['temperature'].str.strip()
item_data['weight'] = item_data['weight'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_data['season'] = item_data['season'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_data['tpo'] = item_data['tpo'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_data['fit'] = item_data['fit'].str.strip()
A value is trying to be set on a copy of a slice from a

In [25]:
item_data.head()

Unnamed: 0,item,item_name,era,style,gender,season,tpo,fit,brightness,temperature,...,comfortable_comfortable,comfortable_no,bubbly_bubbly,bubbly_no,feminine_feminine,feminine_no,manly_manly,manly_no,soft_no,soft_soft
0,0,W_00002_60_mods_M.jpg,1960,mods,M,spring fall,attendance,tight,bright,warm,...,0,1,0,1,0,1,1,0,1,0
1,1,W_00003_50_ivy_M.jpg,1950,ivy,M,spring fall winter,attendance event,appropriate loose,dark,cold warm,...,1,3,0,4,0,4,4,0,4,0
2,2,W_00004_50_ivy_M.jpg,1950,ivy,M,spring fall,attendance,appropriate,bright,warm,...,0,1,0,1,0,1,0,1,1,0
3,3,W_00005_60_mods_M.jpg,1960,mods,M,spring fall winter,social gathering,appropriate loose,bright dark,cold warm,...,0,2,0,2,0,2,0,2,2,0
4,4,W_00007_60_mods_M.jpg,1960,mods,M,spring fall,attendance event,appropriate tight,bright dark,cold warm,...,0,4,1,3,0,4,2,2,4,0


In [26]:
for re_col in item_data.columns.tolist()[11:]:
    item_data.loc[item_data[re_col] >= 1, re_col] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [27]:
item_data = int_memory_reduce(item_data)
item_data = float_memory_reduce(item_data)
item_data = object_memory_reduce(item_data)
item_data.to_csv("../KData/item_data.csv")
item_data

Before : 0.13 MB -> After : 0.03 MB
Before : 0.00 MB -> After : 0.00 MB
Before : 7.56 MB -> After : 1.41 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,item,item_name,era,style,gender,season,tpo,fit,brightness,temperature,...,comfortable_comfortable,comfortable_no,bubbly_bubbly,bubbly_no,feminine_feminine,feminine_no,manly_manly,manly_no,soft_no,soft_soft
0,0,W_00002_60_mods_M.jpg,1960,mods,M,spring fall,attendance,tight,bright,warm,...,0,1,0,1,0,1,1,0,1,0
1,1,W_00003_50_ivy_M.jpg,1950,ivy,M,spring fall winter,attendance event,appropriate loose,dark,cold warm,...,1,1,0,1,0,1,1,0,1,0
2,2,W_00004_50_ivy_M.jpg,1950,ivy,M,spring fall,attendance,appropriate,bright,warm,...,0,1,0,1,0,1,0,1,1,0
3,3,W_00005_60_mods_M.jpg,1960,mods,M,spring fall winter,social gathering,appropriate loose,bright dark,cold warm,...,0,1,0,1,0,1,0,1,1,0
4,4,W_00007_60_mods_M.jpg,1960,mods,M,spring fall,attendance event,appropriate tight,bright dark,cold warm,...,0,1,1,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16449,16449,W_19996_90_kitsch_W.jpg,1990,kitsch,W,spring fall,social trip vacation,appropriate,bright,cold warm,...,1,1,1,0,1,1,0,1,1,0
16450,16450,W_19997_00_oriental_W.jpg,2000,oriental,W,spring fall winter,etc social gathering,appropriate tight,bright,cold warm,...,0,1,0,1,0,1,0,1,1,0
16451,16451,W_19998_50_feminine_W.jpg,1950,feminine,W,spring fall winter,social gathering,appropriate,dark,cold warm,...,0,1,0,1,0,1,0,1,1,0
16452,16452,W_19999_00_oriental_W.jpg,2000,oriental,W,spring fall summer,event social gathering,appropriate tight,dark,warm,...,0,1,0,1,1,1,0,1,1,0


# 3. Rating Data Preprocessing

In [28]:
rate_data = kdf[["R_id", "imgName", "Q1", "Q5"]]
rate_data.columns = ["user", "item", "Q1", "Q5"]
print(rate_data.shape)
rate_data.head()

(38284, 4)


Unnamed: 0,user,item,Q1,Q5
0,27,W_15268_50_ivy_M.jpg,4,2
1,27,W_16543_50_ivy_M.jpg,2,1
2,27,W_17697_50_ivy_M.jpg,3,2
3,27,W_00485_60_mods_M.jpg,4,2
4,27,W_06723_60_mods_M.jpg,3,2


In [29]:
rate_data[["Q1", "Q5"]].corr()

Unnamed: 0,Q1,Q5
Q1,1.0,0.830661
Q5,0.830661,1.0


In [30]:
u_dict = user_data[["user", "user_name"]].astype("str").set_index("user_name")
u_dict = u_dict.T.to_dict('records')[0]

i_dict = item_data[["item", "item_name"]].astype("str").set_index("item_name")
i_dict = i_dict.T.to_dict('records')[0]

In [31]:
#rate_data["user"] = [int(u_dict[str(i)]) for i in rate_data["user"].values.tolist()]
rate_data["item"] = [int(i_dict[str(i)]) for i in rate_data["item"].values.tolist()]
rate_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rate_data["item"] = [int(i_dict[str(i)]) for i in rate_data["item"].values.tolist()]


Unnamed: 0,user,item,Q1,Q5
0,27,12843,4,2
1,27,13798,2,1
2,27,14541,3,2
3,27,401,4,2
4,27,5620,3,2


In [32]:
rate_data.shape

(38284, 4)

In [33]:
# rate_data = rate_data.loc[(rate_data["Q5"] == 2) & (rate_data["Q1"] >= 3)].reset_index(drop=True)

In [34]:
rate_data_q1q2 = rate_data[["user", "item", "Q1", "Q5"]]
rate_data_q1q2["rate"] = rate_data_q1q2[["Q1", "Q5"]].mean(axis=1)
rate_data_q1q2 = rate_data_q1q2[["user", "item", "rate"]]
rate_data_q1q2

Unnamed: 0,user,item,rate
0,27,12843,3.0
1,27,13798,1.5
2,27,14541,2.5
3,27,401,3.0
4,27,5620,2.5
...,...,...,...
38279,63992,14540,1.5
38280,63992,5936,2.5
38281,63992,6058,2.5
38282,63992,14043,1.5


In [35]:
rate_data_q1q2 = int_memory_reduce(rate_data_q1q2)
rate_data_q1q2 = float_memory_reduce(rate_data_q1q2)
rate_data_q1q2.to_csv("../KData/rate_data.csv")
rate_data_q1q2

Before : 0.58 MB -> After : 0.15 MB
Before : 0.29 MB -> After : 0.15 MB


Unnamed: 0,user,item,rate
0,27,12843,3.0
1,27,13798,1.5
2,27,14541,2.5
3,27,401,3.0
4,27,5620,2.5
...,...,...,...
38279,63992,14540,1.5
38280,63992,5936,2.5
38281,63992,6058,2.5
38282,63992,14043,1.5
