# 1. Training 전처리

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import json
import random

In [2]:
# 데이터 크기 확인 함수
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

## 타입별 평균 크기 확인 함수
def type_memory(data) :
    for dtype in ['float','int','object']:
        selected_dtype = data.select_dtypes(include=[dtype])
        mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
        mean_usage_mb = mean_usage_b / 1024 ** 2
        print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

## 이산형 데이터 사이즈 축소 함소
def int_memory_reduce(data) :
    data_int = data.select_dtypes(include=['int'])
    converted_int = data_int.apply(pd.to_numeric,downcast='unsigned')
    print(f"Before : {mem_usage(data_int)} -> After : {mem_usage(converted_int)}")
    data[converted_int.columns] = converted_int
    return data

## 연속형 데이터 사이즈 축소 함소
def float_memory_reduce(data) :
    data_float = data.select_dtypes(include=['float'])
    converted_float = data_float.apply(pd.to_numeric,downcast='float')
    print(f"Before : {mem_usage(data_float)} -> After : {mem_usage(converted_float)}")
    data[converted_float.columns] = converted_float
    return data

## 문자형 데이터 사이즈 축소 함소
def object_memory_reduce(data) :
    gl_obj = data.select_dtypes(include=['object']).copy()
    converted_obj = pd.DataFrame()
    for col in gl_obj.columns:
        num_unique_values = len(gl_obj[col].unique())
        num_total_values = len(gl_obj[col])
        if num_unique_values / num_total_values < 0.5:
            converted_obj.loc[:,col] = gl_obj[col].astype('category')
        else:
            converted_obj.loc[:,col] = gl_obj[col]
    print(f"Before : {mem_usage(gl_obj)} -> After : {mem_usage(converted_obj)}")
    data[converted_obj.columns] = converted_obj
    return data

In [3]:
# kdf = pd.read_excel("1.1차 선호도조사결과_data(0825)_10415.xlsx", sheet_name='Pre_Meta data_1.1차(0825)', engine='openpyxl')
# kdf = kdf.rename(columns=kdf.iloc[0])
# kdf = kdf.drop(kdf.index[0])
# print(kdf.shape)
# kdf.head(3)

In [4]:
kdf = pd.read_excel("1차 선호도조사결과(최종)_data(0831)_25155.xlsx", sheet_name='Pre_Meta data_1차(0831)', engine='openpyxl')
#kdf = kdf.rename(columns=kdf.iloc[0])
#kdf = kdf.drop(kdf.index[0])
print(kdf.shape)
kdf.head(3)

(25155, 23)


Unnamed: 0,E_id,R_id,imgNane,era,style,gender,Q1,Q2,Q3_1,Q3_2,...,Q5_2,Q5_3,r_gender,age,mar,child,edu,job,income,fa_expend
0,1,590,W_00206_50_W.jpg,1950,1950W,W,3,3,2.0,1.0,...,5.0,1.0,2,3,2,1,3,3,2,1
1,2,590,W_00246_50_W.jpg,1950,1950W,W,1,2,,,...,5.0,8.0,2,3,2,1,3,3,2,1
2,3,590,W_00567_50_W.jpg,1950,1950W,W,1,2,,,...,4.0,8.0,2,3,2,1,3,3,2,1


In [5]:
kdf.isna().sum()

E_id             0
R_id             0
imgNane          0
era              0
style            0
gender           0
Q1               0
Q2               0
Q3_1         11612
Q3_2         12169
Q3_3         13928
Q4               0
Q5_1             0
Q5_2          2508
Q5_3          6726
r_gender         0
age              0
mar              0
child            0
edu              0
job              0
income           0
fa_expend        0
dtype: int64

### 1) 유저 데이터프레임 생성

In [6]:
user_data = kdf[["R_id", "r_gender", "age"]]
user_data = user_data.drop_duplicates(['R_id']).reset_index(drop=True)
user_data.columns = ["user_name", "r_gender", "age"]
user_data["user"] = user_data.reset_index()["index"]
user_data = user_data[["user", "user_name", "r_gender", "age"]]
user_data = int_memory_reduce(user_data)
user_data = float_memory_reduce(user_data)
user_data = object_memory_reduce(user_data)
user_data.to_csv("user_data.csv")
user_data

Before : 0.02 MB -> After : 0.00 MB
Before : 0.00 MB -> After : 0.00 MB
Before : 0.00 MB -> After : 0.00 MB


Unnamed: 0,user,user_name,r_gender,age
0,0,590,2,3
1,1,1403,2,1
2,2,2305,2,1
3,3,8530,2,4
4,4,14389,2,3
...,...,...,...,...
534,534,60206,1,1
535,535,60208,1,1
536,536,60209,2,4
537,537,18887,2,2


### 2) 아이템 데이터프레임 생성

In [7]:
# Q5
q5_dict = {"1" : "business casual",
           "2" : "business formal",
           "3" : "daily",
           "4" : "esleisure",
           "5" : "date",
           "6" : "condolence",
           "7" : "guest",
           "8" : "party",
           "9" : "wedding"}

# Q4
q4_dict = {"1" : "spring fall", "2" : "summer", "3" : "winter"}

In [8]:
item_data = kdf[['imgNane','era','style','gender', "Q4", "Q5_1"]]
item_data.loc[:, "Q4"] = item_data.loc[:, "Q4"].astype("str")
item_data.loc[:, "Q5_1"] = item_data.loc[:,"Q5_1"].astype("str")
item_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25155 entries, 0 to 25154
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   imgNane  25155 non-null  object
 1   era      25155 non-null  int64 
 2   style    25155 non-null  object
 3   gender   25155 non-null  object
 4   Q4       25155 non-null  object
 5   Q5_1     25155 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.2+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [9]:
item_data["season"] = [q4_dict[str(i)] for i in item_data["Q4"].values.tolist()]
item_data["tpo"] = [q5_dict[str(i)] for i in item_data["Q5_1"].values.tolist()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_data["season"] = [q4_dict[str(i)] for i in item_data["Q4"].values.tolist()]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_data["tpo"] = [q5_dict[str(i)] for i in item_data["Q5_1"].values.tolist()]


In [10]:
item_data = pd.get_dummies(item_data, columns = ['season', "tpo"])
item_data = item_data.groupby(["imgNane", "era", "style", "gender"]).sum().reset_index()
item_data

Unnamed: 0,imgNane,era,style,gender,season_spring fall,season_summer,season_winter,tpo_business casual,tpo_business formal,tpo_condolence,tpo_daily,tpo_date,tpo_esleisure,tpo_guest,tpo_party,tpo_wedding
0,W_00001_60_M.jpg,1960,1960M,M,1,1,0,0,0,0,1,0,1,0,0,0
1,W_00002_60_M.jpg,1960,1960M,M,3,0,0,1,1,0,1,0,0,0,0,0
2,W_00003_60_M.jpg,1960,1960M,M,1,0,2,2,1,0,0,0,0,0,0,0
3,W_00004_60_M.jpg,1960,1960M,M,4,0,0,4,0,0,0,0,0,0,0,0
4,W_00005_50_M.jpg,1950,1950M,M,7,0,0,1,1,0,0,1,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8413,W_09408_19_W.jpg,2019,2019W,W,1,1,0,1,0,0,1,0,0,0,0,0
8414,W_09409_19_W.jpg,2019,2019W,W,2,0,0,1,0,0,1,0,0,0,0,0
8415,W_09410_19_W.jpg,2019,2019W,W,2,0,0,0,0,0,1,1,0,0,0,0
8416,W_09411_19_W.jpg,2019,2019W,W,0,2,0,0,0,0,1,0,0,0,1,0


In [11]:
item_data.columns = ["item_name", "era", "style", "gender"] + item_data.columns.tolist()[-12:]
item_data = int_memory_reduce(item_data)
item_data = float_memory_reduce(item_data)
item_data = object_memory_reduce(item_data)
item_data

Before : 0.06 MB -> After : 0.02 MB
Before : 0.00 MB -> After : 0.00 MB
Before : 1.55 MB -> After : 0.60 MB


Unnamed: 0,item_name,era,style,gender,season_spring fall,season_summer,season_winter,tpo_business casual,tpo_business formal,tpo_condolence,tpo_daily,tpo_date,tpo_esleisure,tpo_guest,tpo_party,tpo_wedding
0,W_00001_60_M.jpg,1960,1960M,M,1,1,0,0,0,0,1,0,1,0,0,0
1,W_00002_60_M.jpg,1960,1960M,M,3,0,0,1,1,0,1,0,0,0,0,0
2,W_00003_60_M.jpg,1960,1960M,M,1,0,2,2,1,0,0,0,0,0,0,0
3,W_00004_60_M.jpg,1960,1960M,M,4,0,0,4,0,0,0,0,0,0,0,0
4,W_00005_50_M.jpg,1950,1950M,M,7,0,0,1,1,0,0,1,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8413,W_09408_19_W.jpg,2019,2019W,W,1,1,0,1,0,0,1,0,0,0,0,0
8414,W_09409_19_W.jpg,2019,2019W,W,2,0,0,1,0,0,1,0,0,0,0,0
8415,W_09410_19_W.jpg,2019,2019W,W,2,0,0,0,0,0,1,1,0,0,0,0
8416,W_09411_19_W.jpg,2019,2019W,W,0,2,0,0,0,0,1,0,0,0,1,0


In [12]:
business_casual = item_data[item_data["tpo_business casual"]>=1].index
business_formal = item_data[item_data["tpo_business formal"]>=1].index
condolence = item_data[item_data["tpo_condolence"]>=1].index
daily = item_data[item_data["tpo_daily"]>=1].index
date = item_data[item_data["tpo_date"]>=1].index
esleisure = item_data[item_data["tpo_esleisure"]>=1].index
guest = item_data[item_data["tpo_guest"]>=1].index
party = item_data[item_data["tpo_party"]>=1].index
wedding = item_data[item_data["tpo_wedding"]>=1].index

In [13]:
 item_data["tpo"] = ""

In [14]:
item_data.iloc[business_casual, -1] = "business casual"
item_data.iloc[business_formal, -1] = item_data.iloc[business_formal, -1] + " business formal"
item_data.iloc[condolence, -1] = item_data.iloc[condolence, -1] + " condolence"
item_data.iloc[daily, -1] = item_data.iloc[daily, -1] + " daily"
item_data.iloc[date, -1] = item_data.iloc[date, -1] + " date"
item_data.iloc[esleisure, -1] = item_data.iloc[esleisure, -1] + " esleisure"
item_data.iloc[guest, -1] = item_data.iloc[guest, -1] + " guest"
item_data.iloc[party, -1] = item_data.iloc[party, -1] + " party"
item_data.iloc[wedding, -1] = item_data.iloc[wedding, -1] + " wedding"
item_data

Unnamed: 0,item_name,era,style,gender,season_spring fall,season_summer,season_winter,tpo_business casual,tpo_business formal,tpo_condolence,tpo_daily,tpo_date,tpo_esleisure,tpo_guest,tpo_party,tpo_wedding,tpo
0,W_00001_60_M.jpg,1960,1960M,M,1,1,0,0,0,0,1,0,1,0,0,0,daily esleisure
1,W_00002_60_M.jpg,1960,1960M,M,3,0,0,1,1,0,1,0,0,0,0,0,business casual business formal daily
2,W_00003_60_M.jpg,1960,1960M,M,1,0,2,2,1,0,0,0,0,0,0,0,business casual business formal
3,W_00004_60_M.jpg,1960,1960M,M,4,0,0,4,0,0,0,0,0,0,0,0,business casual
4,W_00005_50_M.jpg,1950,1950M,M,7,0,0,1,1,0,0,1,0,0,4,0,business casual business formal date party
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8413,W_09408_19_W.jpg,2019,2019W,W,1,1,0,1,0,0,1,0,0,0,0,0,business casual daily
8414,W_09409_19_W.jpg,2019,2019W,W,2,0,0,1,0,0,1,0,0,0,0,0,business casual daily
8415,W_09410_19_W.jpg,2019,2019W,W,2,0,0,0,0,0,1,1,0,0,0,0,daily date
8416,W_09411_19_W.jpg,2019,2019W,W,0,2,0,0,0,0,1,0,0,0,1,0,daily party


In [15]:
item_data["item"] = item_data.reset_index()["index"]
item_data = item_data[["item", "item_name", "era", "style", "gender", "tpo", "season_spring fall", "season_summer", "season_winter"]]
item_data = int_memory_reduce(item_data)
item_data = float_memory_reduce(item_data)
item_data = object_memory_reduce(item_data)
item_data

Before : 0.06 MB -> After : 0.02 MB
Before : 0.00 MB -> After : 0.00 MB
Before : 1.16 MB -> After : 0.62 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,item,item_name,era,style,gender,tpo,season_spring fall,season_summer,season_winter
0,0,W_00001_60_M.jpg,1960,1960M,M,daily esleisure,1,1,0
1,1,W_00002_60_M.jpg,1960,1960M,M,business casual business formal daily,3,0,0
2,2,W_00003_60_M.jpg,1960,1960M,M,business casual business formal,1,0,2
3,3,W_00004_60_M.jpg,1960,1960M,M,business casual,4,0,0
4,4,W_00005_50_M.jpg,1950,1950M,M,business casual business formal date party,7,0,0
...,...,...,...,...,...,...,...,...,...
8413,8413,W_09408_19_W.jpg,2019,2019W,W,business casual daily,1,1,0
8414,8414,W_09409_19_W.jpg,2019,2019W,W,business casual daily,2,0,0
8415,8415,W_09410_19_W.jpg,2019,2019W,W,daily date,2,0,0
8416,8416,W_09411_19_W.jpg,2019,2019W,W,daily party,0,2,0


In [16]:
item_data['tpo'] = item_data['tpo'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_data['tpo'] = item_data['tpo'].str.strip()


In [17]:
item_data.loc[item_data["season_spring fall"] > 0, ["season_spring fall"] ] = 1
item_data.loc[item_data["season_summer"] > 0, ["season_summer"] ] = 1
item_data.loc[item_data["season_winter"] > 0, ["season_winter"] ] = 1
item_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,item,item_name,era,style,gender,tpo,season_spring fall,season_summer,season_winter
0,0,W_00001_60_M.jpg,1960,1960M,M,daily esleisure,1,1,0
1,1,W_00002_60_M.jpg,1960,1960M,M,business casual business formal daily,1,0,0
2,2,W_00003_60_M.jpg,1960,1960M,M,business casual business formal,1,0,1
3,3,W_00004_60_M.jpg,1960,1960M,M,business casual,1,0,0
4,4,W_00005_50_M.jpg,1950,1950M,M,business casual business formal date party,1,0,0
...,...,...,...,...,...,...,...,...,...
8413,8413,W_09408_19_W.jpg,2019,2019W,W,business casual daily,1,1,0
8414,8414,W_09409_19_W.jpg,2019,2019W,W,business casual daily,1,0,0
8415,8415,W_09410_19_W.jpg,2019,2019W,W,daily date,1,0,0
8416,8416,W_09411_19_W.jpg,2019,2019W,W,daily party,0,1,0


In [18]:
sty_dicts = {"1950M" : "men ivy look",
             "1960M" : "men mose look",
             "1970M" : "men hippy",
             "1980M" : "men bold look",
             "1990M" : "men hiphop",
             "2000M" : "men metro sexual",
             "2010M" : "men sportive casual",
             "2019M" : "men normcore",
             "1950W" : "women feminine",
             "1960W" : "women minimal look",
             "1970W" : "women hippy",
             "1980W" : "women bold look",
             "1990W" : "women hiphop",
             "2000W" : "women millennium",
             "2010W" : "women sportive casual",
             "2019W" : "women normcore"}

#rate_data["user"] = [int(u_dict[str(i)]) for i in rate_data["user"].values.tolist()]
item_data["style"] = [sty_dicts[str(i)] for i in item_data["style"].values.tolist()]
item_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_data["style"] = [sty_dicts[str(i)] for i in item_data["style"].values.tolist()]


Unnamed: 0,item,item_name,era,style,gender,tpo,season_spring fall,season_summer,season_winter
0,0,W_00001_60_M.jpg,1960,men mose look,M,daily esleisure,1,1,0
1,1,W_00002_60_M.jpg,1960,men mose look,M,business casual business formal daily,1,0,0
2,2,W_00003_60_M.jpg,1960,men mose look,M,business casual business formal,1,0,1
3,3,W_00004_60_M.jpg,1960,men mose look,M,business casual,1,0,0
4,4,W_00005_50_M.jpg,1950,men ivy look,M,business casual business formal date party,1,0,0
...,...,...,...,...,...,...,...,...,...
8413,8413,W_09408_19_W.jpg,2019,women normcore,W,business casual daily,1,1,0
8414,8414,W_09409_19_W.jpg,2019,women normcore,W,business casual daily,1,0,0
8415,8415,W_09410_19_W.jpg,2019,women normcore,W,daily date,1,0,0
8416,8416,W_09411_19_W.jpg,2019,women normcore,W,daily party,0,1,0


In [19]:
item_data = int_memory_reduce(item_data)
item_data = float_memory_reduce(item_data)
item_data = object_memory_reduce(item_data)
item_data.to_csv("item_data.csv")
item_data

Before : 0.00 MB -> After : 0.00 MB
Before : 0.00 MB -> After : 0.00 MB
Before : 1.73 MB -> After : 0.63 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,item,item_name,era,style,gender,tpo,season_spring fall,season_summer,season_winter
0,0,W_00001_60_M.jpg,1960,men mose look,M,daily esleisure,1,1,0
1,1,W_00002_60_M.jpg,1960,men mose look,M,business casual business formal daily,1,0,0
2,2,W_00003_60_M.jpg,1960,men mose look,M,business casual business formal,1,0,1
3,3,W_00004_60_M.jpg,1960,men mose look,M,business casual,1,0,0
4,4,W_00005_50_M.jpg,1950,men ivy look,M,business casual business formal date party,1,0,0
...,...,...,...,...,...,...,...,...,...
8413,8413,W_09408_19_W.jpg,2019,women normcore,W,business casual daily,1,1,0
8414,8414,W_09409_19_W.jpg,2019,women normcore,W,business casual daily,1,0,0
8415,8415,W_09410_19_W.jpg,2019,women normcore,W,daily date,1,0,0
8416,8416,W_09411_19_W.jpg,2019,women normcore,W,daily party,0,1,0


### 3) rating 데이터프레임 생성

In [20]:
rate_data = kdf[["R_id", "imgNane", "Q1", "Q2", "Q3_1", "Q3_2", "Q3_3", "Q4", "Q5_1", "Q5_2", "Q5_3"]]
rate_data.columns = ["user", "item", "Q1", "Q2", "Q3_1", "Q3_2", "Q3_3", "Q4", "Q5_1", "Q5_2", "Q5_3"]
rate_data

Unnamed: 0,user,item,Q1,Q2,Q3_1,Q3_2,Q3_3,Q4,Q5_1,Q5_2,Q5_3
0,590,W_00206_50_W.jpg,3,3,2.0,1.0,5.0,2,3,5.0,1.0
1,590,W_00246_50_W.jpg,1,2,,,,1,4,5.0,8.0
2,590,W_00567_50_W.jpg,1,2,,,,2,5,4.0,8.0
3,590,W_02256_50_W.jpg,1,2,,,,2,5,3.0,4.0
4,590,W_03860_50_W.jpg,3,3,2.0,1.0,3.0,2,3,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
25150,53554,W_03744_19_W.jpg,1,1,,,,2,3,8.0,4.0
25151,53554,W_03810_19_W.jpg,1,2,,,,1,3,1.0,5.0
25152,53554,W_03874_19_W.jpg,3,3,3.0,4.0,1.0,1,1,2.0,6.0
25153,53554,W_04645_19_W.jpg,2,2,,,,3,1,5.0,3.0


In [21]:
u_dict = user_data[["user", "user_name"]].astype("str").set_index("user_name")
u_dict = u_dict.T.to_dict('records')[0]

i_dict = item_data[["item", "item_name"]].astype("str").set_index("item_name")
i_dict = i_dict.T.to_dict('records')[0]

In [22]:
#rate_data["user"] = [int(u_dict[str(i)]) for i in rate_data["user"].values.tolist()]
rate_data["item"] = [int(i_dict[str(i)]) for i in rate_data["item"].values.tolist()]
rate_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rate_data["item"] = [int(i_dict[str(i)]) for i in rate_data["item"].values.tolist()]


Unnamed: 0,user,item,Q1,Q2,Q3_1,Q3_2,Q3_3,Q4,Q5_1,Q5_2,Q5_3
0,590,197,3,3,2.0,1.0,5.0,2,3,5.0,1.0
1,590,236,1,2,,,,1,4,5.0,8.0
2,590,553,1,2,,,,2,5,4.0,8.0
3,590,2172,1,2,,,,2,5,3.0,4.0
4,590,3735,3,3,2.0,1.0,3.0,2,3,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
25150,53554,3620,1,1,,,,2,3,8.0,4.0
25151,53554,3685,1,2,,,,1,3,1.0,5.0
25152,53554,3749,3,3,3.0,4.0,1.0,1,1,2.0,6.0
25153,53554,4460,2,2,,,,3,1,5.0,3.0


In [23]:
rate_data_q1q2 = rate_data[["user", "item", "Q1", "Q2"]]
rate_data_q1q2["rate"] = rate_data_q1q2[["Q1", "Q2"]].mean(axis=1)
rate_data_q1q2 = rate_data_q1q2[["user", "item", "rate"]]
rate_data_q1q2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rate_data_q1q2["rate"] = rate_data_q1q2[["Q1", "Q2"]].mean(axis=1)


Unnamed: 0,user,item,rate
0,590,197,3.0
1,590,236,1.5
2,590,553,1.5
3,590,2172,1.5
4,590,3735,3.0
...,...,...,...
25150,53554,3620,1.0
25151,53554,3685,1.5
25152,53554,3749,3.0
25153,53554,4460,2.0


In [24]:
#rate_data_q1q2 = rate_data_q1q2[rate_data_q1q2.rate >= 2.5].reset_index(drop=True)
#rate_data_q1q2
rate_data_q1q2 = int_memory_reduce(rate_data_q1q2)
rate_data_q1q2 = float_memory_reduce(rate_data_q1q2)
rate_data_q1q2.to_csv("rate_data.csv")
rate_data_q1q2

Before : 0.38 MB -> After : 0.10 MB
Before : 0.19 MB -> After : 0.10 MB


Unnamed: 0,user,item,rate
0,590,197,3.0
1,590,236,1.5
2,590,553,1.5
3,590,2172,1.5
4,590,3735,3.0
...,...,...,...
25150,53554,3620,1.0
25151,53554,3685,1.5
25152,53554,3749,3.0
25153,53554,4460,2.0


In [25]:
# json_data = {"R_id" : 1268, 
#  "imgName" : "W_00065_50_M.jpg",
#  "item" : {"imgName" : "W_00065_50_M.jpg", "era" : 1950, "style" : "1950M", "gender" : "M", 
#            "survey" : {"Q1" : 3, 
#                        "Q2" : 3, 
#                        "Q3" : [3, 1, 2], 
#                        "Q4" : 1, 
#                        "Q5" : [3, 1, 5]}
#           },
 
#  "user" : {"R_id" : 1268, "r_gender" : 1, "age" : 2, 
#            "mar" : 1, "child" : 2, "edu" : 4, 
#            "job" : 7, "income" : 2, "fa_expend" : 2}
# }
# print(json.dumps(json_data, indent=2))

# json_data = {"R_id" : 1268, 
#  "imgName" : "W_00065_50_M.jpg",
#  "item" : {"imgName" : "W_00065_50_M.jpg", "era" : 1950, "style" : "1950M", "gender" : "M", 
#            "survey" : {"Q1" : 3, 
#                        "Q2" : 3, 
#                        "Q3" : ["None", "None", "None"], 
#                        "Q4" : 1, 
#                        "Q5" : [3, 1, "None"]}
#           },
 
#  "user" : {"R_id" : 1268, "r_gender" : 1, "age" : 2, 
#            "mar" : 1, "child" : 2, "edu" : 4, 
#            "job" : 7, "income" : 2, "fa_expend" : 2}
# }
# print(json.dumps(json_data, indent=2))

In [26]:
# with open("W_00180_50_W_002932.json", "r") as st_json:
#     st_p = json.load(st_json)

# 2. Hit Rate 측정을 위한 전처리

In [None]:
entire_idx = []
for i in rate_data_q1q2["user"].unique().tolist():
    random.seed(329)
    idx_list = list(rate_data_q1q2.loc[(rate_data_q1q2["user"] == i) & (rate_data_q1q2["rate"] >= 2.5)].index)
    try:
        sample_idx = random.sample(idx_list, 10)
        entire_idx.append(sample_idx)
    except:
        entire_idx.append(idx_list)
entire_idx = sum(entire_idx , [])

In [None]:
rd = rate_data_q1q2.iloc[entire_idx].reset_index(drop=True)

In [None]:
hr_train_index = []
hr_test_index = []
for i in rd["user"].unique().tolist():
    if len(rd.loc[(rd["user"] == i)]) >= 1:
        idx = int(rd.loc[(rd["user"] == i)].index[0])
        hr_test_index.append(idx)
        
        tridx = list(rd.loc[(rd["user"] == i)].index)
        tridx.remove(idx)
        hr_train_index.append(tridx)
    else:
        tridx = list(rd.loc[(rd["user"] == i)].index)
        hr_train_index.append(tridx)

In [None]:
hr_train_index = sum(hr_train_index , [])
hr_tr = rate_data_q1q2_.iloc[hr_train_index].reset_index(drop=True)
hr_tr.to_csv("/mnt/hdd1/wearly/GraphRec/kdeepfashion/HR_train.csv")

In [None]:
hr_te = rate_data_q1q2_.iloc[hr_test_index].reset_index(drop=True)
hr_te.to_csv("/mnt/hdd1/wearly/GraphRec/kdeepfashion/HR_test.csv")

In [None]:
hr_tr.append(hr_te)["item"].nunique()

In [None]:
hr_tr["item"].nunique()

In [None]:
z = hr_te.loc[(hr_te["user"] == 538), "item"].values[0]
z in rate_data_q1q2.loc[(rate_data_q1q2["user"] == 538)].sort_values("rate", ascending=False).head(10)["item"].tolist()

In [None]:
z in rate_data_q1q2.loc[(rate_data_q1q2["user"] == 538)].sort_values("rate", ascending=False).head(10)["item"].tolist()

In [None]:
rate_data_q1q2.loc[(rate_data_q1q2["user"] == 538)].sort_values("rate", ascending=False)["item"]