In [1]:
import os
import re
import random
import json
import pickle
import argparse
import ast
import scipy.sparse as ssp
from collections import defaultdict

from tqdm import tqdm
import pandas as pd
import numpy as np
import dgl
import torch
import torchtext.legacy as torchtext
from builder import PandasGraphBuilder
from data_utils import *
import layers
import sampler as sampler_module
import evaluation

### K-Deep fashion으로 적용

In [3]:
directory = "./KData"
output_path = "./graph_data/kdata_entire8.pkl"

In [4]:
users = pd.read_csv(os.path.join(directory, "user_data.csv"), index_col=0)
print(users.isna().sum())
print(users.shape)
users.head(2)

user         0
user_name    0
r_gender     0
age          0
mar          0
job          0
income       0
r_style1     0
r_style2     0
r_style3     0
r_style4     0
r_style5     0
dtype: int64
(822, 12)


Unnamed: 0,user,user_name,r_gender,age,mar,job,income,r_style1,r_style2,r_style3,r_style4,r_style5
0,0,27,1,4,2,4,1,2,6,2,2,1
1,1,133,1,2,1,6,2,1,1,2,2,2


In [5]:
columns = ['user_name', 'r_gender', 'age']#['user_name', 'r_gender', 'age', "mar", "job", "income", "r_style1", "r_style2", "r_style3", "r_style4", "r_style5"]
users = users[columns]
users.columns = ['userID', 'r_gender', 'age']
users = users.dropna(subset=['userID'])
print(users.shape)
users.head(2)

(822, 3)


Unnamed: 0,userID,r_gender,age
0,27,1,4
1,133,1,2


In [6]:
users = pd.get_dummies(users, columns = ['r_gender'])
users['user_feats'] = list(users[['r_gender_1', 'r_gender_2']].values)
del users["age"]
print(users.shape)
users.head(2)

(822, 4)


Unnamed: 0,userID,r_gender_1,r_gender_2,user_feats
0,27,1,0,"[1, 0]"
1,133,1,0,"[1, 0]"


In [7]:
items = pd.read_csv(os.path.join(directory, "item_data.csv"), index_col=0)
print(items.isna().sum())
print(items.shape)
items.head(2)

item                           0
item_name                      0
era                            0
style                          0
gender                         0
season                         0
tpo                            0
fit                            0
brightness                     0
temperature                    0
weight                         0
nice_nice                      0
nice_no                        0
urban_no                       0
urban_urban                    0
trendy_no                      0
trendy_trendy                  0
sophisticated_no               0
sophisticated_sophisticated    0
clean_clean                    0
clean_no                       0
magnificent_magnificent        0
magnificent_no                 0
unique_no                      0
unique_unique                  0
easy_easy                      0
easy_no                        0
open_no                        0
open_open mined                0
practical_no                   0
practical_

Unnamed: 0,item,item_name,era,style,gender,season,tpo,fit,brightness,temperature,...,comfortable_comfortable,comfortable_no,bubbly_bubbly,bubbly_no,feminine_feminine,feminine_no,manly_manly,manly_no,soft_no,soft_soft
0,0,W_00002_60_mods_M.jpg,1960,mods,M,spring fall,attendance,tight,bright,warm,...,0,1,0,1,0,1,1,0,1,0
1,1,W_00003_50_ivy_M.jpg,1950,ivy,M,spring fall winter,attendance event,appropriate loose,dark,cold warm,...,1,1,0,1,0,1,1,0,1,0


In [8]:
columns = ['item', 'era', 'style', 'gender', 'season'] + \
['tpo','fit','brightness','temperature','weight','nice_nice','nice_no','urban_no','urban_urban',
 'trendy_no','trendy_trendy','sophisticated_no','sophisticated_sophisticated','clean_clean','clean_no',
 'magnificent_magnificent','magnificent_no','unique_no','unique_unique','easy_easy','easy_no',
 'open_no','open_open mined','practical_no','practical_practical','activity_activity','activity_no',
 'comfortable_comfortable','comfortable_no','bubbly_bubbly','bubbly_no',
 'feminine_feminine','feminine_no','manly_manly','manly_no','soft_no','soft_soft']

items = items[columns]
print(items.columns)

Index(['item', 'era', 'style', 'gender', 'season', 'tpo', 'fit', 'brightness',
       'temperature', 'weight', 'nice_nice', 'nice_no', 'urban_no',
       'urban_urban', 'trendy_no', 'trendy_trendy', 'sophisticated_no',
       'sophisticated_sophisticated', 'clean_clean', 'clean_no',
       'magnificent_magnificent', 'magnificent_no', 'unique_no',
       'unique_unique', 'easy_easy', 'easy_no', 'open_no', 'open_open mined',
       'practical_no', 'practical_practical', 'activity_activity',
       'activity_no', 'comfortable_comfortable', 'comfortable_no',
       'bubbly_bubbly', 'bubbly_no', 'feminine_feminine', 'feminine_no',
       'manly_manly', 'manly_no', 'soft_no', 'soft_soft'],
      dtype='object')


In [9]:
items.columns = ['item_id', 'era', 'style', 'gender', 'season'] + \
['tpo','fit','brightness','temperature','weight','nice_nice','nice_no','urban_no','urban_urban',
 'trendy_no','trendy_trendy','sophisticated_no','sophisticated_sophisticated','clean_clean','clean_no',
 'magnificent_magnificent','magnificent_no','unique_no','unique_unique','easy_easy','easy_no',
 'open_no','open_open mined','practical_no','practical_practical','activity_activity','activity_no',
 'comfortable_comfortable','comfortable_no','bubbly_bubbly','bubbly_no',
 'feminine_feminine','feminine_no','manly_manly','manly_no','soft_no','soft_soft']
items = items.dropna(subset=['item_id'])
print(items.shape)
items.head(2)

(16454, 42)


Unnamed: 0,item_id,era,style,gender,season,tpo,fit,brightness,temperature,weight,...,comfortable_comfortable,comfortable_no,bubbly_bubbly,bubbly_no,feminine_feminine,feminine_no,manly_manly,manly_no,soft_no,soft_soft
0,0,1960,mods,M,spring fall,attendance,tight,bright,warm,light,...,0,1,0,1,0,1,1,0,1,0
1,1,1950,ivy,M,spring fall winter,attendance event,appropriate loose,dark,cold warm,heavy light,...,1,1,0,1,0,1,1,0,1,0


In [10]:
items = pd.get_dummies(items, columns = ["style", "era", 'gender'])
print(items.shape)
items

(16454, 76)


Unnamed: 0,item_id,season,tpo,fit,brightness,temperature,weight,nice_nice,nice_no,urban_no,...,era_1950,era_1960,era_1970,era_1980,era_1990,era_2000,era_2010,era_2019,gender_M,gender_W
0,0,spring fall,attendance,tight,bright,warm,light,0,1,1,...,0,1,0,0,0,0,0,0,1,0
1,1,spring fall winter,attendance event,appropriate loose,dark,cold warm,heavy light,1,1,1,...,1,0,0,0,0,0,0,0,1,0
2,2,spring fall,attendance,appropriate,bright,warm,light,0,1,0,...,1,0,0,0,0,0,0,0,1,0
3,3,spring fall winter,social gathering,appropriate loose,bright dark,cold warm,heavy light,1,1,1,...,0,1,0,0,0,0,0,0,1,0
4,4,spring fall,attendance event,appropriate tight,bright dark,cold warm,heavy light,1,1,1,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16449,16449,spring fall,social trip vacation,appropriate,bright,cold warm,light,0,1,1,...,0,0,0,0,1,0,0,0,0,1
16450,16450,spring fall winter,etc social gathering,appropriate tight,bright,cold warm,heavy light,0,1,1,...,0,0,0,0,0,1,0,0,0,1
16451,16451,spring fall winter,social gathering,appropriate,dark,cold warm,heavy,0,1,1,...,1,0,0,0,0,0,0,0,0,1
16452,16452,spring fall summer,event social gathering,appropriate tight,dark,warm,heavy,0,1,1,...,0,0,0,0,0,1,0,0,0,1


In [11]:
cat_columns = items.columns.drop(["item_id", "season", "tpo", "fit", "brightness", "temperature", "weight"])
cat_columns

Index(['nice_nice', 'nice_no', 'urban_no', 'urban_urban', 'trendy_no',
       'trendy_trendy', 'sophisticated_no', 'sophisticated_sophisticated',
       'clean_clean', 'clean_no', 'magnificent_magnificent', 'magnificent_no',
       'unique_no', 'unique_unique', 'easy_easy', 'easy_no', 'open_no',
       'open_open mined', 'practical_no', 'practical_practical',
       'activity_activity', 'activity_no', 'comfortable_comfortable',
       'comfortable_no', 'bubbly_bubbly', 'bubbly_no', 'feminine_feminine',
       'feminine_no', 'manly_manly', 'manly_no', 'soft_no', 'soft_soft',
       'style_athleisure', 'style_bodyconscious', 'style_bold',
       'style_cityglam', 'style_classic', 'style_disco', 'style_ecology',
       'style_feminine', 'style_genderless', 'style_grunge', 'style_hiphop',
       'style_hippie', 'style_ivy', 'style_kitsch', 'style_lingerie',
       'style_lounge', 'style_metrosexual', 'style_military', 'style_minimal',
       'style_mods', 'style_normcore', 'style_oriental'

In [12]:
items['item_feats'] = list(items[cat_columns].values)
print(items.shape)
items.head(2)

(16454, 77)


Unnamed: 0,item_id,season,tpo,fit,brightness,temperature,weight,nice_nice,nice_no,urban_no,...,era_1960,era_1970,era_1980,era_1990,era_2000,era_2010,era_2019,gender_M,gender_W,item_feats
0,0,spring fall,attendance,tight,bright,warm,light,0,1,1,...,1,0,0,0,0,0,0,1,0,"[0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, ..."
1,1,spring fall winter,attendance event,appropriate loose,dark,cold warm,heavy light,1,1,1,...,0,0,0,0,0,0,0,1,0,"[1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, ..."


In [13]:
items = items[["item_id", "season", "tpo", "fit", "brightness", "temperature", "weight", "item_feats"]]
print(items.shape)
items.head(2)

(16454, 8)


Unnamed: 0,item_id,season,tpo,fit,brightness,temperature,weight,item_feats
0,0,spring fall,attendance,tight,bright,warm,light,"[0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, ..."
1,1,spring fall winter,attendance event,appropriate loose,dark,cold warm,heavy light,"[1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, ..."


In [14]:
ratings = pd.read_csv(os.path.join(directory, "rate_data.csv"), index_col=0)
print(ratings.isna().sum())
print(ratings.shape)
ratings.head(2)

user    0
item    0
rate    0
dtype: int64
(38284, 3)


Unnamed: 0,user,item,rate
0,27,12843,3.0
1,27,13798,1.5


In [15]:
# Filter the users and items that never appear in the rating table.
distinct_users_in_ratings = ratings['user'].unique()
distinct_items_in_ratings = ratings['item'].unique()
users = users[users['userID'].isin(distinct_users_in_ratings)]
items = items[items['item_id'].isin(distinct_items_in_ratings)]

In [16]:
ratings.columns = ["userID", "item_id", "rating_per_user"]
print(ratings.isna().sum())
print(ratings.shape)
ratings

userID             0
item_id            0
rating_per_user    0
dtype: int64
(38284, 3)


Unnamed: 0,userID,item_id,rating_per_user
0,27,12843,3.0
1,27,13798,1.5
2,27,14541,2.5
3,27,401,3.0
4,27,5620,2.5
...,...,...,...
38279,63992,14540,1.5
38280,63992,5936,2.5
38281,63992,6058,2.5
38282,63992,14043,1.5


In [17]:
# Build Graph
# 아이템, 유저 DB에 존재하는 rating만 사용
user_intersect = set(ratings['userID'].values) & set(users['userID'].values)
item_intersect = set(ratings['item_id'].values) & set(items['item_id'].values)

new_users = users[users['userID'].isin(user_intersect)]
new_items = items[items['item_id'].isin(item_intersect)]
new_ratings = ratings[ratings['userID'].isin(user_intersect) & ratings['item_id'].isin(item_intersect)]
new_ratings = new_ratings.sort_values('userID')

In [18]:
label = []
for userID, df in new_ratings.groupby('userID'):
    idx = int(df.shape[0] * 0.8)
    timestamp = [0] * df.shape[0]
    timestamp = [x if i < idx else 1 for i, x in enumerate(timestamp)]
    label.extend(timestamp)
new_ratings['timestamp'] = label
new_ratings

Unnamed: 0,userID,item_id,rating_per_user,timestamp
0,27,12843,3.0,0
23,27,14256,2.5,0
22,27,13328,2.5,0
21,27,5825,2.5,0
20,27,5195,3.0,0
...,...,...,...,...
38253,63992,2404,2.5,1
38254,63992,9225,2.5,1
38255,63992,688,2.5,1
38244,63992,13341,1.5,1


In [30]:
label = []
for userID, df in new_ratings.groupby('userID'):
    idx = int(df.shape[0] * 0.8)
    idx2 = int(df.shape[0] * 0.9)
    timestamp = [0] * df.shape[0]
    tstamp = []
    for i, x in enumerate(timestamp):
        if idx <= i < idx2:
            tstamp.append(1)
        elif i >= idx2:
            tstamp.append(2)
        else:
            tstamp.append(x)
    label.extend(tstamp)
    #break
new_ratings['timestamp'] = label
print(new_ratings.loc[new_ratings['timestamp'] == 0, :].shape)
print(new_ratings.loc[new_ratings['timestamp'] == 1, :].shape)
print(new_ratings.loc[new_ratings['timestamp'] == 2, :].shape)
new_ratings

(30570, 4)
(3804, 4)
(3910, 4)


Unnamed: 0,userID,item_id,rating_per_user,timestamp
0,27,12843,3.0,0
23,27,14256,2.5,0
22,27,13328,2.5,0
21,27,5825,2.5,0
20,27,5195,3.0,0
...,...,...,...,...
38253,63992,2404,2.5,2
38254,63992,9225,2.5,2
38255,63992,688,2.5,2
38244,63992,13341,1.5,2


In [32]:
# Build graph
graph_builder = PandasGraphBuilder()
graph_builder.add_entities(users, 'userID', 'user')
graph_builder.add_entities(items, 'item_id', 'item')
graph_builder.add_binary_relations(new_ratings, 'userID', 'item_id', 'rated')
graph_builder.add_binary_relations(new_ratings, 'item_id', 'userID', 'rated-by')
g = graph_builder.build()

In [33]:
# Assign features.
node_dict = { 
    'user': [users, ['userID', 'user_feats'], ['cat', 'int']],
    'item': [items, ['item_id', 'item_feats'], ['cat', 'int']]
}
edge_dict = { 
    'rated': [new_ratings, ['rating_per_user', 'timestamp']],
    'rated-by': [new_ratings, ['rating_per_user', 'timestamp']]
}

In [34]:
for key, (df, features ,dtypes) in node_dict.items():
    for value, dtype in zip(features, dtypes):
        # key = 'user' or 'wine'
        # value = 'user_follower_count' 등등
        if dtype == 'int':
            array = np.array([i for i in df[value].values])
            g.nodes[key].data[value] = torch.FloatTensor(array)
        elif dtype == 'cat':
            g.nodes[key].data[value] = torch.LongTensor(df[value].astype('category').cat.codes.values)

for key, (df, features) in edge_dict.items():
    for value in features:
        g.edges[key].data[value] = torch.LongTensor(df[value].values.astype(np.float32))

  g.nodes[key].data[value] = torch.LongTensor(df[value].astype('category').cat.codes.values)


In [35]:
# 실제 ID와 카테고리 ID 딕셔너리
user_cat = users['userID'].astype('category').cat.codes.values
item_cat = items['item_id'].astype('category').cat.codes.values

user_cat_dict = {k: v for k, v in zip(user_cat, users['userID'].values)}
item_cat_dict = {k: v for k, v in zip(item_cat, items['item_id'].values)}


In [36]:
# Label
val_dict = defaultdict(set)
for userID, df in new_ratings.groupby('userID'):
    temp = df[df['timestamp'] == 1]
    val_dict[userID] = set(df[df['timestamp'] == 1]['item_id'].values)

In [37]:
# Label
te_dict = defaultdict(set)
for userID, df in new_ratings.groupby('userID'):
    temp = df[df['timestamp'] == 2]
    te_dict[userID] = set(df[df['timestamp'] == 2]['item_id'].values)

In [38]:
# Build title set
textual_feature = {
    "season" : items["season"].values,
    "tpo" : items["tpo"].values,
    "fit" : items["fit"].values,
    "brightness" : items["brightness"].values,
    "temperature" : items["temperature"].values,
}

In [39]:
new_ratings

Unnamed: 0,userID,item_id,rating_per_user,timestamp
0,27,12843,3.0,0
23,27,14256,2.5,0
22,27,13328,2.5,0
21,27,5825,2.5,0
20,27,5195,3.0,0
...,...,...,...,...
38253,63992,2404,2.5,2
38254,63992,9225,2.5,2
38255,63992,688,2.5,2
38244,63992,13341,1.5,2


In [40]:
new_ratings = new_ratings.sort_values(by="rating_per_user", ascending=False).reset_index(drop=True)

In [41]:
# Dump the graph and the datasets
dataset = {
    'train-graph': g,
    'user-data': users,
    'item-data': items, 
    'rating-data': new_ratings,
    'val-matrix': None,
    'test-matrix': torch.LongTensor([[0]]),
    'validset': val_dict,
    'testset': te_dict,
    'item-texts': textual_feature,
    'item-images': None,
    'user-type': 'user',
    'item-type': 'item',
    'user-category': user_cat_dict,
    'item-category': item_cat_dict,
    'user-to-item-type': 'rated',
    'item-to-user-type': 'rated-by',
    'timestamp-edge-column': 'timestamp'}

In [42]:
output_path

'./graph_data/kdata_entire8.pkl'

In [43]:
with open(output_path, 'wb') as f:
    pickle.dump(dataset, f)

    
print('Processing Completed!')

Processing Completed!


In [44]:
dataset

{'train-graph': Graph(num_nodes={'item': 16454, 'user': 822},
       num_edges={('item', 'rated-by', 'user'): 38284, ('user', 'rated', 'item'): 38284},
       metagraph=[('item', 'user', 'rated-by'), ('user', 'item', 'rated')]),
 'user-data':      userID  r_gender_1  r_gender_2 user_feats
 0        27           1           0     [1, 0]
 1       133           1           0     [1, 0]
 2       179           0           1     [0, 1]
 3       289           0           1     [0, 1]
 4      1022           0           1     [0, 1]
 ..      ...         ...         ...        ...
 817   63970           1           0     [1, 0]
 818   63976           0           1     [0, 1]
 819   63986           1           0     [1, 0]
 820   63989           1           0     [1, 0]
 821   63992           1           0     [1, 0]
 
 [822 rows x 4 columns],
 'item-data':        item_id              season                     tpo                fit  \
 0            0         spring fall              attendance 

In [None]:
ratings["pos_pair"] = 0
for u in tqdm.tqdm(ratings.userID.unique().tolist()):
    ratings.loc[ratings.userID == u, "pos_pair"] = str(ratings.loc[ratings.userID == u, "item_id"].values.tolist())

def strTolst(x):
    try:
        return ast.literal_eval(str(x))   
    except Exception as e:
        print(e)
        return []

ratings['pos_pair'] = ratings["pos_pair"].apply(lambda x: strTolst(x))
pair_df = ratings[["item_id", "pos_pair"]]
pair_df = pair_df.drop_duplicates(subset='item_id',keep='first')
pair_df = pair_df.sample(frac=1).reset_index(drop=True)
pair_df.to_csv("./output/pair_df.csv")