In [1]:
import os
import re
import json
import pickle
import argparse
import scipy.sparse as ssp
from collections import defaultdict

from tqdm import tqdm
import pandas as pd
import numpy as np
import dgl
import torch
import torchtext.legacy as torchtext
from builder import PandasGraphBuilder
from data_utils import *
import layers
import sampler as sampler_module
import evaluation

### K-Deep fashion으로 적용

In [9]:
# Input directory; user_data.csv, item_data.csv and rate_data.csv are read from it below.
directory = "./kdata"
# Destination of the pickled training dataset written at the end of the first section.
output_path = "./output/kdata_new9.pkl"

In [10]:
# Read the user table; the CSV's first column becomes the row index.
users_csv_path = os.path.join(directory, "user_data.csv")
users = pd.read_csv(users_csv_path, index_col=0)

# Sanity report: missing values per column, then (rows, columns).
for summary in (users.isna().sum(), users.shape):
    print(summary)
users.head(2)

user         0
user_name    0
r_gender     0
age          0
dtype: int64
(539, 4)


Unnamed: 0,user,user_name,r_gender,age
0,0,590,2,3
1,1,1403,2,1


In [11]:
# Keep three columns, expose `user_name` under the name `userID`,
# and drop rows that have no userID.
columns = ['user_name', 'r_gender', 'age']
users = (
    users.loc[:, columns]
    .rename(columns={'user_name': 'userID'})
    .dropna(subset=['userID'])
)
print(users.shape)
users.head(2)

(539, 3)


Unnamed: 0,userID,r_gender,age
0,590,2,3
1,1403,2,1


In [12]:
# One-hot encode `r_gender`, then pack the two indicator columns into a
# single per-user feature vector stored in `user_feats`.
users = pd.get_dummies(users, columns=['r_gender'])
gender_onehot = users[['r_gender_1', 'r_gender_2']].values
users['user_feats'] = [row for row in gender_onehot]
# `age` is not used as a feature.
users = users.drop(columns=['age'])
print(users.shape)
users.head(2)

(539, 4)


Unnamed: 0,userID,r_gender_1,r_gender_2,user_feats
0,590,0,1,"[0, 1]"
1,1403,0,1,"[0, 1]"


In [13]:
# Read the item table; the CSV's first column becomes the row index.
items_csv_path = os.path.join(directory, "item_data.csv")
items = pd.read_csv(items_csv_path, index_col=0)

# Sanity report: missing values per column, then (rows, columns).
for summary in (items.isna().sum(), items.shape):
    print(summary)
items.head(2)

item         0
item_name    0
era          0
style        0
gender       0
season       0
tpo          0
dtype: int64
(8418, 7)


Unnamed: 0,item,item_name,era,style,gender,season,tpo
0,0,W_00001_60_M.jpg,1960,men mose look,M,spring fall summer,daily esleisure
1,1,W_00002_60_M.jpg,1960,men mose look,M,spring fall,business casual business formal daily


In [8]:
# Keep the columns used downstream, expose `item` under the name `item_id`,
# and drop rows that have no item_id.
columns = ['item', 'style', 'gender', "season", "tpo"]
items = (
    items.loc[:, columns]
    .rename(columns={'item': 'item_id'})
    .dropna(subset=['item_id'])
)
print(items.shape)
items.head(2)

(8418, 5)


Unnamed: 0,item_id,style,gender,season,tpo
0,0,men mose look,M,spring fall summer,daily esleisure
1,1,men mose look,M,spring fall,business casual business formal daily


In [None]:
# One-hot encode `gender`; the column is replaced by `gender_*` indicator columns.
items = pd.get_dummies(data=items, columns=['gender'])
print(items.shape)
items

In [None]:
# Everything except the identifier and the free-text columns is treated as a
# categorical indicator column.
non_indicator_columns = ['item_id', 'style', "season", "tpo"]
cat_columns = items.columns.drop(non_indicator_columns)
cat_columns

In [None]:
# Pack the indicator columns into one feature vector per item.
indicator_matrix = items[cat_columns].values
items['item_feats'] = [row for row in indicator_matrix]
print(items.shape)
items.head(2)

In [None]:
# Keep only the identifier, the `tpo` text column and the packed feature vector.
kept_item_columns = ["item_id", "tpo", "item_feats"]
items = items.loc[:, kept_item_columns]
print(items.shape)
items.head(2)

In [None]:
# Read the rating table; the CSV's first column becomes the row index.
ratings_csv_path = os.path.join(directory, "rate_data.csv")
ratings = pd.read_csv(ratings_csv_path, index_col=0)

# Sanity report: missing values per column, then (rows, columns).
for summary in (ratings.isna().sum(), ratings.shape):
    print(summary)
ratings.head(2)

In [None]:
# Drop users and items that never appear in the rating table.
# NOTE: this cell reads the raw `user` / `item` column names, so it must run
# before the cell that renames the rating columns.
distinct_users_in_ratings = ratings['user'].unique()
distinct_items_in_ratings = ratings['item'].unique()
user_is_rated = users['userID'].isin(distinct_users_in_ratings)
item_is_rated = items['item_id'].isin(distinct_items_in_ratings)
users = users.loc[user_is_rated]
items = items.loc[item_is_rated]

In [None]:
# Rename the rating columns positionally to the names used by the rest of the notebook.
rating_column_names = ["userID", "item_id", "rating_per_user"]
ratings.columns = rating_column_names
for summary in (ratings.isna().sum(), ratings.shape):
    print(summary)
ratings

In [None]:
# Build Graph
# Use only ratings whose user and item both exist in the user/item tables.
rated_user_ids = set(ratings['userID'].values)
rated_item_ids = set(ratings['item_id'].values)
user_intersect = rated_user_ids & set(users['userID'].values)
item_intersect = rated_item_ids & set(items['item_id'].values)

new_users = users[users['userID'].isin(user_intersect)]
new_items = items[items['item_id'].isin(item_intersect)]

user_is_known = ratings['userID'].isin(user_intersect)
item_is_known = ratings['item_id'].isin(item_intersect)
new_ratings = ratings[user_is_known & item_is_known].sort_values('userID')

In [None]:
# Per-user hold-out split, stored in a pseudo "timestamp" column:
# 0 marks the first int(n * TRAIN_FRACTION) rows of a user, 1 marks the rest.
# The flag is computed per row from the row's position inside its user group,
# so it no longer depends on the frame being sorted by userID (the previous
# version concatenated labels in groupby order and assigned them positionally).
TRAIN_FRACTION = 0.7

ratings_by_user = new_ratings.groupby('userID')
# Number of ratings of the row's user, broadcast back to every row.
ratings_per_user = ratings_by_user['userID'].transform('count')
# 0-based position of each row within its user group, in current row order.
position_in_user = ratings_by_user.cumcount()
# Same truncation as int(n * 0.7).
train_cutoff = (ratings_per_user * TRAIN_FRACTION).astype('int64')

# Assign positionally (plain array) so index alignment cannot reorder values.
new_ratings['timestamp'] = (position_in_user >= train_cutoff).astype('int64').to_numpy()
print(new_ratings.isna().sum())
print(new_ratings.shape)
new_ratings

In [None]:
# Build graph
# Register both node tables, then both directions of the rating relation.
graph_builder = PandasGraphBuilder()
for entity_table, primary_key, node_type in (
    (users, 'userID', 'user'),
    (items, 'item_id', 'item'),
):
    graph_builder.add_entities(entity_table, primary_key, node_type)
for source_key, destination_key, edge_type in (
    ('userID', 'item_id', 'rated'),
    ('item_id', 'userID', 'rated-by'),
):
    graph_builder.add_binary_relations(new_ratings, source_key, destination_key, edge_type)
g = graph_builder.build()

In [None]:
# Assign features.
# node_dict: node type -> [table, feature columns, kind per column].
# The assignment loop stores kind 'cat' as integer category codes and kind
# 'int' as a float tensor.
# edge_dict: edge type -> [table, feature columns].
edge_feature_columns = ['rating_per_user', 'timestamp']
node_dict = {
    'user': [users, ['userID', 'user_feats'], ['cat', 'int']],
    'item': [items, ['item_id', 'item_feats'], ['cat', 'int']],
}
edge_dict = {
    edge_type: [new_ratings, list(edge_feature_columns)]
    for edge_type in ('rated', 'rated-by')
}

# ID-only alternative (no user_feats / item_feats):
# node_dict = {
#     'user': [users, ['userID'], ['cat']],
#     'item': [items, ['item_id'], ['cat']]
# }

In [None]:
# Copy node features from the tables onto the graph.
# node_dict maps a node type ('user' or 'item') to
# [table, feature column names, feature kinds]; names and kinds are paired up.
for node_type, (node_table, feature_names, feature_kinds) in node_dict.items():
    for feature_name, feature_kind in zip(feature_names, feature_kinds):
        if feature_kind == 'int':
            # Each cell holds one vector; stack the cells into a single array
            # and store it as a float tensor.
            stacked = np.array(list(node_table[feature_name].values))
            g.nodes[node_type].data[feature_name] = torch.FloatTensor(stacked)
        elif feature_kind == 'cat':
            # Store the column as integer category codes.
            codes = node_table[feature_name].astype('category').cat.codes.values
            g.nodes[node_type].data[feature_name] = torch.LongTensor(codes)

# Copy edge features; edge_dict maps an edge type to [table, feature column names].
for edge_type, (edge_table, feature_names) in edge_dict.items():
    for feature_name in feature_names:
        # Cast straight to int64. The previous detour through float32 was
        # unnecessary and cannot represent integers above 2**24 exactly.
        edge_values = edge_table[feature_name].values.astype(np.int64)
        g.edges[edge_type].data[feature_name] = torch.LongTensor(edge_values)

In [None]:
# Dictionaries mapping category code -> actual ID.
# Codes come from converting the ID column to a pandas categorical.
user_cat = users['userID'].astype('category').cat.codes.values
item_cat = items['item_id'].astype('category').cat.codes.values

user_cat_dict = dict(zip(user_cat, users['userID'].values))
item_cat_dict = dict(zip(item_cat, items['item_id'].values))


In [None]:
# Label
# For every user, collect the set of item IDs whose rating row is flagged
# timestamp == 1 (the held-out part of the split).
val_dict = defaultdict(set)
for user_id, user_ratings in new_ratings.groupby('userID'):
    held_out_items = user_ratings.loc[user_ratings['timestamp'] == 1, 'item_id']
    val_dict[user_id] = set(held_out_items.values)

In [None]:
# Build title set
# Only `tpo` is exported as item text; `style` and `season` are disabled.
textual_feature = {"tpo": items["tpo"].values}

In [None]:
# Display the item table as it will be stored under 'item-data'.
items

In [None]:
# Dump the graph and the datasets
# The payload is assembled in groups and merged in the original key order.
graph_and_tables = {
    'train-graph': g,
    'user-data': users,
    'item-data': items,
    'rating-data': new_ratings,
}
evaluation_data = {
    # No validation matrix; the test matrix is a 1x1 tensor holding 0.
    'val-matrix': None,
    'test-matrix': torch.LongTensor([[0]]),
    # userID -> set of item IDs flagged timestamp == 1.
    'testset': val_dict,
}
item_content = {
    'item-texts': textual_feature,
    'item-images': None,
}
schema_names = {
    'user-type': 'user',
    'item-type': 'item',
    # category code -> actual ID
    'user-category': user_cat_dict,
    'item-category': item_cat_dict,
    'user-to-item-type': 'rated',
    'item-to-user-type': 'rated-by',
    'timestamp-edge-column': 'timestamp',
}
dataset = {**graph_and_tables, **evaluation_data, **item_content, **schema_names}

In [None]:
# Show where the training dataset is about to be written.
output_path

In [None]:
# Persist the assembled dataset as a pickle.
# Create the destination directory first so a fresh checkout without
# ./output does not fail with FileNotFoundError.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(output_path, 'wb') as f:
    pickle.dump(dataset, f)

print('Processing Completed!')

# k-fashion Inference dataset 생성

In [None]:
# Hand-written ratings of a single user, used to build the inference dataset.
ratings = pd.DataFrame(
    {
        "user": [590, 590, 590],
        "item": [10, 8413, 7000],
        "rate": [3, 4, 2],
    }
)
ratings

In [None]:
# # Filter the users and items that never appear in the rating table.
# distinct_users_in_ratings = ratings['user'].unique()
# distinct_items_in_ratings = ratings['item'].unique()
# users = users[users['userID'].isin(distinct_users_in_ratings)]
# items = items[items['item_id'].isin(distinct_items_in_ratings)]

In [None]:
# Rename the rating columns positionally to the names used by the rest of the notebook.
rating_column_names = ["userID", "item_id", "rating_per_user"]
ratings.columns = rating_column_names
ratings

In [None]:
# Build Graph
# Use only ratings whose item exists in the item table.
# Users are not filtered in this section.
item_intersect = set(ratings['item_id'].values) & set(items['item_id'].values)
new_items = items[items['item_id'].isin(item_intersect)]
item_is_known = ratings['item_id'].isin(item_intersect)
new_ratings = ratings.loc[item_is_known].sort_values('userID')

In [None]:
# Flag every inference rating with timestamp 1.
new_ratings = new_ratings.assign(timestamp=1)

In [None]:
# Display the inference ratings with their timestamp flag.
new_ratings

In [None]:
# Assign features.
# The inference dataset uses the ID-only node spec. The variant that also
# attaches the packed feature vectors is kept here for reference:
# node_dict = {
#     'user': [users, ['userID', 'user_feats'], ['cat', 'int']],
#     'item': [items, ['item_id', 'item_feats'], ['cat', 'int']]
# }
#
# node_dict: node type -> [table, feature columns, kind per column].
# edge_dict: edge type -> [table, feature columns].
edge_feature_columns = ['rating_per_user', 'timestamp']
node_dict = {
    'user': [users, ['userID'], ['cat']],
    'item': [items, ['item_id'], ['cat']],
}
edge_dict = {
    edge_type: [new_ratings, list(edge_feature_columns)]
    for edge_type in ('rated', 'rated-by')
}

In [None]:
# Build graph
# Register both node tables, then both directions of the rating relation.
graph_builder = PandasGraphBuilder()
for entity_table, primary_key, node_type in (
    (users, 'userID', 'user'),
    (items, 'item_id', 'item'),
):
    graph_builder.add_entities(entity_table, primary_key, node_type)
for source_key, destination_key, edge_type in (
    ('userID', 'item_id', 'rated'),
    ('item_id', 'userID', 'rated-by'),
):
    graph_builder.add_binary_relations(new_ratings, source_key, destination_key, edge_type)
g = graph_builder.build()

In [None]:
# Copy node features from the tables onto the graph.
# node_dict maps a node type ('user' or 'item') to
# [table, feature column names, feature kinds]; names and kinds are paired up.
for node_type, (node_table, feature_names, feature_kinds) in node_dict.items():
    for feature_name, feature_kind in zip(feature_names, feature_kinds):
        if feature_kind == 'int':
            # Each cell holds one vector; stack the cells into a single array
            # and store it as a float tensor.
            stacked = np.array(list(node_table[feature_name].values))
            g.nodes[node_type].data[feature_name] = torch.FloatTensor(stacked)
        elif feature_kind == 'cat':
            # Store the column as integer category codes.
            codes = node_table[feature_name].astype('category').cat.codes.values
            g.nodes[node_type].data[feature_name] = torch.LongTensor(codes)

# Copy edge features; edge_dict maps an edge type to [table, feature column names].
for edge_type, (edge_table, feature_names) in edge_dict.items():
    for feature_name in feature_names:
        # Cast straight to int64. The previous detour through float32 was
        # unnecessary and cannot represent integers above 2**24 exactly.
        edge_values = edge_table[feature_name].values.astype(np.int64)
        g.edges[edge_type].data[feature_name] = torch.LongTensor(edge_values)

In [None]:
# Dictionaries mapping category code -> actual ID.
# Codes come from converting the ID column to a pandas categorical.
user_cat = users['userID'].astype('category').cat.codes.values
item_cat = items['item_id'].astype('category').cat.codes.values

user_cat_dict = dict(zip(user_cat, users['userID'].values))
item_cat_dict = dict(zip(item_cat, items['item_id'].values))


In [None]:
# Label
# For every user, collect the set of item IDs whose rating row is flagged
# timestamp == 1.
val_dict = defaultdict(set)
for user_id, user_ratings in new_ratings.groupby('userID'):
    flagged_items = user_ratings.loc[user_ratings['timestamp'] == 1, 'item_id']
    val_dict[user_id] = set(flagged_items.values)

In [None]:
# Build title set
# Only `tpo` is exported as item text; `style` and `season` are disabled.
textual_feature = {"tpo": items["tpo"].values}

# Dump the graph and the datasets
# The payload is assembled in groups and merged in the original key order.
graph_and_tables = {
    'train-graph': g,
    'user-data': users,
    'item-data': items,
    'rating-data': new_ratings,
}
evaluation_data = {
    # No validation matrix; the test matrix is a 1x1 tensor holding 0.
    'val-matrix': None,
    'test-matrix': torch.LongTensor([[0]]),
    # userID -> set of item IDs flagged timestamp == 1.
    'testset': val_dict,
}
item_content = {
    'item-texts': textual_feature,
    'item-images': None,
}
schema_names = {
    'user-type': 'user',
    'item-type': 'item',
    # category code -> actual ID
    'user-category': user_cat_dict,
    'item-category': item_cat_dict,
    'user-to-item-type': 'rated',
    'item-to-user-type': 'rated-by',
    'timestamp-edge-column': 'timestamp',
}
dataset = {**graph_and_tables, **evaluation_data, **item_content, **schema_names}

In [None]:
# Persist the inference dataset as a pickle.
# Create the destination directory first so a fresh checkout without
# ./output does not fail with FileNotFoundError.
infer_output_path = "./output/infer.pkl"
infer_output_dir = os.path.dirname(infer_output_path)
if infer_output_dir:
    os.makedirs(infer_output_dir, exist_ok=True)

with open(infer_output_path, 'wb') as f:
    pickle.dump(dataset, f)

print('Processing Completed!')