In [1]:
import os
import re
import argparse
import pickle

import json 
import gzip

import pandas as pd
import numpy as np
import scipy.sparse as ssp

import dgl
import networkx as nx

import torch
import torchtext
from builder import PandasGraphBuilder

from data_utils import *

Using backend: pytorch


In [85]:
from tqdm import tqdm

In [2]:
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield json.loads(l)
        
def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [4]:
df1 = getDF('AMAZON_FASHION.json.gz')
df2 = getDF('meta_AMAZON_FASHION.json.gz')

In [5]:
print(df1.shape)
df1.head()

(883636, 12)


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"10 20, 2014",A1D4G1SNUZWQOT,7106116521,Tracy,Exactly what I needed.,perfect replacements!!,1413763200,,,
1,2.0,True,"09 28, 2014",A3DDWDH9PX2YX2,7106116521,Sonja Lau,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",1411862400,3.0,,
2,4.0,False,"08 25, 2014",A2MWC41EW7XL15,7106116521,Kathleen,Love these... I am going to order another pack...,My New 'Friends' !!,1408924800,,,
3,2.0,True,"08 24, 2014",A2UH2QQ275NV45,7106116521,Jodi Stoner,too tiny an opening,Two Stars,1408838400,,,
4,3.0,False,"07 27, 2014",A89F3LQADZBS5,7106116521,Alexander D.,Okay,Three Stars,1406419200,,,


In [70]:
df1.reviewerID.nunique()

749233

In [71]:
from sklearn.preprocessing import LabelEncoder

In [79]:
le = LabelEncoder()
df1["reviewerID"] = le.fit_transform(df1["reviewerID"])

In [80]:
df1.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"10 20, 2014",72476,7106116521,Tracy,Exactly what I needed.,perfect replacements!!,1413763200,,,
1,2.0,True,"09 28, 2014",469548,7106116521,Sonja Lau,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",1411862400,3.0,,
2,4.0,False,"08 25, 2014",323626,7106116521,Kathleen,Love these... I am going to order another pack...,My New 'Friends' !!,1408924800,,,
3,2.0,True,"08 24, 2014",365298,7106116521,Jodi Stoner,too tiny an opening,Two Stars,1408838400,,,
4,3.0,False,"07 27, 2014",597016,7106116521,Alexander D.,Okay,Three Stars,1406419200,,,


In [87]:
tqdm.pandas()
df1["reviewTextLength"] = df1["reviewText"].progress_apply(lambda x : len(str(x)))

100%|██████████| 883636/883636 [00:01<00:00, 878082.96it/s] 


In [89]:
df1["summaryLength"] = df1["summary"].progress_apply(lambda x : len(str(x)))

100%|██████████| 883636/883636 [00:01<00:00, 767333.57it/s]


In [96]:
def WordCount(x) : 
    try : 
        return len(x.split())
    except : 
        return 0 

In [97]:
df1["reviewTextCount"] = df1["reviewText"].progress_apply(lambda x : WordCount(x))
df1["summaryCount"] = df1["summary"].progress_apply(lambda x : WordCount(x))

100%|██████████| 883636/883636 [00:02<00:00, 390307.52it/s]
100%|██████████| 883636/883636 [00:01<00:00, 783767.62it/s]


In [131]:
user = df1.groupby('reviewerID').agg({
    'overall' : [('meanRating', np.mean)], 
    'reviewTextLength' : [('meanReviewLength' , np.mean)], 
    'summaryLength' : [('meanSummaryLength' , np.mean)], 
    'reviewTextCount' : [('meanReviewWord' , np.mean), ('ReviewCount' , 'count')], 
    'summaryCount' : [('meanSummaryWord' , np.mean)], 
}).reset_index()
user.columns = user.columns.get_level_values(level=1)

In [132]:
user.columns = ["user_ID", 'meanRating', 'meanReviewLength', 'meanSummaryLength', 'meanReviewWord', 'ReviewCount', 'meanSummaryWord']

In [134]:
user = user[["user_ID", 'meanRating', 'ReviewCount', 'meanReviewLength', 'meanSummaryLength', 'meanReviewWord', 'meanSummaryWord']]

In [135]:
'''
user_ID : user ID 
meanRating : 평균 평점 
ReviewCount : 리뷰 개수

meanReviewLength : 평균 리뷰 길이 
meanSummaryLength : 평균 Summary 길이 
meanReviewWord : 평균 리뷰 단어 개수 
meanSummaryWord : 평균 Summary 단어 개수 
'''

print(user.shape)
user.head()

(749233, 7)


Unnamed: 0,user_ID,meanRating,ReviewCount,meanReviewLength,meanSummaryLength,meanReviewWord,meanSummaryWord
0,0,5.0,1,95.0,16.0,21.0,3.0
1,1,1.0,1,52.0,14.0,10.0,3.0
2,2,1.0,1,151.0,38.0,27.0,7.0
3,3,3.0,1,1478.0,16.0,279.0,3.0
4,4,5.0,1,49.0,10.0,9.0,2.0


In [136]:
user.tail()

Unnamed: 0,user_ID,meanRating,ReviewCount,meanReviewLength,meanSummaryLength,meanReviewWord,meanSummaryWord
749228,749228,3.0,1,22.0,11.0,5.0,2.0
749229,749229,5.0,1,225.0,17.0,43.0,2.0
749230,749230,5.0,1,100.0,10.0,19.0,2.0
749231,749231,2.0,1,346.0,17.0,65.0,3.0
749232,749232,5.0,1,4.0,4.0,1.0,1.0


In [142]:
user["user_ID"] = user["user_ID"].astype('category')

In [144]:
with open('user.pickle', 'wb') as f:
    pickle.dump(user, f)

In [139]:
with open('user.pickle', 'rb') as f : 
    df = pickle.load(f)

In [121]:
user[user["ReviewCount"] > 1]

Unnamed: 0,reviewerID,ReviewCount,meanRating,meanReviewLength,meanSummaryLength,meanReviewWord,meanSummaryWord
5,5,4,5.000000,50.000000,10.25,10.000000,1.75
13,13,2,4.000000,35.000000,10.50,7.000000,2.00
15,15,2,5.000000,10.500000,8.00,2.500000,2.00
18,18,2,3.500000,205.000000,35.00,41.500000,7.50
20,20,2,3.500000,36.500000,10.50,7.500000,2.00
...,...,...,...,...,...,...,...
749200,749200,4,4.500000,126.750000,8.75,22.000000,1.50
749201,749201,2,4.500000,22.000000,23.00,5.500000,5.50
749207,749207,2,2.000000,36.500000,8.00,7.000000,1.50
749220,749220,2,4.500000,366.000000,32.50,66.500000,6.50


In [122]:
user[user["ReviewCount"] > 1].describe()

Unnamed: 0,reviewerID,ReviewCount,meanRating,meanReviewLength,meanSummaryLength,meanReviewWord,meanSummaryWord
count,93913.0,93913.0,93913.0,93913.0,93913.0,93913.0,93913.0
mean,375905.334224,2.431144,3.989804,158.187964,21.5053,30.336176,4.144899
std,215870.555366,1.093152,1.108401,205.312254,15.279378,38.898708,3.010474
min,5.0,2.0,1.0,1.0,1.0,0.0,0.0
25%,188975.0,2.0,3.333333,45.0,10.0,8.5,2.0
50%,376539.0,2.0,4.333333,99.5,16.0,19.0,3.0
75%,562976.0,2.0,5.0,192.0,28.5,37.0,5.5
max,749224.0,40.0,5.0,6688.0,172.0,1289.0,36.0


### user 
1. userID (reviewerID) -> 1부터 ~ 
2. reviewText (평균) 길이 
3. summary (평균) 길이 
4. overall (평균) 점수 

### item 
1. brand (18000개, 1개 이상 10000개) / etc -> 기각 
2. Feature : dimension / weight 
3. rank : 전처리 해서 가져오기 -> 카테고리화 
4. price : NaN 많음 
5. 1/0 : imageURL, description 

### rating 
1. movielens 랑 똑같이 

In [5]:
print(df2.shape)
df2.head()

(186637, 16)


Unnamed: 0,title,brand,feature,rank,date,asin,imageURL,imageURLHighRes,description,price,also_view,also_buy,fit,details,similar_item,tech1
0,Slime Time Fall Fest [With CDROM and Collector...,Group Publishing (CO),[Product Dimensions:\n \n8....,"13,052,976inClothing,Shoesamp;Jewelry(",8.70 inches,764443682,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
1,XCC Qi promise new spider snake preparing men'...,,,"11,654,581inClothing,Shoesamp;Jewelry(",5 star,1291691480,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
2,Magical Things I Really Do Do Too!,Christopher Manos,[Package Dimensions:\n \n8....,"19,308,073inClothing,ShoesJewelry(",5 star,1940280001,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,[For the professional or amateur magician. Ro...,,,,,,,
3,"Ashes to Ashes, Oranges to Oranges",Flickerlamp Publishing,[Package Dimensions:\n \n8....,"19,734,184inClothing,ShoesJewelry(",5 star,1940735033,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
4,Aether & Empire #1 - 2016 First Printing Comic...,,[Package Dimensions:\n \n10...,"10,558,646inClothing,Shoesamp;Jewelry(",5 star,1940967805,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,$4.50,,,,,,


In [50]:
df2.iloc[3]["title"]

'Ashes to Ashes, Oranges to Oranges'

In [34]:
df2.iloc[0]["date"]

'8.70 inches'

In [51]:
df2.iloc[11]["similar_item"]

nan

In [35]:
df2.iloc[111]["date"]

'5 star5 star (0%)'

In [36]:
df2.iloc[10]["date"]

'fishhook'

In [46]:
df2.iloc[1]["price"]

nan

In [9]:
!python process_movielens1m.py ./ml-1m ./data.pkl

Using backend: pytorch
  g.nodes['user'].data['gender'] = torch.LongTensor(users['gender'].cat.codes.values)
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  df = df.groupby(user, group_keys=False).apply(train_test_split).compute(scheduler='processes').sort_index()
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
    user_id  movie_id  rating  timestamp  train_mask  val_mask  test_mask
31        1      3186       4  978300019        True     False      False
27        1      1721       4  978300055        True     False      False
37        1      1022       5  978300055        True     False      False
22        1      1270       5  978300055        True     False      False
24        1      2340       3  978300103        True     Fals

In [10]:
with open('data.pkl', 'rb') as f:
    movielens = pickle.load(f)

In [11]:
movielens

{'train-graph': Graph(num_nodes={'movie': 3706, 'user': 6040},
       num_edges={('movie', 'watched-by', 'user'): 988129, ('user', 'watched', 'movie'): 988129},
       metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')]),
 'val-matrix': <6040x3706 sparse matrix of type '<class 'numpy.int64'>'
 	with 6040 stored elements in COOrdinate format>,
 'test-matrix': <6040x3706 sparse matrix of type '<class 'numpy.int64'>'
 	with 6040 stored elements in COOrdinate format>,
 'item-texts': {'title': array(['Toy Story', 'Jumanji', 'Grumpier Old Men', ..., 'Tigerland',
         'Two Family House', 'Contender, The'], dtype=object)},
 'item-images': None,
 'user-type': 'user',
 'item-type': 'movie',
 'user-to-item-type': 'watched',
 'item-to-user-type': 'watched-by',
 'timestamp-edge-column': 'timestamp'}

In [102]:
movielens['item-texts']['title'][:50]

array(['Toy Story', 'Jumanji', 'Grumpier Old Men', 'Waiting to Exhale',
       'Father of the Bride Part II', 'Heat', 'Sabrina', 'Tom and Huck',
       'Sudden Death', 'GoldenEye', 'American President, The',
       'Dracula: Dead and Loving It', 'Balto', 'Nixon',
       'Cutthroat Island', 'Casino', 'Sense and Sensibility',
       'Four Rooms', 'Ace Ventura: When Nature Calls', 'Money Train',
       'Get Shorty', 'Copycat', 'Assassins', 'Powder',
       'Leaving Las Vegas', 'Othello', 'Now and Then', 'Persuasion',
       'City of Lost Children, The',
       'Shanghai Triad (Yao a yao yao dao waipo qiao)', 'Dangerous Minds',
       'Twelve Monkeys', 'Wings of Courage', 'Babe', 'Carrington',
       'Dead Man Walking', 'Across the Sea of Time', 'It Takes Two',
       'Clueless', 'Cry, the Beloved Country', 'Richard III',
       'Dead Presidents', 'Restoration', 'Mortal Kombat', 'To Die For',
       'How to Make an American Quilt', 'Seven (Se7en)', 'Pocahontas',
       'When Night Is Falli

# dgl.DGLHeteroGraph 
* https://docs.dgl.ai/en/0.4.x/api/python/heterograph.html 
* https://docs.dgl.ai/en/0.4.x/tutorials/basics/5_hetero.html

## train-graph
* `(srctype, edgetype, dsttype)`

In [98]:
print(type(movielens["train-graph"]))

<class 'dgl.heterograph.DGLHeteroGraph'>


In [12]:
movielens["train-graph"]

Graph(num_nodes={'movie': 3706, 'user': 6040},
      num_edges={('movie', 'watched-by', 'user'): 988129, ('user', 'watched', 'movie'): 988129},
      metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')])

In [33]:
movielens["train-graph"].number_of_nodes()

9746

In [34]:
movielens["train-graph"].number_of_edges()

1976258

In [56]:
movielens["train-graph"].nodes('user')

tensor([   0,    1,    2,  ..., 6037, 6038, 6039])

In [57]:
movielens["train-graph"].nodes('movie')

tensor([   0,    1,    2,  ..., 3703, 3704, 3705])

In [65]:
print("node type : ", movielens["train-graph"].ntypes)
print("edge type : ", movielens["train-graph"].etypes)

node type :  ['movie', 'user']
edge type :  ['watched-by', 'watched']


In [66]:
print("#Users : ", movielens["train-graph"].number_of_nodes('user'))
print("#Movies : ", movielens["train-graph"].number_of_nodes('movie'))

#Users :  6040
#Movies :  3706


In [67]:
# User 1 이 본 영화 
print(movielens["train-graph"].successors(1, etype='watched'))

tensor([1259, 2853, 1414,  626, 2013,  627, 2078, 2426, 1031, 2708, 3235, 1120,
        1623, 1550, 1123, 3341, 2674, 2816, 1127, 2889,  420, 1945, 2891, 2892,
        2821, 1161, 1478,  283, 2046, 2856,  841,  358, 1167, 2931, 1201, 1099,
        3031, 3032, 3033,  106, 2086, 2296, 1656, 3238,  575, 1553, 3412, 2645,
        1765,  920, 1693, 2651, 1135, 1848,  501,  428, 2120,  258, 1306, 1012,
        3457,  466, 1337, 1886, 1173, 1271, 1104, 1631, 1273, 2160, 3107, 2234,
         576, 1106, 1428,  702, 1108,  579, 2166,   92, 2512, 2374, 2203, 3493,
        1773, 1466, 3566, 1774, 1775,  228, 1047, 1777,  157,   20,  159, 2128,
        1018,  370, 2307,  339,  443, 1024,  859,  445, 1406, 3186, 1286, 3219,
        2879,  737,  484, 2523, 1822, 1782, 1618, 2735, 1826,  309, 1117, 1788,
        3436, 1152,  346, 1153, 1154, 3647, 1155])


In [69]:
# 1259 를 본 사람들 
print(movielens["train-graph"].successors(1259, etype='watched-by'))

tensor([   1,    7,    9,   17,   32,   52,   58,   68,  110,  130,  145,  150,
         174,  186,  191,  194,  197,  200,  207,  211,  215,  224,  244,  258,
         280,  286,  292,  318,  330,  332,  338,  346,  356,  377,  391,  411,
         414,  429,  481,  492,  508,  517,  523,  527,  540,  548,  549,  550,
         557,  586,  587,  589,  592,  593,  624,  653,  654,  655,  659,  672,
         675,  710,  720,  723,  727,  730,  751,  753,  801,  816,  821,  823,
         838,  853,  888,  890,  896,  898,  923,  927,  941,  942,  944,  954,
         962,  964,  969,  972,  974,  976, 1000, 1009, 1014, 1049, 1050, 1058,
        1068, 1087, 1095, 1098, 1102, 1111, 1116, 1118, 1119, 1120, 1124, 1127,
        1149, 1157, 1163, 1169, 1193, 1202, 1210, 1217, 1226, 1240, 1241, 1243,
        1245, 1251, 1254, 1263, 1265, 1270, 1272, 1278, 1283, 1287, 1295, 1297,
        1300, 1314, 1330, 1339, 1350, 1353, 1374, 1382, 1386, 1390, 1395, 1419,
        1421, 1439, 1446, 1448, 1454, 14

In [86]:
# Draw the metagraph using graphviz.
import pygraphviz as pgv
def plot_graph(nxg):
    ag = pgv.AGraph(strict=False, directed=True)
    for u, v, k in nxg.edges(keys=True):
        ag.add_edge(u, v, label=k)
    ag.layout('dot')
    ag.draw('graph.png')

In [87]:
plot_graph(movielens["train-graph"].metagraph)

AttributeError: 'function' object has no attribute 'edges'

In [94]:
movielens["train-graph"].metagraph

<bound method DGLHeteroGraph.metagraph of Graph(num_nodes={'movie': 3706, 'user': 6040},
      num_edges={('movie', 'watched-by', 'user'): 988129, ('user', 'watched', 'movie'): 988129},
      metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')])>

## val-matrix

In [51]:
print(movielens['val-matrix'].tocsr())

  (0, 1439)	1
  (1, 1420)	1
  (2, 101)	1
  (3, 2743)	1
  (4, 1371)	1
  (5, 1441)	1
  (6, 1575)	1
  (7, 704)	1
  (8, 2102)	1
  (9, 2489)	1
  (10, 1895)	1
  (11, 1160)	1
  (12, 1873)	1
  (13, 2526)	1
  (14, 3512)	1
  (15, 2496)	1
  (16, 1470)	1
  (17, 259)	1
  (18, 309)	1
  (19, 1148)	1
  (20, 582)	1
  (21, 847)	1
  (22, 1938)	1
  (23, 1779)	1
  (24, 3503)	1
  :	:
  (6015, 3520)	1
  (6016, 2699)	1
  (6017, 532)	1
  (6018, 784)	1
  (6019, 2708)	1
  (6020, 1696)	1
  (6021, 466)	1
  (6022, 724)	1
  (6023, 1927)	1
  (6024, 525)	1
  (6025, 513)	1
  (6026, 1398)	1
  (6027, 2773)	1
  (6028, 2624)	1
  (6029, 651)	1
  (6030, 354)	1
  (6031, 1906)	1
  (6032, 283)	1
  (6033, 881)	1
  (6034, 1583)	1
  (6035, 2602)	1
  (6036, 907)	1
  (6037, 2495)	1
  (6038, 861)	1
  (6039, 155)	1


In [53]:
movielens['val-matrix'].tocsr().shape

(6040, 3706)

In [62]:
print(type(movielens['val-matrix']))
print('#Users:', movielens['val-matrix'].shape[0])
print('#Movies:', movielens['val-matrix'].shape[1])
print('#Links:', movielens['val-matrix'].nnz)

<class 'scipy.sparse.coo.coo_matrix'>
#Users: 6040
#Movies: 3706
#Links: 6040


In [63]:
print(type(movielens['test-matrix']))
print('#Users:', movielens['test-matrix'].shape[0])
print('#Movies:', movielens['test-matrix'].shape[1])
print('#Links:', movielens['test-matrix'].nnz)

<class 'scipy.sparse.coo.coo_matrix'>
#Users: 6040
#Movies: 3706
#Links: 6040


In [103]:
!pip install --upgrade dgl==0.5.2

Collecting dgl==0.5.2
  Downloading dgl-0.5.2-cp38-cp38-macosx_10_9_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 745 kB/s eta 0:00:01
Installing collected packages: dgl
  Attempting uninstall: dgl
    Found existing installation: dgl 0.6.1
    Uninstalling dgl-0.6.1:
      Successfully uninstalled dgl-0.6.1
Successfully installed dgl-0.5.2


In [105]:
# Nearest neighbor recommendation 
!python model.py data.pkl --num-epochs 1 --num-workers 0 --hidden-dims 64

Using backend: pytorch
100%|█████████████████████████████████████| 20000/20000 [12:10<00:00, 27.38it/s]
0.05662251655629139
