In [1]:
# built-in imports
import os
import re
import argparse
import pickle
import sys

# third-party imports
import pandas as pd
import numpy as np
import scipy.sparse as ssp
import dgl
import torch
import torchtext

# local imports
sys.path.insert(0, '../src/pinsage')
from builder import PandasGraphBuilder
from data_utils import *

Using backend: pytorch


In [3]:
# Directory that holds the raw Amazon Beauty data files
directory = './amazon_beauty'
# Destination of the processed dataset pickle written at the end of the notebook
output_path = '../pinsage/processed_beauty.pkl'

In [6]:
# Read the Amazon Beauty ratings CSV from the configured data directory
# (built from `directory` so the data location is defined in one place only).
amazon = pd.read_csv(os.path.join(directory, 'ratings_Beauty.csv'))
# Quick sanity check: row/column count, then the dtype of every column
print(amazon.shape)
amazon.dtypes

(2023070, 4)


UserId        object
ProductId     object
Rating       float64
Timestamp      int64
dtype: object

In [18]:
# Check whether any rows have a missing rating.
# `== None` is evaluated element-wise and is always False in pandas, so it can
# never detect missing values; `.isna()` is the correct null test (NaN/None).
amazon[amazon['Rating'].isna()]

Unnamed: 0,UserId,ProductId,Rating,Timestamp


In [19]:
# Number of ratings per user, most active users first
amazon.loc[:, 'UserId'].value_counts()

A3KEZLJ59C1JVH    389
A281NPSIMI1C2R    336
A3M174IC0VXOS2    326
A2V5R832QCSOMX    278
A3LJLRIZL38GG3    276
                 ... 
AFLE9ZCCERY6L       1
A2U2AW7L2BU1S       1
A2KNQZY2DU4H8I      1
A6J6SJ1063P79       1
A3MQDRRGC9070R      1
Name: UserId, Length: 1210271, dtype: int64

In [30]:
# Number of ratings per product, most rated products first
amazon.loc[:, 'ProductId'].value_counts()

B001MA0QY2    7533
B0009V1YR8    2869
B0043OYFKU    2477
B0000YUXI0    2143
B003V265QW    2088
              ... 
B004U81OBC       1
B004U7R0EI       1
B004U7Q2O2       1
B004U7NKRE       1
B00LU0LTOU       1
Name: ProductId, Length: 249274, dtype: int64

In [49]:
# Keep only the ID column of each side as a single-column categorical frame.
# The column is selected explicitly instead of dropping every other column, so
# the result stays a one-column frame even if `amazon` carries extra columns.
user_col = amazon[['UserId']].astype('category')
item_col = amazon[['ProductId']].astype('category')

In [74]:
# Collect the unique user IDs and product IDs ...
distinct_users, distinct_items = (
    frame[key].unique()
    for frame, key in ((user_col, 'UserId'), (item_col, 'ProductId'))
)
# ... and wrap each in a one-column entity table for the graph builder
users = pd.DataFrame(distinct_users, columns=['UserId'])
items = pd.DataFrame(distinct_items, columns=['ProductId'])

In [77]:
# Build graph: register the node tables first, then the relations between them.
# NOTE(review): the relations presumably resolve IDs against the entity tables
# registered above, so the call order matters — confirm in PandasGraphBuilder.
graph_builder = PandasGraphBuilder()
# Node types: one 'user' node per UserId, one 'item' node per ProductId
graph_builder.add_entities(users, 'UserId', 'user')
graph_builder.add_entities(items, 'ProductId', 'item')
# Edge types: one edge per row of `amazon` in each direction
# (user -> item as 'purchased', item -> user as 'purchased-by')
graph_builder.add_binary_relations(amazon, 'UserId','ProductId','purchased')
graph_builder.add_binary_relations(amazon, 'ProductId', 'UserId', 'purchased-by')

In [79]:
# Materialize the graph from the entities and relations registered on the builder
g = graph_builder.build()

In [80]:
# Display the graph summary (node counts, edge counts, metagraph)
g

Graph(num_nodes={'item': 249274, 'user': 1210271},
      num_edges={('item', 'purchased-by', 'user'): 2023070, ('user', 'purchased', 'item'): 2023070},
      metagraph=[('item', 'user', 'purchased-by'), ('user', 'item', 'purchased')])

In [82]:
# Attach the rating and timestamp of every interaction as edge features,
# on both edge directions.
# NOTE(review): assumes the edges are stored in the same order as the rows of
# `amazon` — confirm against PandasGraphBuilder.
for edge_type in ('purchased', 'purchased-by'):
    for feature, column in (('rating', 'Rating'), ('timestamp', 'Timestamp')):
        g.edges[edge_type].data[feature] = torch.LongTensor(amazon[column].values)

In [85]:
# Train, Test, Validation split by time.
# The helper returns three index arrays (their shapes are printed in the next
# cell). Judging by the sample it prints, rows are grouped by user and flagged
# through train_mask / val_mask / test_mask columns.
# NOTE(review): the helper may add those mask columns to `amazon` in place —
# confirm in data_utils before re-running earlier cells.
train_indices, val_indices, test_indices = train_test_split_by_time(amazon, 'Timestamp', 'UserId')

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  df = df.groupby(user, group_keys=False).apply(train_test_split).compute(scheduler='processes').sort_index()


                 UserId   ProductId  Rating   Timestamp  train_mask  val_mask  \
0        A39HTATAQ9V7YF  0205616461     5.0  1369699200        True     False   
899125   A39HTATAQ9V7YF  B002OVV7F0     3.0  1369699200        True     False   
969482   A39HTATAQ9V7YF  B0031IH5FQ     5.0  1369699200       False      True   
1499680  A39HTATAQ9V7YF  B006GQPZ8E     4.0  1369699200       False     False   

         test_mask  
0            False  
899125       False  
969482       False  
1499680       True  


In [86]:
# Report how many interactions landed in each split
print(f"train {train_indices.shape}")
print(f"val {val_indices.shape}")
print(f"test {test_indices.shape}")

train (1553205,)
val (146995,)
test (322870,)


In [87]:
# Build the training graph from the training edge indices only
train_g = build_train_graph(
    g, train_indices, 'user', 'item', 'purchased', 'purchased-by'
)
# Sanity check: the smallest 'purchased' out-degree must be positive, i.e. no
# source node is left without a training edge
assert 0 < train_g.out_degrees(etype='purchased').min()

In [88]:
# Display the training graph summary (same node sets as `g`, fewer edges)
train_g

Graph(num_nodes={'item': 249274, 'user': 1210271},
      num_edges={('item', 'purchased-by', 'user'): 1553205, ('user', 'purchased', 'item'): 1553205},
      metagraph=[('item', 'user', 'purchased-by'), ('user', 'item', 'purchased')])

In [91]:
# Build the validation and test interaction matrices from the held-out edge
# indices; as the reloaded output shows, each is a sparse users x items matrix.
val_matrix, test_matrix = build_val_test_matrix(g, val_indices, test_indices, 'user','item','purchased')

In [92]:
# Bundle the training graph, the evaluation matrices and the schema names
# (node types, edge types, timestamp column) into one dict for serialization.
# This dataset has no item text or image features, hence the None entries.
dataset = dict([
    ('train-graph', train_g),
    ('val-matrix', val_matrix),
    ('test-matrix', test_matrix),
    ('item-texts', None),
    ('item-images', None),
    ('user-type', 'user'),
    ('item-type', 'item'),
    ('user-to-item-type', 'purchased'),
    ('item-to-user-type', 'purchased-by'),
    ('timestamp-edge-column', 'timestamp'),
])

In [93]:
# Serialize the processed dataset to the configured output path
with open(output_path, 'wb') as out_file:
    pickle.dump(dataset, out_file)

In [94]:
# Round-trip check: reload the pickle that was just written.
# Read from `output_path` (the file the previous cell wrote) rather than a
# separate hard-coded name, which could silently load a stale file from the
# working directory.
# pickle.load can execute arbitrary code — only load files this notebook produced.
with open(output_path, 'rb') as fr:
    data = pickle.load(fr)
print(data)

{'train-graph': Graph(num_nodes={'item': 249274, 'user': 1210271},
      num_edges={('item', 'purchased-by', 'user'): 1553205, ('user', 'purchased', 'item'): 1553205},
      metagraph=[('item', 'user', 'purchased-by'), ('user', 'item', 'purchased')]), 'val-matrix': <1210271x249274 sparse matrix of type '<class 'numpy.int64'>'
	with 146995 stored elements in COOrdinate format>, 'test-matrix': <1210271x249274 sparse matrix of type '<class 'numpy.int64'>'
	with 322870 stored elements in COOrdinate format>, 'item-texts': None, 'item-images': None, 'user-type': 'user', 'item-type': 'item', 'user-to-item-type': 'purchased', 'item-to-user-type': 'purchased-by', 'timestamp-edge-column': 'timestamp'}
