In [1]:
# built-in imports
import os
import re
import argparse
import pickle
import sys

# third-party imports
import pandas as pd
import numpy as np
import scipy.sparse as ssp
import dgl
import torch
import torchtext

# local imports
sys.path.insert(0, '../src/pinsage')
from builder import PandasGraphBuilder
from data_utils import *

Using backend: pytorch


In [3]:
# Directory holding the raw Amazon Beauty data files (relative to this notebook).
directory = './amazon_beauty'
# Output file path; presumably where the processed dataset gets pickled — it is
# not referenced by any of the cells visible here, TODO confirm.
output_path = '../pinsage/processed_beauty.pkl'

In [6]:
# Read the Amazon Beauty ratings table. The path is built from the configured
# `directory` so the data location is defined in exactly one place.
amazon = pd.read_csv(os.path.join(directory, 'ratings_Beauty.csv'))
# Sanity check: number of (rows, columns), then the dtype of each column.
print(amazon.shape)
amazon.dtypes

(2023070, 4)


UserId        object
ProductId     object
Rating       float64
Timestamp      int64
dtype: object

In [18]:
# Check for rows whose rating is missing. 'Rating' is a float64 column, so a
# missing value is stored as NaN; comparing with `== None` is element-wise False
# for every row and can never find one. isna() is the test that actually
# detects missing ratings.
amazon[amazon['Rating'].isna()]

Unnamed: 0,UserId,ProductId,Rating,Timestamp


In [19]:
# Number of rating rows per user, most frequent user first.
amazon['UserId'].value_counts(sort=True, ascending=False)

A3KEZLJ59C1JVH    389
A281NPSIMI1C2R    336
A3M174IC0VXOS2    326
A2V5R832QCSOMX    278
A3LJLRIZL38GG3    276
                 ... 
AFLE9ZCCERY6L       1
A2U2AW7L2BU1S       1
A2KNQZY2DU4H8I      1
A6J6SJ1063P79       1
A3MQDRRGC9070R      1
Name: UserId, Length: 1210271, dtype: int64

In [30]:
# Number of rating rows per product, most frequent product first.
amazon['ProductId'].value_counts(sort=True, ascending=False)

B001MA0QY2    7533
B0009V1YR8    2869
B0043OYFKU    2477
B0000YUXI0    2143
B003V265QW    2088
              ... 
B004U81OBC       1
B004U7R0EI       1
B004U7Q2O2       1
B004U7NKRE       1
B00LU0LTOU       1
Name: ProductId, Length: 249274, dtype: int64

In [49]:
# Strip every column except the id of interest and cast what remains to the
# pandas 'category' dtype: one frame for user ids, one for product ids.
user_col = amazon.drop(columns=['ProductId', 'Rating', 'Timestamp']).astype('category')
item_col = amazon.drop(columns=['UserId', 'Rating', 'Timestamp']).astype('category')

In [74]:
# Entity tables: one row per distinct id, kept in a single named column.

# Users
distinct_users = user_col['UserId'].unique()
users = pd.DataFrame(data=distinct_users, columns=['UserId'])

# Items
distinct_items = item_col['ProductId'].unique()
items = pd.DataFrame(data=distinct_items, columns=['ProductId'])

In [77]:
# Build graph
# Describe a user-item graph with the project-local PandasGraphBuilder
# (imported from builder.py under ../src/pinsage).
graph_builder = PandasGraphBuilder()
# Register the de-duplicated users/items frames as the 'user' and 'item' entity
# types, keyed by their id columns.
graph_builder.add_entities(users, 'UserId', 'user')
graph_builder.add_entities(items, 'ProductId', 'item')
# Add one relation per direction over the full ratings table: a user->item
# 'purchased' relation and the reverse item->user 'purchased-by' relation.
# NOTE(review): the source table holds ratings; the relation names treat every
# rating as a purchase — confirm that is the intended interpretation.
graph_builder.add_binary_relations(amazon, 'UserId','ProductId','purchased')
graph_builder.add_binary_relations(amazon, 'ProductId', 'UserId', 'purchased-by')

In [79]:
# Materialize the graph from the entities and relations registered on the builder.
g = graph_builder.build()

In [80]:
# Display the graph summary: node and edge counts per type, plus the metagraph.
g

Graph(num_nodes={'item': 249274, 'user': 1210271},
      num_edges={('item', 'purchased-by', 'user'): 2023070, ('user', 'purchased', 'item'): 2023070},
      metagraph=[('item', 'user', 'purchased-by'), ('user', 'item', 'purchased')])