# Neural Graph Collaborative Filtering


In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import scipy.sparse as sp

In [2]:
# List the files available in the ml-latest-small data directory.
os.listdir('./data/ml-latest-small')

['links.csv', 'movies.csv', 'ratings.csv', 'README.txt', 'tags.csv']

In [3]:
# d1: movie metadata, d2: raw ratings. Preview the first three rows of each.
d1 = pd.read_csv('./data/ml-latest-small/movies.csv')
d2 = pd.read_csv('./data/ml-latest-small/ratings.csv')

display(d1.head(3), d2.head(3))

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [4]:
# Sanity-check the (rows, columns) dimensions of the two raw tables.
print(d1.shape, d2.shape)

(9742, 3) (100836, 4)


In [5]:
# Re-read the ratings file, replacing the CSV header (header=0) with the
# column names used throughout the rest of the notebook.
data = pd.read_csv(
    './data/ml-latest-small/ratings.csv',
    header=0,
    names=['user', 'item', 'ratings', 'timestamps'],
    dtype={'user': int, 'item': int, 'ratings': float, 'timestamps': float},
    engine='python',
)

data

Unnamed: 0,user,item,ratings,timestamps
0,1,1,4.0,9.649827e+08
1,1,3,4.0,9.649812e+08
2,1,6,4.0,9.649822e+08
3,1,47,5.0,9.649838e+08
4,1,50,5.0,9.649829e+08
...,...,...,...,...
100831,610,166534,4.0,1.493848e+09
100832,610,168248,5.0,1.493850e+09
100833,610,168250,5.0,1.494273e+09
100834,610,168252,5.0,1.493846e+09


In [6]:
# Number of distinct users and distinct items present in the ratings table.
num_users = len(data['user'].unique())
num_items = len(data['item'].unique())
print(num_users, num_items)

610 9724


In [7]:
# Keep the explicit ratings in 'org_ratings' and turn 'ratings' into an
# implicit-feedback indicator (every observed interaction becomes 1.0).
# The backup is only taken once: without this guard, re-running the cell would
# copy the already-binarized column and destroy the original ratings.
if 'org_ratings' not in data.columns:
    data['org_ratings'] = data['ratings']
data['ratings'] = 1.0

In [8]:
# Count the interactions of each user; the result is indexed by user id and
# has a single 'size' column.
num_items_by_user = (
    data
    .groupby('user', as_index=False)
    .size()
    .set_index('user')
)
num_items_by_user

Unnamed: 0_level_0,size
user,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [9]:
# Count the interactions of each item; the result is indexed by item id and
# has a single 'size' column.
num_users_by_item = (
    data
    .groupby('item', as_index=False)
    .size()
    .set_index('item')
)
num_users_by_item

Unnamed: 0_level_0,size
item,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


In [10]:
# Work on a copy: a plain assignment would alias num_items_by_user, so renaming
# the column (and any later column additions) would silently mutate the frame
# displayed earlier and make this cell fail when re-run.
user_frame = num_items_by_user.copy()
user_frame.columns = ['item_cnt']
user_frame

Unnamed: 0_level_0,item_cnt
user,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [11]:
# When True, users and items are sorted by interaction count (descending)
# before their new ids are assigned.
order_by_popularity = True

In [12]:
# Optionally rank users from most to least interactions, then number the rows
# 0..num_users-1 in that order; 'new_id' becomes the dense user id.
if order_by_popularity:
    user_frame = user_frame.sort_values('item_cnt', ascending=False)

user_frame['new_id'] = range(num_users)
user_frame

Unnamed: 0_level_0,item_cnt,new_id
user,Unnamed: 1_level_1,Unnamed: 2_level_1
414,2698,0
599,2478,1
474,2108,2
448,1864,3
274,1346,4
...,...,...
442,20,605
569,20,606
320,20,607
576,20,608


In [13]:
# Convert the user frame to nested dicts: {column name -> {original user id -> value}}.
frame_dict = user_frame.to_dict()
frame_dict

{'item_cnt': {414: 2698,
  599: 2478,
  474: 2108,
  448: 1864,
  274: 1346,
  610: 1302,
  68: 1260,
  380: 1218,
  606: 1115,
  288: 1055,
  249: 1046,
  387: 1027,
  182: 977,
  307: 975,
  603: 943,
  298: 939,
  177: 904,
  318: 879,
  232: 862,
  480: 836,
  608: 831,
  600: 763,
  483: 728,
  590: 728,
  105: 722,
  19: 703,
  305: 677,
  489: 648,
  111: 646,
  438: 635,
  217: 613,
  140: 608,
  477: 600,
  555: 578,
  91: 575,
  28: 570,
  219: 528,
  534: 520,
  89: 518,
  64: 517,
  226: 507,
  561: 505,
  18: 502,
  525: 500,
  57: 476,
  381: 474,
  368: 469,
  509: 467,
  469: 465,
  560: 458,
  462: 455,
  292: 446,
  21: 443,
  597: 443,
  42: 440,
  294: 437,
  160: 437,
  580: 436,
  596: 411,
  202: 403,
  275: 403,
  517: 400,
  45: 399,
  156: 398,
  514: 397,
  391: 386,
  567: 385,
  357: 383,
  103: 377,
  339: 371,
  62: 366,
  199: 363,
  125: 360,
  51: 359,
  132: 347,
  66: 345,
  313: 340,
  200: 334,
  221: 331,
  6: 314,
  453: 311,
  50: 310,
  425: 30

In [14]:
# Mapping from original user id to the dense id stored in the 'new_id' column.
user_id_dict = frame_dict['new_id']
user_id_dict

{414: 0,
 599: 1,
 474: 2,
 448: 3,
 274: 4,
 610: 5,
 68: 6,
 380: 7,
 606: 8,
 288: 9,
 249: 10,
 387: 11,
 182: 12,
 307: 13,
 603: 14,
 298: 15,
 177: 16,
 318: 17,
 232: 18,
 480: 19,
 608: 20,
 600: 21,
 483: 22,
 590: 23,
 105: 24,
 19: 25,
 305: 26,
 489: 27,
 111: 28,
 438: 29,
 217: 30,
 140: 31,
 477: 32,
 555: 33,
 91: 34,
 28: 35,
 219: 36,
 534: 37,
 89: 38,
 64: 39,
 226: 40,
 561: 41,
 18: 42,
 525: 43,
 57: 44,
 381: 45,
 368: 46,
 509: 47,
 469: 48,
 560: 49,
 462: 50,
 292: 51,
 21: 52,
 597: 53,
 42: 54,
 294: 55,
 160: 56,
 580: 57,
 596: 58,
 202: 59,
 275: 60,
 517: 61,
 45: 62,
 156: 63,
 514: 64,
 391: 65,
 567: 66,
 357: 67,
 103: 68,
 339: 69,
 62: 70,
 199: 71,
 125: 72,
 51: 73,
 132: 74,
 66: 75,
 313: 76,
 200: 77,
 221: 78,
 6: 79,
 453: 80,
 50: 81,
 425: 82,
 428: 83,
 573: 84,
 352: 85,
 84: 86,
 122: 87,
 382: 88,
 356: 89,
 239: 90,
 135: 91,
 365: 92,
 484: 93,
 104: 94,
 63: 95,
 325: 96,
 169: 97,
 332: 98,
 290: 99,
 495: 100,
 432: 101,
 187: 1

In [15]:
# Re-index the user frame by the dense id; the previous index (original user
# id) is replaced, leaving only the 'item_cnt' column.
user_frame = user_frame.set_index('new_id')
user_frame

Unnamed: 0_level_0,item_cnt
new_id,Unnamed: 1_level_1
0,2698
1,2478
2,2108
3,1864
4,1346
...,...
605,20
606,20
607,20
608,20


In [16]:
# Popularity lookup for users: dense user id -> number of interactions.
user_to_num_items = user_frame['item_cnt'].to_dict()
user_to_num_items

{0: 2698,
 1: 2478,
 2: 2108,
 3: 1864,
 4: 1346,
 5: 1302,
 6: 1260,
 7: 1218,
 8: 1115,
 9: 1055,
 10: 1046,
 11: 1027,
 12: 977,
 13: 975,
 14: 943,
 15: 939,
 16: 904,
 17: 879,
 18: 862,
 19: 836,
 20: 831,
 21: 763,
 22: 728,
 23: 728,
 24: 722,
 25: 703,
 26: 677,
 27: 648,
 28: 646,
 29: 635,
 30: 613,
 31: 608,
 32: 600,
 33: 578,
 34: 575,
 35: 570,
 36: 528,
 37: 520,
 38: 518,
 39: 517,
 40: 507,
 41: 505,
 42: 502,
 43: 500,
 44: 476,
 45: 474,
 46: 469,
 47: 467,
 48: 465,
 49: 458,
 50: 455,
 51: 446,
 52: 443,
 53: 443,
 54: 440,
 55: 437,
 56: 437,
 57: 436,
 58: 411,
 59: 403,
 60: 403,
 61: 400,
 62: 399,
 63: 398,
 64: 397,
 65: 386,
 66: 385,
 67: 383,
 68: 377,
 69: 371,
 70: 366,
 71: 363,
 72: 360,
 73: 359,
 74: 347,
 75: 345,
 76: 340,
 77: 334,
 78: 331,
 79: 314,
 80: 311,
 81: 310,
 82: 306,
 83: 300,
 84: 299,
 85: 294,
 86: 293,
 87: 292,
 88: 291,
 89: 289,
 90: 279,
 91: 279,
 92: 277,
 93: 275,
 94: 273,
 95: 271,
 96: 270,
 97: 269,
 98: 267,
 99: 267

In [17]:
# Replace every original user id in the ratings table with its dense id.
# An id missing from user_id_dict raises KeyError.
data['user'] = [user_id_dict[uid] for uid in data['user'].tolist()]
data

Unnamed: 0,user,item,ratings,timestamps,org_ratings
0,112,1,1.0,9.649827e+08,4.0
1,112,3,1.0,9.649812e+08,4.0
2,112,6,1.0,9.649822e+08,4.0
3,112,47,1.0,9.649838e+08,5.0
4,112,50,1.0,9.649829e+08,5.0
...,...,...,...,...,...
100831,5,166534,1.0,1.493848e+09,4.0
100832,5,168248,1.0,1.493850e+09,5.0
100833,5,168250,1.0,1.494273e+09,5.0
100834,5,168252,1.0,1.493846e+09,5.0


In [18]:
# Work on a copy: a plain assignment would alias num_users_by_item, so the
# in-place column rename that follows would silently mutate the frame
# displayed earlier.
item_frame = num_users_by_item.copy()
item_frame

Unnamed: 0_level_0,size
item,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


In [19]:
# Name the count column, optionally rank items from most to least
# interactions, then number the rows 0..num_items-1 in that order.
item_frame.columns = ['user_cnt']

if order_by_popularity:
    item_frame = item_frame.sort_values('user_cnt', ascending=False)

item_frame['new_id'] = list(range(num_items))
item_frame

Unnamed: 0_level_0,user_cnt,new_id
item,Unnamed: 1_level_1,Unnamed: 2_level_1
356,329,0
318,317,1
296,307,2
593,279,3
2571,278,4
...,...,...
4093,1,9719
4089,1,9720
58351,1,9721
4083,1,9722


In [20]:
# Convert the item frame to nested dicts: {column name -> {original item id -> value}}.
# NOTE: this rebinds frame_dict, which held the user-frame dicts before this cell.
frame_dict = item_frame.to_dict()
frame_dict

{'user_cnt': {356: 329,
  318: 317,
  296: 307,
  593: 279,
  2571: 278,
  260: 251,
  480: 238,
  110: 237,
  589: 224,
  527: 220,
  2959: 218,
  1: 215,
  1196: 211,
  50: 204,
  2858: 204,
  47: 203,
  780: 202,
  150: 201,
  1198: 200,
  4993: 198,
  1210: 196,
  858: 192,
  457: 190,
  592: 189,
  5952: 188,
  2028: 188,
  7153: 185,
  588: 183,
  608: 181,
  2762: 179,
  380: 178,
  32: 177,
  364: 172,
  1270: 171,
  377: 171,
  3578: 170,
  4306: 170,
  1580: 165,
  590: 164,
  648: 162,
  344: 161,
  4226: 159,
  367: 157,
  58559: 149,
  6539: 149,
  1214: 146,
  595: 146,
  1036: 145,
  165: 144,
  500: 144,
  1265: 143,
  79132: 143,
  1197: 142,
  6377: 141,
  1704: 141,
  316: 140,
  2628: 140,
  1291: 140,
  1721: 140,
  153: 137,
  1136: 136,
  597: 135,
  293: 133,
  1193: 133,
  3793: 133,
  231: 133,
  4886: 132,
  10: 132,
  1089: 131,
  7361: 131,
  6874: 131,
  1240: 131,
  1221: 129,
  2329: 129,
  34: 128,
  1200: 126,
  1213: 126,
  1682: 125,
  8961: 125,
  5

In [21]:
# Mapping from original item id to the dense id stored in the 'new_id' column.
item_id_dict = frame_dict['new_id']
# Re-index the item frame by the dense id; only 'user_cnt' remains as a column.
item_frame = item_frame.set_index('new_id')
item_frame

Unnamed: 0_level_0,user_cnt
new_id,Unnamed: 1_level_1
0,329
1,317
2,307
3,279
4,278
...,...
9719,1
9720,1
9721,1
9722,1


In [23]:
# Popularity lookup for items: dense item id -> number of interactions.
item_to_num_users = item_frame['user_cnt'].to_dict()
# Replace every original item id in the ratings table with its dense id.
# An id missing from item_id_dict raises KeyError.
data['item'] = [item_id_dict[iid] for iid in data['item'].tolist()]
data

Unnamed: 0,user,item,ratings,timestamps,org_ratings
0,112,11,1.0,9.649827e+08,4.0
1,112,420,1.0,9.649812e+08,4.0
2,112,127,1.0,9.649822e+08,4.0
3,112,15,1.0,9.649838e+08,5.0
4,112,13,1.0,9.649829e+08,5.0
...,...,...,...,...,...
100831,5,3155,1.0,1.493848e+09,4.0
100832,5,2920,1.0,1.493850e+09,5.0
100833,5,1623,1.0,1.494273e+09,5.0
100834,5,1028,1.0,1.493846e+09,5.0


In [24]:
# Re-derive the counts from the id mappings (one entry per user / per item).
num_users = len(user_id_dict)
num_items = len(item_id_dict)
print(num_users, num_items)

610 9724


In [25]:
# Total number of interactions = number of rows in the ratings table.
num_ratings = data.shape[0]
num_ratings

100836

--------

In [40]:
# Split data into train/test: first group the interactions by (dense) user id
# so that the split can be made per user.

data_group = data.groupby('user')
data_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D9F501AE80>

In [54]:

# Per-user hold-out split: for every user, int(train_ratio * n) interactions go
# to the training set and the remaining ones to the test set.
train_ratio = 0.8
split_seed = 42  # fixed seed so that Restart & Run All reproduces the same split
rng = np.random.RandomState(split_seed)

train_list, test_list = [], []
# Counters for users that end up with no rows on one side of the split.
num_zero_train, num_zero_test = 0, 0

for _, group in data_group:
    num_items_user = len(group)
    num_train = int(train_ratio * num_items_user)
    num_test = num_items_user - num_train

    # NOTE: rows are ordered by timestamp, but the held-out rows are sampled
    # uniformly at random, so this is a random split, not a chronological one.
    group = group.sort_values(by='timestamps')
    idx = np.ones(num_items_user, dtype='bool')  # True -> train, False -> test
    test_idx = rng.choice(num_items_user, num_test, replace=False)
    idx[test_idx] = False

    train_part = group[idx]
    test_part = group[~idx]

    if len(train_part) == 0:
        num_zero_train += 1
    else:
        train_list.append(train_part)

    if len(test_part) == 0:
        num_zero_test += 1
    else:
        test_list.append(test_part)

train_df = pd.concat(train_list)
test_df = pd.concat(test_list)

print(num_zero_train, num_zero_test)

0 0


In [55]:
# Inspect the training split.
train_df

Unnamed: 0,user,item,ratings,timestamps,org_ratings
63362,0,327,1.0,9.614362e+08,3.0
62618,0,1604,1.0,9.614362e+08,1.0
62832,0,999,1.0,9.614362e+08,4.0
63270,0,3509,1.0,9.614362e+08,4.0
62731,0,20,1.0,9.614362e+08,5.0
...,...,...,...,...,...
7918,609,1000,1.0,1.237748e+09,5.0
7915,609,1322,1.0,1.237748e+09,5.0
7912,609,1351,1.0,1.237748e+09,5.0
7920,609,1234,1.0,1.237748e+09,5.0


In [56]:
# Inspect the test split.
test_df

Unnamed: 0,user,item,ratings,timestamps,org_ratings
63049,0,3486,1.0,9.614365e+08,2.0
63174,0,402,1.0,9.614365e+08,4.0
63290,0,139,1.0,9.614365e+08,5.0
63206,0,1162,1.0,9.614365e+08,3.0
63211,0,906,1.0,9.614367e+08,2.0
...,...,...,...,...,...
89049,608,669,1.0,1.358152e+09,5.0
7916,609,1161,1.0,1.237748e+09,5.0
7919,609,966,1.0,1.237748e+09,5.0
7929,609,1146,1.0,1.237748e+09,5.0


In [63]:
def df_to_sparse(df, shape):
    """Build user x item CSR matrices from an interaction frame.

    Parameters
    ----------
    df : DataFrame with columns 'user', 'item', 'ratings' and 'org_ratings'.
        'user' / 'item' are used as row / column indices.
    shape : tuple (n_rows, n_cols) of the matrices before empty rows are dropped.

    Returns
    -------
    (sp_data, sp_data2) : two float64 CSR matrices holding 'ratings' and
        'org_ratings' respectively. Rows without any stored entry in the
        'ratings' matrix are removed from BOTH matrices, so the two results
        always have the same shape and stay row-aligned.
    """
    rows, cols = df.user, df.item
    values = df.ratings
    org_values = df.org_ratings

    sp_data = sp.csr_matrix((values, (rows, cols)), dtype='float64', shape=shape)
    sp_data2 = sp.csr_matrix((org_values, (rows, cols)), dtype='float64', shape=shape)

    # Differences of consecutive indptr entries = stored entries per row.
    num_nonzeros = np.diff(sp_data.indptr)
    rows_to_keep = num_nonzeros != 0
    num_dropped = int(np.sum(~rows_to_keep))

    if num_dropped > 0:
        print(f"empty users are dropped from matrix : {num_dropped}")
        sp_data = sp_data[rows_to_keep]
        sp_data2 = sp_data2[rows_to_keep]

    return sp_data, sp_data2
    

In [64]:
# Both splits are materialised with the same (num_users, num_items) shape.
matrix_shape = (num_users, num_items)
train_sparse, org_train_sparse = df_to_sparse(train_df, shape=matrix_shape)
test_sparse, org_test_sparse = df_to_sparse(test_df, shape=matrix_shape)

In [66]:
# Show shape / stored-element summaries for the four matrices, train pair first.
display(train_sparse, org_train_sparse)
display(test_sparse, org_test_sparse)

<610x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 80419 stored elements in Compressed Sparse Row format>

<610x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 80419 stored elements in Compressed Sparse Row format>

<610x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 20417 stored elements in Compressed Sparse Row format>

<610x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 20417 stored elements in Compressed Sparse Row format>

In [68]:
    # Save data and statistics
# Everything that gets persisted, keyed by the name used for its pickle file.
data_to_save = dict(
    # interaction matrices (binarized and original ratings)
    train_mat=train_sparse,
    test_mat=test_sparse,
    org_train_mat=org_train_sparse,
    org_test_mat=org_test_sparse,
    # id mappings and popularity lookups
    user_id_dict=user_id_dict,
    user_popularity=user_to_num_items,
    item_id_dict=item_id_dict,
    item_popularity=item_to_num_users,
    # dataset dimensions
    num_users=num_users,
    num_items=num_items,
)

In [70]:
# List the contents of the current working directory.
os.listdir('./')

['.ipynb_checkpoints',
 'data',
 'matrix',
 'Neural Graph Collaborative Filtering.ipynb']

In [78]:
# Persist every entry of data_to_save as its own pickle file: ./matrix/<key>.pkl
# Create the target directory if needed so the cell also works on a fresh checkout.
os.makedirs('./matrix', exist_ok=True)
for k, v in data_to_save.items():
    # The context manager closes (and flushes) each file handle deterministically.
    with open(f'./matrix/{k}.pkl', 'wb') as f:
        pickle.dump(v, f)

In [82]:
# Interaction counts per user, in the order of the user_to_num_items mapping.
ratings_per_user = [count for count in user_to_num_items.values()]
ratings_per_user

[2698,
 2478,
 2108,
 1864,
 1346,
 1302,
 1260,
 1218,
 1115,
 1055,
 1046,
 1027,
 977,
 975,
 943,
 939,
 904,
 879,
 862,
 836,
 831,
 763,
 728,
 728,
 722,
 703,
 677,
 648,
 646,
 635,
 613,
 608,
 600,
 578,
 575,
 570,
 528,
 520,
 518,
 517,
 507,
 505,
 502,
 500,
 476,
 474,
 469,
 467,
 465,
 458,
 455,
 446,
 443,
 443,
 440,
 437,
 437,
 436,
 411,
 403,
 403,
 400,
 399,
 398,
 397,
 386,
 385,
 383,
 377,
 371,
 366,
 363,
 360,
 359,
 347,
 345,
 340,
 334,
 331,
 314,
 311,
 310,
 306,
 300,
 299,
 294,
 293,
 292,
 291,
 289,
 279,
 279,
 277,
 275,
 273,
 271,
 270,
 269,
 267,
 267,
 265,
 260,
 258,
 255,
 250,
 248,
 247,
 242,
 237,
 236,
 233,
 232,
 232,
 231,
 230,
 227,
 226,
 226,
 223,
 221,
 217,
 216,
 216,
 215,
 213,
 210,
 210,
 208,
 207,
 204,
 202,
 202,
 201,
 200,
 196,
 194,
 191,
 190,
 189,
 188,
 187,
 187,
 186,
 185,
 181,
 180,
 179,
 177,
 176,
 174,
 173,
 168,
 168,
 168,
 167,
 167,
 167,
 165,
 165,
 164,
 164,
 163,
 161,
 157,
 156

In [85]:
# Assemble a human-readable summary of the full dataset and of both splits.
density = num_ratings / (num_users * num_items)  # fraction of filled user-item cells
info_lines = [
    '# users: %d, # items: %d, # ratings: %d' % (num_users, num_items, num_ratings),
    "Sparsity : %.2f%%" % ((1 - density) * 100),
    "Min/Max/Avg. ratings per users (full data): %d %d %.2f\n" % (
        min(ratings_per_user), max(ratings_per_user), np.mean(ratings_per_user)),
    '# train users: %d, # train ratings: %d' % (train_sparse.shape[0], train_sparse.nnz),
    '# test users: %d, # test ratings: %d' % (test_sparse.shape[0], test_sparse.nnz),
]



In [86]:
# Show the assembled summary lines.
info_lines

['# users: 610, # items: 9724, # ratings: 100836',
 'Sparsity : 98.30%',
 'Min/Max/Avg. ratings per users (full data): 20 2698 165.30\n',
 '# train users: 610, # train ratings: 80419',
 '# test users: 610, # test ratings: 20417']

In [90]:
# Directory into which the save loop wrote one pickle file per artifact.
data_path = './matrix/'

In [91]:
def load_data(data_path):
    """Load the preprocessed artifacts written by this notebook.

    Parameters
    ----------
    data_path : str
        Either a directory containing one ``<key>.pkl`` file per artifact
        (the layout produced by the save loop in this notebook), or a single
        pickle file holding a dict with all keys.

    Returns
    -------
    tuple
        (train_matrix, test_matrix, org_train_matrix, org_test_matrix,
         user_id_map, user_popularity, item_id_map, item_popularity,
         num_users, num_items)

    Raises
    ------
    OSError if a file cannot be opened; KeyError if a single-file pickle lacks
    one of the expected keys.

    NOTE: pickle can execute arbitrary code while loading; only use this on
    files you created yourself.
    """
    keys = (
        'train_mat', 'test_mat', 'org_train_mat', 'org_test_mat',
        'user_id_dict', 'user_popularity',
        'item_id_dict', 'item_popularity',
        'num_users', 'num_items',
    )

    if os.path.isdir(data_path):
        # Directory layout: one pickle per key.
        data = {}
        for key in keys:
            with open(os.path.join(data_path, f'{key}.pkl'), 'rb') as f:
                data[key] = pickle.load(f)
    else:
        # Single-file layout: one pickle containing the whole dict.
        with open(data_path, 'rb') as f:
            data = pickle.load(f)

    return tuple(data[key] for key in keys)

In [94]:
# List the pickle files present in the output directory.
os.listdir('./matrix/')

['item_id_dict.pkl',
 'item_popularity.pkl',
 'num_items.pkl',
 'num_users.pkl',
 'org_test_mat.pkl',
 'org_train_mat.pkl',
 'test_mat.pkl',
 'train_mat.pkl',
 'user_id_dict.pkl',
 'user_popularity.pkl']