In [1]:
import pandas as pd
import torch
import dgl
import pickle
from utils import HGT
import random

In [2]:
graph = dgl.load_graphs('training_data/graph.dgl')
graph = graph[0][0]

with open('training_data/train.obj', 'rb') as fp:
	train = pickle.load(fp)

with open('training_data/val.obj', 'rb') as fp:
	val = pickle.load(fp)

with open('training_data/test.obj', 'rb') as fp:
	test = pickle.load(fp)

In [3]:
graph.nodes('business').shape[0]

150243

In [4]:
edges = {}
for canonical_etype in graph.canonical_etypes:
    edges[canonical_etype] = graph.edges(etype=canonical_etype)

edges[('category', 'category_to_business', 'business')] = (graph.edges(etype='business_has_category')[1], graph.edges(etype='business_has_category')[0])
edges[('business', 'business_to_review', 'review')] = (graph.edges(etype='review_to_business')[1], graph.edges(etype='review_to_business')[0])
edges[('business', 'business_to_tip', 'tip')] = (graph.edges(etype='tip_to_business')[1], graph.edges(etype='tip_to_business')[0])
edges[('review', 'review_to_user', 'user')] = (graph.edges(etype='user_to_review')[1], graph.edges(etype='user_to_review')[0])
edges[('tip', 'tip_to_user', 'user')] = (graph.edges(etype='user_to_tip')[1], graph.edges(etype='user_to_tip')[0])

num_nodes_dict = {} 
for ntype in graph.ntypes:
    num_nodes_dict[ntype] = graph.nodes(ntype).shape[0]

g = dgl.heterograph(edges, num_nodes_dict = num_nodes_dict)
g.ndata['feat'] = {k: torch.tensor(v, dtype=torch.float32) for k, v in graph.ndata['feat'].items() }
del graph

  g.ndata['feat'] = {k: torch.tensor(v, dtype=torch.float32) for k, v in graph.ndata['feat'].items() }


In [5]:
g.num_nodes

<bound method DGLGraph.num_nodes of Graph(num_nodes={'business': 150243, 'category': 1311, 'review': 6339837, 'tip': 908878, 'user': 1987897},
      num_edges={('business', 'business_has_category', 'category'): 668592, ('business', 'business_to_review', 'review'): 6339837, ('business', 'business_to_tip', 'tip'): 908878, ('category', 'category_to_business', 'business'): 668592, ('review', 'review_to_business', 'business'): 6339837, ('review', 'review_to_user', 'user'): 6339837, ('tip', 'tip_to_business', 'business'): 908878, ('tip', 'tip_to_user', 'user'): 908878, ('user', 'user_to_review', 'review'): 6339837, ('user', 'user_to_tip', 'tip'): 908878, ('user', 'user_to_user', 'user'): 437928},
      metagraph=[('business', 'category', 'business_has_category'), ('business', 'review', 'business_to_review'), ('business', 'tip', 'business_to_tip'), ('category', 'business', 'category_to_business'), ('review', 'business', 'review_to_business'), ('review', 'user', 'review_to_user'), ('tip', 'bus

In [6]:
node_dict = { ntype: g.ntypes.index(ntype) for ntype in g.ntypes }
edge_dict = { canonical_etype: g.canonical_etypes.index(canonical_etype) for canonical_etype in g.canonical_etypes }
feature_dim_dict = { ntype: g.ndata['feat'][ntype].shape[1] for ntype in g.ntypes }

In [7]:
node_dict

{'business': 0, 'category': 1, 'review': 2, 'tip': 3, 'user': 4}

In [8]:
model = HGT(node_dict, edge_dict, feature_dim_dict, n_hid=256, n_out=128, n_layers=4, n_heads=8, use_norm=False)
opt = torch.optim.AdamW(model.parameters(), 1e-4)
sampler = dgl.dataloading.NeighborSampler([24, 24, 24, 24])
criterion = torch.nn.MarginRankingLoss(margin=0.1)
# dgl.dataloading.NeighborSampler([
#     {('user', 'follows', 'user'): 5,
#      ('user', 'plays', 'game'): 4,
#      ('game', 'played-by', 'user'): 3}] * 3)

In [9]:
train_pos_ids = list(range(train['pos'][0].shape[0]))
train_neg_ids = list(range(train['neg'][0].shape[0]))

In [10]:
len(train_neg_ids)

135108

In [11]:
g.ndata['feat']['category'].dtype

torch.float32

In [12]:
def predict(g, model, pos_ids, neg_ids, relation_tuple, sampler, batch_size):
    pos_users = torch.index_select(relation_tuple['pos'][0], 0, torch.tensor(pos_ids))
    pos_users_unique, pos_users_inverse = torch.unique(pos_users, return_inverse=True)
    pos_block_user = [blocks for _, _, blocks in dgl.dataloading.DataLoader(
        g, {'user': pos_users_unique}, sampler,
        batch_size=batch_size, shuffle=False, drop_last=False, num_workers=1)][0]

    pos_business = torch.index_select(relation_tuple['pos'][1], 0, torch.tensor(pos_ids))
    pos_business_unique, pos_business_inverse = torch.unique(pos_business, return_inverse=True)
    pos_block_business = [blocks for _, _, blocks in dgl.dataloading.DataLoader(
        g, {'business': pos_business_unique }, sampler,
        batch_size=batch_size, shuffle=False, drop_last=False, num_workers=1)][0]
    
    neg_users = torch.index_select(relation_tuple['neg'][0], 0, torch.tensor(neg_ids))
    neg_users_unique, neg_users_inverse = torch.unique(neg_users, return_inverse=True)
    neg_block_user = [blocks for _, _, blocks in dgl.dataloading.DataLoader(
        g, {'user': neg_users_unique }, sampler,
        batch_size=batch_size, shuffle=False, drop_last=False, num_workers=1)][0]
    
    neg_business = torch.index_select(relation_tuple['neg'][1], 0, torch.tensor(neg_ids))
    neg_business_unique, neg_business_inverse = torch.unique(neg_business, return_inverse=True)
    neg_block_business = [blocks for _, _, blocks in dgl.dataloading.DataLoader(
        g, {'business': neg_business_unique }, sampler,
        batch_size=batch_size, shuffle=False, drop_last=False, num_workers=1)][0]

    pos_user_logits = torch.index_select(model(pos_block_user, 'user'), 0, pos_users_inverse)
    pos_business_logits = torch.index_select(model(pos_block_business, 'business'), 0, pos_business_inverse)
    neg_user_logits = torch.index_select(model(neg_block_user, 'user'), 0, neg_users_inverse)
    neg_business_logits = torch.index_select(model(neg_block_business, 'business'), 0, neg_business_inverse)
    return pos_user_logits, pos_business_logits, neg_user_logits, neg_business_logits

In [13]:
def split(list_a, chunk_size):
    for i in range(0, len(list_a), chunk_size):
        yield list_a[i:i + chunk_size]

batch_size = 64
for epoch in range(1):
    model.train()
    random.shuffle(train_pos_ids)
    random.shuffle(train_neg_ids)
    for batch in split(list(zip(train_pos_ids, train_neg_ids)), batch_size):
        opt.zero_grad()
        pos_ids, neg_ids = list(zip(*batch))
        pos_user_logits, pos_business_logits, neg_user_logits, neg_business_logits = predict(g, model, pos_ids, neg_ids, train, sampler, batch_size)
        pos_score = torch.bmm(pos_user_logits.view(batch_size, 1, model.n_out), pos_business_logits.view(batch_size, model.n_out, 1)).squeeze()
        neg_score = torch.bmm(neg_user_logits.view(batch_size, 1, model.n_out), neg_business_logits.view(batch_size, model.n_out, 1)).squeeze()
        loss = criterion(pos_score, neg_score, torch.ones(batch_size))
        loss.backward()
        opt.step()
        print(loss.item())
    break

  assert input.numel() == input.storage().size(), (


0.10345187783241272
0.10159596800804138
0.10099367797374725
0.10106093436479568
0.10034532845020294
0.0988563820719719
0.09787221252918243
0.09768624603748322
0.09934961050748825
0.09831322729587555
0.09636179357767105
0.09732170403003693
0.09519215673208237
0.09284672141075134
0.0941537618637085
0.09199901670217514
0.09243879467248917
0.09165386855602264
0.08929087966680527
0.09127331525087357
0.09083104133605957
0.09100508689880371
0.08245374262332916
0.08667824417352676
0.0898139476776123
0.08351676166057587
0.08246343582868576
0.08488159626722336
0.08278439939022064
0.07845573872327805
0.07928439974784851
0.07706654071807861
0.08241486549377441
0.07168830186128616
0.08841340243816376
0.07437971234321594
0.08074168115854263
0.06672988086938858
0.07615679502487183
0.06875477731227875
0.07358470559120178
0.07405633479356766
0.07372436672449112
0.07940811663866043
0.0912003368139267
0.07253234088420868
0.07213099300861359
0.05858322232961655
0.06751268357038498
0.06283405423164368
0.06

KeyboardInterrupt: 

## experiment

In [44]:
user_pos_id = train['pos'][0][0]

In [30]:
pos_business = train['pos'][1][train['pos'][0] == train['pos'][0][0]]
neg_business = train['neg'][1][train['neg'][0] == train['pos'][0][0]]

In [32]:
pos_business

tensor([111108,  80217,  25553,  73239,  66461,  23450,  20934, 120272,  86879,
         38967,  84152,  13612])

In [31]:
neg_business

tensor([120919])

In [43]:
user_df = pd.read_csv('preprocessed/user_ids.csv')
business_df = pd.read_csv('preprocessed/business_ids.csv')

In [46]:
user_df.loc[user_pos_id.tolist()]

user_id:ID    U438yUH5aBVntI_CbVt8jg
Name: 207300, dtype: object

In [48]:
business_df.loc[pos_business.tolist()]

Unnamed: 0,business_id:ID
111108,a0d09197752174e42a05ff2cf445fa91
80217,064a4a8a97aa17167a9427a19aca98ef
25553,33d2e8ccd5b8f4d14ad8e83b11444bc0
73239,3f2388115a0b7cc98b242191fdad7bf4
66461,8e64483dbe1cb3d0662df91b83867345
23450,1fd668fc67cb62812e523ab153b411ce
20934,f30c7b0034d553e0da7e07811841868b
120272,3d6954a8431403d9e6e8b293a943d6d2
86879,15aaec95654f4ba868dfb2547ec72193
38967,e1756e58d54f74ce2392ec5fe40d0eb5


In [49]:
business_df.loc[neg_business.tolist()]

Unnamed: 0,business_id:ID
120919,82ad544c4332ea410b5018b6b69b5a2d


In [33]:
model.eval()
block_user = [blocks for _, _, blocks in dgl.dataloading.DataLoader(
    g, {'user': [user_pos_id] }, sampler,
    batch_size=batch_size, shuffle=False, drop_last=False, num_workers=1)][0]

pos_block_business = [blocks for _, _, blocks in dgl.dataloading.DataLoader(
    g, {'business': pos_business }, sampler,
    batch_size=batch_size, shuffle=False, drop_last=False, num_workers=1)][0]

neg_block_business = [blocks for _, _, blocks in dgl.dataloading.DataLoader(
    g, {'business': neg_business }, sampler,
    batch_size=batch_size, shuffle=False, drop_last=False, num_workers=1)][0]

user = model(block_user, 'user')[0]
p_business = model(pos_block_business, 'business')
n_business = model(neg_block_business, 'business')


  assert input.numel() == input.storage().size(), (


In [41]:
torch.bmm(user.repeat(p_business.shape[0], 1).view(p_business.shape[0], 1, model.n_out), p_business.view(p_business.shape[0], model.n_out, 1)).squeeze()

tensor([0.5728, 0.5628, 0.5803, 0.5380, 0.5070, 0.5362, 0.5807, 0.5817, 0.5664,
        0.5480, 0.5818, 0.5637], grad_fn=<SqueezeBackward0>)

In [42]:
torch.bmm(user.repeat(n_business.shape[0], 1).view(n_business.shape[0], 1, model.n_out), n_business.view(n_business.shape[0], model.n_out, 1)).squeeze()

tensor(0.5402, grad_fn=<SqueezeBackward0>)