In [1]:
import pandas as pd
import numpy as np

In [2]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp

Using backend: pytorch


In [7]:
import matplotlib.pyplot as plt

In [41]:
from sklearn import preprocessing

### Resources
https://www.kaggle.com/rounakbanik/the-movies-dataset<br>
https://docs.dgl.ai/en/0.6.x/generated/dgl.DGLHeteroGraph.ndata.html?highlight=ndata#dgl.DGLHeteroGraph.ndata<br>
https://docs.dgl.ai/en/0.6.x/generated/dgl.heterograph.html<br>
https://docs.dgl.ai/en/0.6.x/guide/message-heterograph.html<br>
https://docs.dgl.ai/en/0.6.x/tutorials/basics/5_hetero.html<br>

In [3]:
# Load the ratings table from 'ratings_small.csv' (path is relative to the working directory).
df = pd.read_csv('ratings_small.csv')

In [13]:
# Summary statistics of the number of rows (ratings) per userId.
df.groupby('userId').size().describe()

count     671.000000
mean      149.037258
std       231.226948
min        20.000000
25%        37.000000
50%        71.000000
75%       161.000000
max      2391.000000
dtype: float64

In [22]:
# Summary statistics of the number of rows (ratings) per movieId.
df.groupby('movieId').size().describe()

count    9066.000000
mean       11.030664
std        24.050800
min         1.000000
25%         1.000000
50%         3.000000
75%         9.000000
max       341.000000
dtype: float64

In [18]:
# Drop the timestamp column, which is not used in this notebook.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns=['timestamp'], errors='ignore')

In [384]:
# Preview the first rows of the ratings table.
# NOTE(review): the saved output already shows encoded_user / encoded_movie, which are
# only created by the label-encoding cells further down — this cell was run out of order.
df.head()

Unnamed: 0,userId,movieId,rating,encoded_user,encoded_movie
0,1,31,2.5,0,30
1,1,1029,3.0,0,833
2,1,1061,3.0,0,859
3,1,1129,2.0,0,906
4,1,1172,4.0,0,931


In [42]:
# One independent LabelEncoder per entity type, keyed by the entity name.
encoders = {entity: preprocessing.LabelEncoder() for entity in ('user', 'movie')}

In [43]:
# Fit each encoder on its raw id column and store the resulting integer labels
# in new encoded_* columns of df.
df['encoded_user'] = encoders['user'].fit_transform(df['userId'])
df['encoded_movie'] = encoders['movie'].fit_transform(df['movieId'])

In [46]:
# Inspect the range of the encoded user ids (min / max show the label span).
df.encoded_user.describe()

count    100004.000000
mean        346.011310
std         195.163838
min           0.000000
25%         181.000000
50%         366.000000
75%         519.000000
max         670.000000
Name: encoded_user, dtype: float64

## Graph construction
Dataset `ratings_small`: each rating is stored as an edge feature.

In [385]:
# Bipartite rating graph: one ('user', 'rates', 'movie') edge per row of df,
# from the row's encoded user id to its encoded movie id.
data_dict = {
    ('user', 'rates', 'movie'): (
        torch.tensor(df['encoded_user'].to_numpy()),
        torch.tensor(df['encoded_movie'].to_numpy()),
    )
}

# Explicit node counts per type. This dict was previously commented out while still
# being passed to dgl.heterograph below, which raises NameError on a fresh kernel.
# The encoded ids come from LabelEncoder, so the number of distinct ids is the node count.
num_nodes_dict = {
    'user': df['encoded_user'].nunique(),
    'movie': df['encoded_movie'].nunique(),
}

g = dgl.heterograph(data_dict, num_nodes_dict=num_nodes_dict)

In [386]:
# Display the graph summary (node and edge counts per type).
g

Graph(num_nodes={'movie': 9066, 'user': 671},
      num_edges={('user', 'rates', 'movie'): 100004},
      metagraph=[('user', 'movie', 'rates')])

In [387]:
# Store the rating column as the 'rating' edge feature.
# NOTE(review): relies on the graph's edge order matching df's row order — confirm.
g.edata['rating'] = torch.tensor(df.rating)

## Training and test set
https://docs.dgl.ai/tutorials/blitz/4_link_predict.html?highlight=split%20edge%20set%20training%20testing

In [250]:
# Source (user) and destination (movie) node ids of every edge, indexed by edge id.
u, v = g.edges()

# Shuffle the edge ids with a seeded, local generator so the train/test split is
# reproducible across kernel restarts without touching NumPy's global random state.
split_rng = np.random.default_rng(42)
eids = split_rng.permutation(g.number_of_edges())

# 20% test
test_size = int(len(eids) * 0.2)
train_size = g.number_of_edges() - test_size

# The first test_size shuffled edges are held out for testing; the rest are for training.
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]

#### Negative sampling

In [251]:
# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = -1*adj.todense()+1
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [252]:
# Every candidate negative pair must be absent from the adjacency matrix.
# The comparison has to be element-wise *before* reducing with .all():
# `x.all() == 0` only verifies that at least one entry is zero, not that all are.
assert (adj.toarray()[neg_u, neg_v] == 0).all()

In [253]:
# Training graph: g with the first test_size shuffled edge ids (the held-out test edges) removed.
train_g = dgl.remove_edges(g, eids[:test_size])

## GNN Definition

In [295]:
import dgl.function as fn

In [456]:
class CustomGNN(nn.Module):
    """One message-passing layer over a graph with 'user' and 'movie' node types.

    Node features of both types are linearly projected from ``in_dim`` to
    ``out_dim``; the scalar edge feature is lifted to ``out_dim`` by a learned
    row vector. Messages are the element-wise product of the projected source
    feature and the projected edge feature, averaged at the destination node.
    """

    def __init__(self, in_dim, out_dim):
        """Create the three weight matrices and initialise them.

        Args:
            in_dim: size of the input node features (shared by both node types).
            out_dim: size of the projected features and of the layer output.
        """
        super().__init__()

        # Per-node-type projections, plus a (1, out_dim) row for the scalar edge feature.
        self.user_w = nn.Parameter(torch.FloatTensor(in_dim, out_dim))
        self.movie_w = nn.Parameter(torch.FloatTensor(in_dim, out_dim))
        self.edge_w = nn.Parameter(torch.FloatTensor(1, out_dim))

        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-normal initialisation of every weight, using the ReLU gain."""
        relu_gain = nn.init.calculate_gain('relu')
        for weight in (self.user_w, self.movie_w, self.edge_w):
            nn.init.xavier_normal_(weight, gain=relu_gain)

    def forward(self, g, user_feat, movie_feat, edge_features):
        """Run one round of message passing and return per-node-type outputs.

        Args:
            g: graph to operate on; features are written inside a local scope,
                so nothing is left on ``g`` afterwards.
            user_feat: input features of the 'user' nodes.
            movie_feat: input features of the 'movie' nodes.
            edge_features: one value per edge; reshaped to a column and cast to float.

        Returns:
            Dict mapping each node type of ``g`` to its 'h_out' tensor.
        """
        with g.local_scope():
            projected = {
                'movie': movie_feat @ self.movie_w,
                'user': user_feat @ self.user_w,
            }
            g.ndata['feat'] = projected
            g.edata['e_feat'] = edge_features.view(-1, 1).float() @ self.edge_w

            # message = source feature * edge feature; reduce = mean over incoming messages
            message_fn = fn.u_mul_e('feat', 'e_feat', 'm')
            reduce_fn = fn.mean('m', 'h_out')
            g.update_all(message_fn, reduce_fn)

            # User nodes are not updated by message passing, so their output is
            # simply their projected input feature.
            g.nodes['user'].data['h_out'] = g.ndata['feat']['user']

            return {ntype: g.nodes[ntype].data['h_out'] for ntype in g.ntypes}

In [470]:
# Learnable 128-dimensional input embedding tables, one row per node of each type.
user_feat = nn.Embedding(num_embeddings=g.num_nodes('user'), embedding_dim=128)
movie_feat = nn.Embedding(num_embeddings=g.num_nodes('movie'), embedding_dim=128)

In [471]:
# Layer projecting the 128-dim input embeddings to 256-dim outputs.
gnn = CustomGNN(128, 256)

In [472]:
# Forward pass over the full graph g: embedding tables as node features, ratings as edge features.
out = gnn(g, user_feat.weight, movie_feat.weight, g.edata['rating'])

In [473]:
# Sanity check: shape of the layer output for movie nodes.
out['movie'].shape

torch.Size([9066, 256])

In [479]:
# Sanity check: shape of the layer output for user nodes.
out['user'].shape

torch.Size([671, 256])

#### Continuing link prediction with DGL
https://docs.dgl.ai/en/0.7.x/tutorials/blitz/4_link_predict.html<br>
https://docs.dgl.ai/en/0.6.x/_modules/dgl/nn/pytorch/hetero.html

In [488]:
from dgl.nn.pytorch.hetero import HeteroGraphConv

In [498]:
# Heterogeneous conv wrapper that applies a mean-aggregator SAGEConv (256 -> 256) to the 'rates' relation.
conv = HeteroGraphConv({'rates' : dgl.nn.SAGEConv(256,256, aggregator_type='mean')})