In [1]:
import pandas as pd
import numpy as np

In [2]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp

Using backend: pytorch


In [3]:
import matplotlib.pyplot as plt

In [4]:
from sklearn import preprocessing

### Resources
https://www.kaggle.com/rounakbanik/the-movies-dataset<br>
https://docs.dgl.ai/en/0.6.x/generated/dgl.DGLHeteroGraph.ndata.html?highlight=ndata#dgl.DGLHeteroGraph.ndata<br>
https://docs.dgl.ai/en/0.6.x/generated/dgl.heterograph.html<br>
https://docs.dgl.ai/en/0.6.x/guide/message-heterograph.html<br>
https://docs.dgl.ai/en/0.6.x/tutorials/basics/5_hetero.html<br>

In [140]:
df = pd.read_csv('ratings_small.csv')

In [141]:
df.groupby('userId').size().describe()

count     671.000000
mean      149.037258
std       231.226948
min        20.000000
25%        37.000000
50%        71.000000
75%       161.000000
max      2391.000000
dtype: float64

In [142]:
df.groupby('movieId').size().describe()

count    9066.000000
mean       11.030664
std        24.050800
min         1.000000
25%         1.000000
50%         3.000000
75%         9.000000
max       341.000000
dtype: float64

In [143]:
df.drop(columns=['timestamp'], inplace=True)

In [144]:
df.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [145]:
encoders = {
    'user': preprocessing.LabelEncoder(),
    'movie': preprocessing.LabelEncoder(),
}

In [146]:
df['encoded_user'] = encoders['user'].fit_transform(df.userId)
df['encoded_movie'] = encoders['movie'].fit_transform(df.movieId)

In [147]:
df.encoded_user.describe()

count    100004.000000
mean        346.011310
std         195.163838
min           0.000000
25%         181.000000
50%         366.000000
75%         519.000000
max         670.000000
Name: encoded_user, dtype: float64

## Graph construction
Dataset`ratings_small`: rating as edge feature

In [13]:
data_dict = {
    ('user', 'rates', 'movie'): (torch.tensor(df.encoded_user), torch.tensor(df.encoded_movie))
}

#num_nodes_dict = {'user': df.userId.nunique(), 'movie': df.movieId.nunique()}

g = dgl.heterograph(data_dict)

In [14]:
g

Graph(num_nodes={'movie': 9066, 'user': 671},
      num_edges={('user', 'rates', 'movie'): 100004},
      metagraph=[('user', 'movie', 'rates')])

In [15]:
g.edata['rating'] = torch.tensor(df.rating)

## Training and test set
https://docs.dgl.ai/tutorials/blitz/4_link_predict.html?highlight=split%20edge%20set%20training%20testing

In [16]:
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)

# 20% test
test_size = int(len(eids) * 0.2)
train_size = g.number_of_edges() - test_size


train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]

#### Negative sampling

In [17]:
# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = -1*adj.todense()+1
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [18]:
assert adj.todense()[neg_u,neg_v].all() == 0

In [19]:
train_g = dgl.remove_edges(g, eids[:test_size])

## GNN Definition

In [20]:
import dgl.function as fn

In [21]:
class CustomGNN(nn.Module):
    def __init__(self, in_dim, out_dim):
        
        super(CustomGNN, self).__init__()
        
        # Weights
        self.user_w = nn.Parameter(torch.FloatTensor(in_dim,out_dim))
        self.movie_w = nn.Parameter(torch.FloatTensor(in_dim,out_dim))
        self.edge_w = nn.Parameter(torch.FloatTensor(1,out_dim))
            
        self.reset_parameters()
        
        
    def reset_parameters(self):
        gain = nn.init.calculate_gain('relu')
        
        #attention layer init
        nn.init.xavier_normal_(self.user_w, gain=gain)
        nn.init.xavier_normal_(self.movie_w, gain=gain)
        nn.init.xavier_normal_(self.edge_w, gain=gain)
        
    def forward(self, g, user_feat, movie_feat, edge_features):
        with g.local_scope():
            g.ndata['feat'] = {'movie':movie_feat@self.movie_w, 'user':user_feat@self.user_w}
            g.edata['e_feat'] = edge_features.view(-1,1).float()@self.edge_w
            
            g.update_all(fn.u_mul_e('feat', 'e_feat', 'm'), fn.mean('m', 'h_out'))
            
            # User nodes dont get updated in message passing
            g.nodes['user'].data['h_out'] = g.ndata['feat']['user']
            
            return {ntype : g.nodes[ntype].data['h_out'] for ntype in g.ntypes}

In [22]:
user_feat = nn.Embedding(g.num_nodes('user'), 128)
movie_feat = nn.Embedding(g.num_nodes('movie'), 128)

In [23]:
gnn = CustomGNN(128, 256)

In [24]:
out = gnn(g, user_feat.weight, movie_feat.weight, g.edata['rating'])

In [25]:
out['movie'].shape

torch.Size([9066, 256])

In [26]:
out['user'].shape

torch.Size([671, 256])

#### Continue link prediction: dgl
https://docs.dgl.ai/en/0.7.x/tutorials/blitz/4_link_predict.html<br>
https://docs.dgl.ai/en/0.6.x/_modules/dgl/nn/pytorch/hetero.html

In [27]:
from dgl.nn.pytorch.hetero import HeteroGraphConv

In [28]:
conv = HeteroGraphConv({'rates' : dgl.nn.SAGEConv(256,256, aggregator_type='mean')})

In [29]:
conv(g, out)

{'movie': tensor([[-4.5027e-02,  4.9270e-02,  5.8726e-03,  ...,  4.7681e-02,
          -2.0149e-01,  1.0145e-01],
         [ 7.7432e-02,  1.2719e-03, -2.5787e-02,  ..., -5.6943e-03,
          -2.5850e-02, -1.5203e-01],
         [ 2.0834e-01,  1.4458e-01, -1.9749e-03,  ...,  1.3594e-02,
          -3.2250e-01,  1.5177e-03],
         ...,
         [-1.2448e+00,  7.9304e-01,  2.4640e+00,  ...,  1.8480e+00,
           1.0009e+00,  7.8667e-01],
         [-9.3411e-01,  5.8674e-01,  2.3965e+00,  ...,  1.0952e+00,
           9.0079e-01,  1.0845e+00],
         [ 1.0812e+00, -7.5007e-01,  2.9362e+00,  ..., -1.6688e+00,
           5.0723e-02, -2.4468e+00]], grad_fn=<SumBackward1>)}

In [30]:
from dgl.nn import SAGEConv

class GraphSAGEHetero(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGEHetero, self).__init__()
        self.conv = CustomGNN(in_feats, h_feats)
        self.conv1 = HeteroGraphConv({'rates' : dgl.nn.SAGEConv(h_feats,h_feats, aggregator_type='mean')})
        self.conv2 = HeteroGraphConv({'rates' : dgl.nn.SAGEConv(h_feats,h_feats, aggregator_type='mean')})
    
    def forward(self, g, user_feat, movie_feat, edge_features):
        
        out = self.conv(g, user_feat, movie_feat, edge_features)
        
        hidden = self.conv1(g, out)
        hidden['user'] = out['user']
        hidden['movie'] = F.relu(hidden['movie'])
        
        
        h = self.conv2(g, hidden)
        h['user'] = hidden['user']
        
        return h

In [31]:
gnn = GraphSAGEHetero(128, 256)

---

#### Positive and Negative graph

In [32]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

#### Pred test

In [33]:
class DotPredictor(nn.Module):
    def forward(self, g, user_feat, movie_feat):
        u, v = g.edges()
        
        dot = (user_feat[u]*movie_feat[v]).sum(1)
        
        return dot
    
    def predict(self, nodes_u, nodes_v):
        return (nodes_u*nodes_v).sum(1)

In [34]:
gnn = GraphSAGEHetero(128, 256)
out = gnn(train_g, user_feat.weight, movie_feat.weight, g.edata['rating'][eids[test_size:]])

In [35]:
pred = DotPredictor()
pred(train_g, user_feat.weight, movie_feat.weight)

tensor([  3.5904, -10.0490, -14.8368,  ...,  -7.8864,   3.9985, -18.1859],
       grad_fn=<SumBackward1>)

In [36]:
train_pos_g.edges()

(tensor([593, 231, 451,  ..., 194, 142,  30]),
 tensor([ 957, 1040,  313,  ..., 2487, 5479, 6365]))

### Train loop

In [37]:
from sklearn.metrics import roc_auc_score

In [63]:


# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(hidden)

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [64]:
# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop

user_feat = nn.Embedding(g.num_nodes('user'), 16)
movie_feat = nn.Embedding(g.num_nodes('movie'), 16)
model = GraphSAGEHetero(16, 32)
pred = DotPredictor()

optimizer = torch.optim.Adam(itertools.chain(model.parameters(), 
                                             pred.parameters(), 
                                             user_feat.parameters(), 
                                             movie_feat.parameters()), 
                             lr=0.01)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(100):
    # forward
    h = model(train_g, user_feat.weight, movie_feat.weight, g.edata['rating'][eids[test_size:]])
    pos_score = pred(train_g, user_feat.weight, movie_feat.weight)
    neg_score = pred(train_neg_g, user_feat.weight, movie_feat.weight)
    
    loss = compute_loss(pos_score, neg_score)
    
    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, user_feat.weight, movie_feat.weight)
    neg_score = pred(test_neg_g, user_feat.weight, movie_feat.weight)
    print('AUC', compute_auc(pos_score, neg_score))


# Thumbnail credits: Link Prediction with Neo4j, Mark Needham
# sphinx_gallery_thumbnail_path = '_static/blitz_4_link_predict.png'

In epoch 0, loss: 1.7351487874984741
In epoch 5, loss: 1.5717809200286865
In epoch 10, loss: 1.4255683422088623
In epoch 15, loss: 1.2960543632507324
In epoch 20, loss: 1.182146430015564
In epoch 25, loss: 1.0823965072631836
In epoch 30, loss: 0.9951698780059814
In epoch 35, loss: 0.9187524914741516
In epoch 40, loss: 0.8514353632926941
In epoch 45, loss: 0.7915824055671692
In epoch 50, loss: 0.7376825213432312
In epoch 55, loss: 0.6883888244628906
In epoch 60, loss: 0.6425591707229614
In epoch 65, loss: 0.5993055105209351
In epoch 70, loss: 0.5580393671989441
In epoch 75, loss: 0.5184904336929321
In epoch 80, loss: 0.48067429661750793
In epoch 85, loss: 0.44480451941490173
In epoch 90, loss: 0.4111679792404175
In epoch 95, loss: 0.38000550866127014
AUC 0.65745383625


## Testing

#### Read movie data

In [176]:
df_movies = pd.read_csv('movies.csv')

#### Get embeddings

In [130]:
with torch.no_grad():
    features = model(g, user_feat.weight, movie_feat.weight, g.edata['rating'])

In [131]:
features['user'].shape

torch.Size([671, 32])

In [132]:
features['movie'].shape

torch.Size([9066, 32])

#### Params

In [133]:
k = 10

#### User id #0

In [148]:
idx = 0

user_emb = features['user'][idx]

In [149]:
top_k_user = df[df.encoded_user == 0][['movieId', 'rating']].sort_values('rating', ascending=False).iloc[:k]

In [182]:
df_movies[df_movies.movieId.isin(top_k_user.movieId)]

Unnamed: 0,movieId,title,genres
30,31,Dangerous Minds (1995),Drama
786,1029,Dumbo (1941),Animation|Children|Drama|Musical
811,1061,Sleepers (1996),Thriller
878,1172,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama
1027,1339,Dracula (Bram Stoker's Dracula) (1992),Fantasy|Horror|Romance|Thriller
1430,1953,"French Connection, The (1971)",Action|Crime|Thriller
1567,2105,Tron (1982),Action|Adventure|Sci-Fi
1608,2150,"Gods Must Be Crazy, The (1980)",Adventure|Comedy
1846,2455,"Fly, The (1986)",Drama|Horror|Sci-Fi|Thriller
2733,3671,Blazing Saddles (1974),Comedy|Western


### Recommended movies (not rated by user)

In [189]:
df

Unnamed: 0,userId,movieId,rating,encoded_user,encoded_movie
0,1,31,2.5,0,30
1,1,1029,3.0,0,833
2,1,1061,3.0,0,859
3,1,1129,2.0,0,906
4,1,1172,4.0,0,931
...,...,...,...,...,...
99999,671,6268,2.5,670,4545
100000,671,6269,4.0,670,4546
100001,671,6365,4.0,670,4597
100002,671,6385,2.5,670,4610


In [197]:
movies_not_rated_idx = df[~df['movieId'].isin(top_k_user.movieId)]['encoded_movie'].unique()

In [201]:
pred(,features['movie'][movies_not_rated_idx])

tensor([[-0.4526,  0.0787, -0.0632,  ...,  0.3854, -0.7574,  0.1950],
        [-1.0649,  0.7201,  0.8518,  ...,  0.1992, -0.6613,  0.8012],
        [-0.9472,  0.6054,  0.1511,  ...,  0.5567, -1.3346,  0.6363],
        ...,
        [-1.1546, -0.3386,  1.4607,  ...,  1.8604, -1.1157,  0.7176],
        [-0.1992,  0.1527,  0.8658,  ...,  0.8137, -0.6951,  1.2395],
        [ 5.2577,  2.8392, -1.3795,  ..., -6.9084,  5.3330,  8.4222]])