In [1]:
import pandas as pd
import numpy as np

In [2]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp

Using backend: pytorch


In [3]:
import matplotlib.pyplot as plt

In [4]:
from sklearn import preprocessing

### Resources
https://www.kaggle.com/rounakbanik/the-movies-dataset<br>
https://docs.dgl.ai/en/0.6.x/generated/dgl.DGLHeteroGraph.ndata.html?highlight=ndata#dgl.DGLHeteroGraph.ndata<br>
https://docs.dgl.ai/en/0.6.x/generated/dgl.heterograph.html<br>
https://docs.dgl.ai/en/0.6.x/guide/message-heterograph.html<br>
https://docs.dgl.ai/en/0.6.x/tutorials/basics/5_hetero.html<br>

In [5]:
df = pd.read_csv('ratings.csv')

In [6]:
df.groupby('userId').size().describe()

count     610.000000
mean      165.304918
std       269.480584
min        20.000000
25%        35.000000
50%        70.500000
75%       168.000000
max      2698.000000
dtype: float64

In [7]:
df.groupby('movieId').size().describe()

count    9724.000000
mean       10.369807
std        22.401005
min         1.000000
25%         1.000000
50%         3.000000
75%         9.000000
max       329.000000
dtype: float64

In [8]:
df.drop(columns=['timestamp'], inplace=True)

In [9]:
df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [10]:
encoders = {
    'user': preprocessing.LabelEncoder(),
    'movie': preprocessing.LabelEncoder(),
}

In [11]:
df['encoded_user'] = encoders['user'].fit_transform(df.userId)
df['encoded_movie'] = encoders['movie'].fit_transform(df.movieId)

In [12]:
df.encoded_user.describe()

count    100836.000000
mean        325.127564
std         182.618491
min           0.000000
25%         176.000000
50%         324.000000
75%         476.000000
max         609.000000
Name: encoded_user, dtype: float64

## Graph construction
Dataset`ratings_small`: rating as edge feature

In [13]:
data_dict = {
    ('user', 'rates', 'movie'): (torch.tensor(df.encoded_user), torch.tensor(df.encoded_movie))
}

#num_nodes_dict = {'user': df.userId.nunique(), 'movie': df.movieId.nunique()}

g = dgl.heterograph(data_dict)

In [14]:
g

Graph(num_nodes={'movie': 9724, 'user': 610},
      num_edges={('user', 'rates', 'movie'): 100836},
      metagraph=[('user', 'movie', 'rates')])

In [15]:
g.edata['rating'] = torch.tensor(df.rating)

## Training and test set
https://docs.dgl.ai/tutorials/blitz/4_link_predict.html?highlight=split%20edge%20set%20training%20testing

In [16]:
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)

# 20% test
test_size = int(len(eids) * 0.2)
train_size = g.number_of_edges() - test_size


train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]

#### Negative sampling

In [17]:
# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = -1*adj.todense()+1
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [18]:
assert adj.todense()[neg_u,neg_v].all() == 0

In [19]:
train_g = dgl.remove_edges(g, eids[:test_size])

## GNN Definition

In [20]:
import dgl.function as fn

In [21]:
class CustomGNN(nn.Module):
    def __init__(self, in_dim, out_dim):
        
        super(CustomGNN, self).__init__()
        
        # Weights
        self.user_w = nn.Parameter(torch.FloatTensor(in_dim,out_dim))
        self.movie_w = nn.Parameter(torch.FloatTensor(in_dim,out_dim))
        self.edge_w = nn.Parameter(torch.FloatTensor(1,out_dim))
            
        self.reset_parameters()
        
        
    def reset_parameters(self):
        gain = nn.init.calculate_gain('relu')
        
        #attention layer init
        nn.init.xavier_normal_(self.user_w, gain=gain)
        nn.init.xavier_normal_(self.movie_w, gain=gain)
        nn.init.xavier_normal_(self.edge_w, gain=gain)
        
    def forward(self, g, user_feat, movie_feat, edge_features):
        with g.local_scope():
            g.ndata['feat'] = {'movie':movie_feat@self.movie_w, 'user':user_feat@self.user_w}
            g.edata['e_feat'] = edge_features.view(-1,1).float()@self.edge_w
            
            g.update_all(fn.u_mul_e('feat', 'e_feat', 'm'), fn.mean('m', 'h_out'))
            
            # User nodes dont get updated in message passing
            g.nodes['user'].data['h_out'] = g.ndata['feat']['user']
            
            return {ntype : g.nodes[ntype].data['h_out'] for ntype in g.ntypes}

In [22]:
user_feat = nn.Embedding(g.num_nodes('user'), 128)
movie_feat = nn.Embedding(g.num_nodes('movie'), 128)

In [23]:
gnn = CustomGNN(128, 256)

In [24]:
out = gnn(g, user_feat.weight, movie_feat.weight, g.edata['rating'])

In [25]:
out['movie'].shape

torch.Size([9724, 256])

In [26]:
out['user'].shape

torch.Size([610, 256])

#### Continue link prediction: dgl
https://docs.dgl.ai/en/0.7.x/tutorials/blitz/4_link_predict.html<br>
https://docs.dgl.ai/en/0.6.x/_modules/dgl/nn/pytorch/hetero.html

In [27]:
from dgl.nn.pytorch.hetero import HeteroGraphConv

In [28]:
conv = HeteroGraphConv({'rates' : dgl.nn.SAGEConv(256,256, aggregator_type='mean')})

In [29]:
conv(g, out)

{'movie': tensor([[ 0.1380,  0.0207,  0.1918,  ...,  0.0743, -0.0477, -0.0509],
         [ 0.1700,  0.0984,  0.1431,  ..., -0.1172,  0.0784,  0.0902],
         [ 0.1093,  0.3728,  0.2976,  ..., -0.2210,  0.0170, -0.2950],
         ...,
         [-0.4243, -1.0367,  0.3093,  ..., -0.7338,  0.7758,  0.9527],
         [-0.4243, -1.0367,  0.3093,  ..., -0.7338,  0.7758,  0.9527],
         [-1.0335, -3.5448, -2.0143,  ..., -1.5799, -0.8559, -1.5146]],
        grad_fn=<SumBackward1>)}

In [30]:
from dgl.nn import SAGEConv

class GraphSAGEHetero(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGEHetero, self).__init__()
        self.conv = CustomGNN(in_feats, h_feats)
        self.conv1 = HeteroGraphConv({'rates' : dgl.nn.SAGEConv(h_feats,h_feats, aggregator_type='mean')})
        self.conv2 = HeteroGraphConv({'rates' : dgl.nn.SAGEConv(h_feats,h_feats, aggregator_type='mean')})
    
    def forward(self, g, user_feat, movie_feat, edge_features):
        
        out = self.conv(g, user_feat, movie_feat, edge_features)
        
        hidden = self.conv1(g, out)
        hidden['user'] = out['user']
        hidden['movie'] = F.relu(hidden['movie'])
        
        
        h = self.conv2(g, hidden)
        h['user'] = hidden['user']
        
        return h

In [31]:
gnn = GraphSAGEHetero(128, 256)

---

#### Positive and Negative graph

In [32]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

#### Pred test

In [33]:
class DotPredictor(nn.Module):
    def forward(self, g, user_feat, movie_feat):
        u, v = g.edges()
        
        dot = (user_feat[u]*movie_feat[v]).sum(1)
        
        return dot
    
    def predict(self, nodes_u, nodes_v):
        return torch.sigmoid((nodes_u*nodes_v).sum(1))

In [34]:
gnn = GraphSAGEHetero(128, 256)
out = gnn(train_g, user_feat.weight, movie_feat.weight, g.edata['rating'][eids[test_size:]])

In [35]:
pred = DotPredictor()
pred(train_g, user_feat.weight, movie_feat.weight)

tensor([ 2.7286e+00,  2.9155e+00, -3.7165e-03,  ...,  3.2878e+00,
        -7.1065e+00,  1.9937e+00], grad_fn=<SumBackward1>)

In [36]:
train_pos_g.edges()

(tensor([ 67, 291, 186,  ..., 595, 380, 599]),
 tensor([7154, 5475, 3979,  ..., 4155, 3910, 1177]))

### Train loop

In [54]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [81]:


# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(hidden)

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

def compute_recall(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    scores = torch.sigmoid(scores)
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return recall_score(labels, torch.round(scores))

def compute_precision(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    scores = torch.sigmoid(scores)
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return precision_score(labels, torch.round(scores))

In [82]:
# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop

user_feat = nn.Embedding(g.num_nodes('user'), 16)
movie_feat = nn.Embedding(g.num_nodes('movie'), 16)
model = GraphSAGEHetero(16, 32)
pred = DotPredictor()

optimizer = torch.optim.Adam(itertools.chain(model.parameters(), 
                                             pred.parameters(), 
                                             user_feat.parameters(), 
                                             movie_feat.parameters()), 
                             lr=0.01)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(50):
    # forward
    h = model(train_g, user_feat.weight, movie_feat.weight, g.edata['rating'][eids[test_size:]])
    pos_score = pred(train_g, user_feat.weight, movie_feat.weight)
    neg_score = pred(train_neg_g, user_feat.weight, movie_feat.weight)
    
    loss = compute_loss(pos_score, neg_score)
    
    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, user_feat.weight, movie_feat.weight)
    neg_score = pred(test_neg_g, user_feat.weight, movie_feat.weight)
    print('AUC', compute_auc(pos_score, neg_score))
    print('Recall', compute_recall(pos_score, neg_score))
    print('Precision', compute_precision(pos_score, neg_score))


# Thumbnail credits: Link Prediction with Neo4j, Mark Needham
# sphinx_gallery_thumbnail_path = '_static/blitz_4_link_predict.png'

In epoch 0, loss: 1.715492844581604
In epoch 5, loss: 1.5523698329925537
In epoch 10, loss: 1.4066458940505981
In epoch 15, loss: 1.2778429985046387
In epoch 20, loss: 1.1648211479187012
In epoch 25, loss: 1.066038727760315
In epoch 30, loss: 0.9797517657279968
In epoch 35, loss: 0.9041666388511658
In epoch 40, loss: 0.8375397324562073
In epoch 45, loss: 0.7782294750213623
[-1.2369193 -5.275935  -0.8983321  2.1247125 -1.419433 ]
AUC 0.5206376080401036
Recall 0.532057321366589
Precision 0.5174825174825175


## Testing

#### Read movie data

In [70]:
df_movies = pd.read_csv('movies.csv')

#### Get embeddings

In [71]:
with torch.no_grad():
    features = model(g, user_feat.weight, movie_feat.weight, g.edata['rating'])

In [72]:
features['user'].shape

torch.Size([610, 32])

In [73]:
features['movie'].shape

torch.Size([9724, 32])

#### Params

In [74]:
k = 10

#### User id #0

In [75]:
idx = 7

user_emb = features['user'][idx]

In [76]:
top_k_user = df[df.encoded_user == idx][['movieId', 'rating']].sort_values('rating', ascending=False).iloc[:k]

In [77]:
df_movies[df_movies.movieId.isin(top_k_user.movieId)]

Unnamed: 0,movieId,title,genres
32,34,Babe (1995),Children|Drama
46,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
156,185,"Net, The (1995)",Action|Crime|Thriller
217,253,Interview with the Vampire: The Vampire Chroni...,Drama|Horror
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
315,357,Four Weddings and a Funeral (1994),Comedy|Romance
322,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
337,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller
461,527,Schindler's List (1993),Drama|War
508,590,Dances with Wolves (1990),Adventure|Drama|Western


### Recommended movies

In [78]:
prediction = pred.predict(user_emb.view(1,-1),features['movie'])
prediction_index = prediction.topk(k).indices

In [79]:
df_movies[df_movies.movieId.isin(encoders['movie'].inverse_transform(prediction_index))]

Unnamed: 0,movieId,title,genres
29,30,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,Crime|Drama
37,41,Richard III (1995),Drama|War
39,43,Restoration (1995),Drama
45,49,When Night Is Falling (1995),Drama|Romance
49,54,"Big Green, The (1995)",Children|Comedy
50,55,Georgia (1995),Drama
57,64,Two if by Sea (1996),Comedy|Romance
63,71,Fair Game (1995),Action
67,75,Big Bully (1996),Comedy|Drama
69,77,Nico Icon (1995),Documentary


In [80]:
df_movies[df_movies.movieId.isin(top_k_user.movieId)]

Unnamed: 0,movieId,title,genres
32,34,Babe (1995),Children|Drama
46,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
156,185,"Net, The (1995)",Action|Crime|Thriller
217,253,Interview with the Vampire: The Vampire Chroni...,Drama|Horror
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
315,357,Four Weddings and a Funeral (1994),Comedy|Romance
322,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
337,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller
461,527,Schindler's List (1993),Drama|War
508,590,Dances with Wolves (1990),Adventure|Drama|Western
