In [5]:
import pandas as pd
import numpy as np
import random
import tqdm
import gc
gc.enable()

%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import download_url, extract_zip, HeteroData
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import torch_geometric.transforms as T

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
extract_zip(download_url(url, '.'), '.')

movies_path = './ml-latest-small/movies.csv'
ratings_path = './ml-latest-small/ratings.csv'

Using existing file ml-latest-small.zip
Extracting ./ml-latest-small.zip


In [7]:
# Load movies data
movies_df = pd.read_csv(movies_path, index_col='movieId')
print(movies_df.head())

                                      title  \
movieId                                       
1                          Toy Story (1995)   
2                            Jumanji (1995)   
3                   Grumpier Old Men (1995)   
4                  Waiting to Exhale (1995)   
5        Father of the Bride Part II (1995)   

                                              genres  
movieId                                               
1        Adventure|Animation|Children|Comedy|Fantasy  
2                         Adventure|Children|Fantasy  
3                                     Comedy|Romance  
4                               Comedy|Drama|Romance  
5                                             Comedy  


In [8]:
# Create indicator movie genre variables
genres = movies_df.genres.str.get_dummies('|')
print(genres[['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Comedy']].head())

# Create tensor to hold movie features (20 genre indicators)
movie_feat = torch.from_numpy(genres.values).to(torch.float)
print("Movie features: ", movie_feat.size())

         (no genres listed)  Action  Adventure  Animation  Comedy
movieId                                                          
1                         0       0          1          1       1
2                         0       0          1          0       0
3                         0       0          0          0       1
4                         0       0          0          0       1
5                         0       0          0          0       1
Movie features:  torch.Size([9742, 20])


In [9]:
movie_feat = torch.from_numpy(genres.values).to(torch.float)
print("Movie features: ", movie_feat.size())

Movie features:  torch.Size([9742, 20])


In [10]:
# Load ratings data
ratings_df = pd.read_csv(ratings_path)
print(ratings_df.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [11]:
# Create dummy user data
fname_pool = ['Amy','Bob','Charles','David','Elena','Fahim','George','Harriet','Isabel','James',
              'Katherine','Laith','Mary','Nora','Oliver','Percy','Quincy','Rachel','Stewart']
lname_pool = ['Adams','Bamford','Clint','Davids','Ellsworth','Finn','Garcia','Harlow','Irwin','Johnson',
              'Kringle','Luo','Mann','Navarro','Oswald','Werther','Yee']

users_df = pd.DataFrame()
for _, user in enumerate(ratings_df.userId.unique()):
    # tmp_df = pd.DataFrame(data={
    #     'userId'            : user,
    #     'name'              : f"{random.choice(fname_pool)} {random.choice(lname_pool)}",
    # }, index=[idx])
    tmp_df = ratings_df[ratings_df.userId==user]
    try:
        tmp_sum = genres.iloc[tmp_df.movieId.values].sum() # How many movies of each genre has user watched?
    except: # running into indexing problems halfway through (tried iterrows)
        indexable = []
        for x in tmp_df.movieId.values:
            try:
                genres[x]
                indexable.append(x)
            except:
                pass
        tmp_sum = genres.iloc[indexable].sum()
    tmp_df = pd.DataFrame(tmp_sum).T
    tmp_df.index = [user]
    tmp_df.loc[:,'name'] = f"{random.choice(fname_pool)} {random.choice(lname_pool)}"
    users_df = pd.concat([users_df,tmp_df])

In [12]:
print(users_df.head())

# Detach user features
user_feat = torch.from_numpy(users_df.drop(columns=['name']).values).to(torch.float)
print("User features: ", user_feat.shape)

   (no genres listed)  Action  Adventure  Animation  Children  Comedy  Crime  \
1                   0      37         27         10        18      65     26   
2                   0       0          0          0         0       0      0   
3                   0       0          0          0         0       0      0   
4                   0      39         21          5        12      69     26   
5                   0      13          6          3         5      14      2   

   Documentary  Drama  Fantasy  ...  Horror  IMAX  Musical  Mystery  Romance  \
1            6    107       13  ...      30     0        8       14       40   
2            0      0        0  ...       0     0        0        0        0   
3            0      0        0  ...       0     0        0        0        0   
4            4    110       13  ...      24     0        5       14       47   
5            1     17        3  ...       4     0        1        1        6   

   Sci-Fi  Thriller  War  Western     

In [13]:
# Create mappings due to indexing problems
unique_user_id = ratings_df.userId.unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedId': pd.RangeIndex(len(unique_user_id))
})
print(unique_user_id.head())

unique_movie_id = ratings_df.movieId.unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedId': pd.RangeIndex(len(unique_movie_id))
})
print(unique_movie_id.head())

   userId  mappedId
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4
   movieId  mappedId
0        1         0
1        3         1
2        6         2
3       47         3
4       50         4


In [14]:
ratings_user_id = pd.merge(ratings_df.userId, unique_user_id,
                           left_on='userId', right_on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id.mappedId.values)
ratings_movie_id = pd.merge(ratings_df.movieId, unique_movie_id,
                            left_on='movieId', right_on='movieId')
ratings_movie_id = torch.from_numpy(ratings_movie_id.mappedId.values)

# ratings_user_id = 1st endpoint of ratings, ratings_movie_id = 2nd endpoint 
print(ratings_user_id, ratings_user_id.shape)
print(ratings_movie_id, ratings_movie_id.shape)
# TODO: unsuer why the mappings are necessary

tensor([  0,   0,   0,  ..., 609, 609, 609]) torch.Size([100836])
tensor([   0,    1,    2,  ..., 3121, 1392, 2873]) torch.Size([100836])


In [15]:
edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
print(edge_index_user_to_movie)

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 3121, 1392, 2873]])


In [16]:
# Create data object
data = HeteroData()

# Save node indices
data["user"].node_id = torch.arange(len(unique_user_id))
data["movie"].node_id = torch.arange(len(unique_movie_id))

# Add node features
data["movie"].x = movie_feat
data["user"].x = user_feat

# Save edge indices
data["user", "rates", "movie"].edge_index = edge_index_user_to_movie # has shape (2, num_edges)

# Add edge features (timestamp normalized)
data["user", "rates", "movie"].edge_attr = torch.from_numpy(ratings_df.timestamp.values).to(torch.float) / ratings_df.timestamp.min()

# Add edge labels (regression)
data["user", "rates", "movie"].edge_label  = torch.from_numpy(ratings_df.rating.values).to(torch.float)

# Add reverse edge
data = T.ToUndirected()(data)

In [None]:
# # Skipping train, val, test split -> we will have multiple graphs
# transform = T.RandomLinkSplit(
#     num_val=0.1,
#     num_test=0.1,
#     disjoint_train_ratio=0.3,
#     neg_sampling_ratio=1,
#     add_negative_train_samples=False,
#     edge_types=("user","rates","movie"),
#     rev_edge_types=("movie","rates","user")
# )
# train_data, val_data, test_data = transform(data)

In [243]:
# Minibatch loader
edge_index = data["user", "rates", "movie"].edge_index
edge_label = data["user", "rates", "movie"].edge_label

train_loader = LinkNeighborLoader(
    data=data,
    num_neighbors=[10, 10],    # sample at most 10 neighbors in each hop
    neg_sampling_ratio=1,
    edge_label_index = (("user","rates","movie"), edge_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=True
)

""" changed from edge_label_index to edge_index -> because we are using the whole graph for training,
no difference between edges used for supervision vs message passing """

' changed from edge_label_index to edge_index -> because we are using the whole graph for training,\nno difference between edges used for supervision vs message passing '

In [279]:
# Create GNN model
class GNN(nn.Module):
    def __init__(self,
                 hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
    def forward(self,
                x           : torch.Tensor,
                edge_index  : torch.Tensor) -> torch.Tensor:
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

# Final classifier applies dot-product to source and destination
class Regressor(nn.Module):
    def forward(self,
                x_user      : torch.Tensor,
                x_movie     : torch.Tensor,
                edge_label_index: torch.Tensor) -> torch.Tensor:

        # Convert node embeddings to edge-level representations
        src, dest = edge_label_index
        
        # Apply dot product
        return (x_user[src] * x_movie[dest]).sum(dim=-1)
    
class Model(nn.Module):
    def __init__(self,
                 hidden_channels):
        super().__init__()
        
        # Create embeddings
        self.movie_embed = nn.Linear(20, hidden_channels)
        self.user_embed  = nn.Linear(20, hidden_channels)

        # GNN
        self.gnn = GNN(hidden_channels)
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())
        self.regressor = Regressor()
    
    def forward(self,
                data: HeteroData) -> torch.Tensor:
        # x_dict holds feature matrices of nodes
        # edge_index_dict holdes edge indices
        x_dict = {
            "user": self.user_embed(data['user'].x),
            'movie': self.movie_embed(data['movie'].x)
        }
        # print(x_dict['user'][0])

        # Message passing
        x_dict = self.gnn(x_dict, data.edge_index_dict)

        # print(x_dict['user'][0])
        # print(x_dict['user'].shape)
        # print(x_dict['movie'].shape)
        
        # Get edge predictions
        pred = self.regressor(
            x_dict['user'],
            x_dict['movie'],
            data['user','rates','movie'].edge_index
        )

        return pred

In [280]:
model = Model(hidden_channels=64)

In [281]:
test = model(data)
test.shape

torch.Size([100836])

In [275]:
610 * 9742

5942620

## Edge regression

In [None]:
self.mlp = nn.Sequential(
    nn.Linear(2 * n_features + n_edge_features, hiddens),
    nn.ReLU(),
    nn.Linear(hiddens, n_targets)
)

row, col = edge_index
new_edge_attr = self.mlp(torch.cat([x[row], x[col], edge_attr], dim=-1))