# Practicing Link Prediction on Heterogneous Graphs with PyG
source: https://medium.com/@pytorch_geometric/link-prediction-on-heterogeneous-graphs-with-pyg-6d5c29677c70

In [1]:
from torch_geometric.data import download_url, extract_zip, HeteroData
import pandas as pd
import torch
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F

## Dataset

### Downloading the dataset
in the `datasets/` folder under the current directory

In [2]:
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
extract_zip(download_url(url, 'datasets'), 'datasets')

movies_path = 'datasets/ml-latest-small/movies.csv'
ratings_path = 'datasets/ml-latest-small/ratings.csv'

Using existing file ml-latest-small.zip
Extracting datasets\ml-latest-small.zip


### Exploring the dataset

In [3]:
movies_df = pd.read_csv(movies_path)
ratings_df = pd.read_csv(ratings_path)

In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
movies_df.shape, ratings_df.shape

((9742, 3), (100836, 4))

## Preprocessing

In [7]:
# Split genres and convert into indicator variables:
genres = movies_df['genres'].str.get_dummies('|') # did not know it was possible to do this
# here creating a feature matrix for the movies using the genres
genres.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# Use genres as movie input features:
movie_feat = torch.from_numpy(genres.values).to(torch.float)
# converting the features df to tensor
movie_feat.shape

torch.Size([9742, 20])

In [9]:
movie_feat # 9742 movies, 20 genres, 1 if movie has genre, 0 if not, each row is a movie, each column is a genre

tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [10]:
# mapping user id to consecutive unique ids
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedID': pd.RangeIndex(len(unique_user_id)),
})
unique_user_id.head()

Unnamed: 0,userId,mappedID
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [11]:
unique_user_id.shape

(610, 2)

In [12]:
# mapping movie id to consecutive unique ids
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedID': pd.RangeIndex(len(unique_movie_id)),
})
unique_movie_id.head()

Unnamed: 0,movieId,mappedID
0,1,0
1,3,1
2,6,2
3,47,3
4,50,4


In [13]:
unique_movie_id.shape

(9724, 2)

In [14]:
# merging the ratings df with the unique user id df in order to get the mapped id as the first column of the edge index
ratings_user_id_df = pd.merge(ratings_df['userId'], unique_user_id,
                            left_on='userId', right_on='userId', how='left')

# creating the first column of the edge index, which is the user id
ratings_user_id = torch.from_numpy(ratings_user_id_df['mappedID'].values)
ratings_user_id

tensor([  0,   0,   0,  ..., 609, 609, 609])

In [15]:
ratings_user_id_df.head()

Unnamed: 0,userId,mappedID
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [16]:
# merging the ratings df with the unique movie id df in order to get the mapped id as the second column of the edge index
ratings_movie_id_df = pd.merge(ratings_df['movieId'], unique_movie_id,
                            left_on='movieId', right_on='movieId', how='left')
# creating the second column of the edge index, which is the movie id
ratings_movie_id = torch.from_numpy(ratings_movie_id_df['mappedID'].values)
ratings_movie_id

tensor([   0,    1,    2,  ..., 3121, 1392, 2873])

In [17]:
ratings_movie_id_df.head()

Unnamed: 0,movieId,mappedID
0,1,0
1,3,1
2,6,2
3,47,3
4,50,4


In [18]:
# creating the edge index
edge_index = torch.stack([
        ratings_user_id, ratings_movie_id
], dim=0)
edge_index

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 3121, 1392, 2873]])

### Creating the heterogeneous graph

In [19]:
data = HeteroData()

data["user"].node_id = torch.arange(len(unique_user_id))
data["movie"].node_id = torch.arange(len(unique_movie_id))

In [20]:
torch.arange(len(unique_user_id)) == torch.from_numpy(unique_user_id['mappedID'].values) # not sure, but possibly `arrange` is the faster way to do this

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, Tr

In [21]:
data["movie"].x = movie_feat
# NOTE: the `x` attribute is used to store the feature matrix of each node type
# and there's no feature matrix for the user nodes
data["movie"].x.shape

torch.Size([9742, 20])

In [22]:
data["user", "rates", "movie"].edge_index = edge_index

In [23]:
# `user` nodes only hase the node_id attribute, no features or other attributes
# `movie` nodes have the node_id attribute and a feature matrix
# `user` nodes are connected to `movie` nodes by edges with the `rates` relation

data

HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9724],
    x=[9742, 20],
  },
  (user, rates, movie)={ edge_index=[2, 100836] }
)

In [24]:
# transforming the directed graph into an undirected graph by introducing the reverse edges, `rev_rates` relation 
data = T.ToUndirected()(data)
data

HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9724],
    x=[9742, 20],
  },
  (user, rates, movie)={ edge_index=[2, 100836] },
  (movie, rev_rates, user)={ edge_index=[2, 100836] }
)

In [25]:
test_data = HeteroData()
test_data["yo", "gos", "bro"].edge_index = torch.randint(0, 10, (2, 5))
test_data

HeteroData(
  (yo, gos, bro)={ edge_index=[2, 5] }
)

In [26]:
T.ToUndirected()(test_data)

HeteroData(
  (yo, gos, bro)={ edge_index=[2, 5] },
  (bro, rev_gos, yo)={ edge_index=[2, 5] }
)

### Train-val-test split

In [27]:
transform = T.RandomLinkSplit(
    num_val = 0.1, # 10% of the edges will be used for validation
    num_test = 0.2, # 20% of the edges will be used for testing, the rest 70% of the edges will be used for training
    disjoint_train_ratio= 0.3, # i think 30% of the training edges will be for supervision during training and the rest 70% is for message passing
    # `edge_index` will hold the training edges and `edge_label_index` will hold the supervision edges of trainning data
    # not sure about the other two sets though
    # when `disjoint_train_ratio` is set to 0, all training edges will be used for supervision during training
    neg_sampling_ratio= 2.0, # 2 negative samples will be added for each positive sample
    add_negative_train_samples=False, # not going to add negative samples to the training set, is this not removing the effect
    # of `neg_sampling_ratio` parameter?
    edge_types=('user', 'rates', 'movie'), # defining the relation type"
    rev_edge_types=('movie', 'rev_rates', 'user'), # defining the reverse relation type
)

In [28]:
train_data, val_data, test_data = transform(data)
[train_data, val_data, test_data]
# train data set has 49411+21175=70586 edges, 49411 edges for supervision and 21175 edges for message passing
# all unique
# val data set has 70586 edges, 30249 of which has labels
# test data set has 80699 edges, 60501 of which has labels
# so how is that? 
# how can the test data set have more edges than the train data set?

[HeteroData(
   user={ node_id=[610] },
   movie={
     node_id=[9724],
     x=[9742, 20],
   },
   (user, rates, movie)={
     edge_index=[2, 49411],
     edge_label=[21175],
     edge_label_index=[2, 21175],
   },
   (movie, rev_rates, user)={ edge_index=[2, 49411] }
 ),
 HeteroData(
   user={ node_id=[610] },
   movie={
     node_id=[9724],
     x=[9742, 20],
   },
   (user, rates, movie)={
     edge_index=[2, 70586],
     edge_label=[30249],
     edge_label_index=[2, 30249],
   },
   (movie, rev_rates, user)={ edge_index=[2, 70586] }
 ),
 HeteroData(
   user={ node_id=[610] },
   movie={
     node_id=[9724],
     x=[9742, 20],
   },
   (user, rates, movie)={
     edge_index=[2, 80669],
     edge_label=[60501],
     edge_label_index=[2, 60501],
   },
   (movie, rev_rates, user)={ edge_index=[2, 80669] }
 )]

### Mini-batching

In [29]:
def linkLoaderWrapper(data: HeteroData):
    loader = LinkNeighborLoader(
        data = data,
        
        num_neighbors = [20, 10], # In the first hop, we sample at most 20 neighbors. In the second hop, we sample at most 10 neighbors. not sure yet what this means
        neg_sampling_ratio = 2.0, # 2 negative samples will be added for each positive sample 'on-the-fly'
        edge_label_index = (
            ("user", "rates", "movie"), 
            data["user", "rates", "movie"].edge_label_index
        ), # edges used to create the mini-batches / subgraphs
        edge_label= data["user", "rates", "movie"].edge_label,

        batch_size = 128,
        shuffle = True
    )
    return loader

In [30]:
train_loader = linkLoaderWrapper(train_data)
train_loader

LinkNeighborLoader()

In [31]:
sampled_data = next(iter(train_loader))
sampled_data

HeteroData(
  user={
    node_id=[607],
    n_id=[607],
  },
  movie={
    node_id=[9724],
    x=[2783, 20],
    n_id=[2783],
  },
  (user, rates, movie)={
    edge_index=[2, 16887],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[16887],
    input_id=[128],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 7689],
    e_id=[7689],
  }
)

In [32]:
sampled_data = next(iter(train_loader))
sampled_data

HeteroData(
  user={
    node_id=[608],
    n_id=[608],
  },
  movie={
    node_id=[9724],
    x=[2732, 20],
    n_id=[2732],
  },
  (user, rates, movie)={
    edge_index=[2, 16745],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[16745],
    input_id=[128],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 7548],
    e_id=[7548],
  }
)

In [33]:
sampled_data = next(iter(train_loader))
sampled_data

HeteroData(
  user={
    node_id=[608],
    n_id=[608],
  },
  movie={
    node_id=[9724],
    x=[2738, 20],
    n_id=[2738],
  },
  (user, rates, movie)={
    edge_index=[2, 16673],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[16673],
    input_id=[128],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 7717],
    e_id=[7717],
  }
)

In [34]:
sampled_data["user"].node_id.shape, sampled_data["user"].num_nodes

(torch.Size([608]), 608)

In [35]:
sampled_data.edge_index_dict

{('user',
  'rates',
  'movie'): tensor([[ 282,  104,  114,  ...,  256,  281,  281],
         [   0,    0,    0,  ..., 2318, 2318, 2319]]),
 ('movie',
  'rev_rates',
  'user'): tensor([[ 375,  376,  377,  ..., 2124,  207,  602],
         [   0,    0,    0,  ...,  536,  536,  536]])}

In [36]:
sampled_data.x_dict

{'movie': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 1., 0., 0.]])}

## Model

In [37]:
class GNN(torch.nn.Module): # encoder
    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv((hidden_channels, hidden_channels), hidden_channels)
        
    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2((x, x), edge_index)
        return x

In [38]:
class Classifier(torch.nn.Module): # decoder
    def forward(self, x_user: torch.Tensor, x_movie: torch.Tensor, edge_label_index: torch.Tensor):
        edge_feat_user = x_user[edge_label_index[0]] # selecting the features of the nodes only in the subgraph defined by the edge_label_index (supervision set)
        edge_feat_movie = x_movie[edge_label_index[1]]
        return (edge_feat_user * edge_feat_movie).sum(dim=-1) # computing the dot product of the features of the nodes in the subgraph, sum(subject_nodes * object_nodes)

In [39]:
class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()

        self.movie_lin = torch.nn.Linear(20, hidden_channels)

        # as dataset is not very feature rich, we generate (learn) two embeddings for each node type
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels) # generating random(!?) embeddings for the user nodes
        self.movie_emb = torch.nn.Embedding(data["movie"].num_nodes, hidden_channels)

        self.gnn = GNN(hidden_channels) # encoder
        self.gnn = to_hetero(self.gnn, data.metadata()) # converting the GNN to a heterogenous GNN!
        self.classifier = Classifier() # decoder
    
    
    def forward(self, data: HeteroData) -> torch.Tensor:
        # `x_dict` holds the feature matrix of all node type
        # the `x_dict` with the `data` does not have anything for the `user` node type, as the `user` nodes do not have any features
        # so we are building an `x_dict` with the features of the `movie` nodes and the embeddings of the `user` nodes
        x_dict = {
            "user": self.user_emb(data["user"].node_id), # selecting the embeddings of the user nodes, are we not selecting all the embeddings?
            "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id), # concatenating the (trained) features of the movies with the new embeddings of the movies
            # so we are learning embeddings for the movies twice?
        }

        x_dict = self.gnn(x_dict, data.edge_index_dict)
        # `edge_index_dict` holds the edge index of all relation types

        pred = self.classifier(x_dict["user"], x_dict["movie"], data["user", "rates", "movie"].edge_label_index)

        return pred

In [40]:
test_emb = torch.nn.Embedding(data["user"].num_nodes, 10)
test_emb

Embedding(610, 10)

In [41]:
test_emb.weight.shape

torch.Size([610, 10])

In [42]:
data.metadata() # metadata of the heterogenous graph
# metadata[0] holds the node types
# metadata[1] holds the edge types

(['user', 'movie'],
 [('user', 'rates', 'movie'), ('movie', 'rev_rates', 'user')])

this whole thing's quite confusing, specially the namings \
it seems like sometimes same thing can be accessed by different names \
or different things can be accessed by names quite similar to each other \
hella confusing, at least so far

examples: \
`edge_index` and `edge_label_index` \
`data["user", "rates", "movie"].edge_label_index` and `data.edge_index_dict[("user", "rates", "movie")]` \
etc.

## Trainning

In [43]:
train_loader = linkLoaderWrapper(train_data)