# Practicing Link Prediction on Heterogeneous Graphs with PyG - Take 2
source: https://medium.com/@pytorch_geometric/link-prediction-on-heterogeneous-graphs-with-pyg-6d5c29677c70 \
companion notebooks: 
1. https://colab.research.google.com/drive/1r_FWLSFf9iL0OWeHeD31d_Opt031P1Nq
2. https://colab.research.google.com/drive/1xpzn1Nvai1ygd_P5Yambc_oe4VBPK_ZT [some values need to be filled in]

Could not figure out what's wrong with take 1, so trying take 2

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
import tqdm
from sklearn.metrics import roc_auc_score
from torch_geometric.data import download_url, extract_zip, HeteroData
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero

## Dataset

### Gathering data

In [2]:
# MovieLens "latest small" archive hosted by GroupLens.
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

# Download the archive into the local `datasets` folder, then unpack it there.
zip_path = download_url(url, 'datasets')
extract_zip(zip_path, 'datasets')

# CSV files read by the cells below.
movies_path = 'datasets/ml-latest-small/movies.csv'
ratings_path = 'datasets/ml-latest-small/ratings.csv'

Using existing file ml-latest-small.zip
Extracting datasets\ml-latest-small.zip


In [3]:
# Load the movie table and the ratings table into DataFrames.
movies_df, ratings_df = (
    pd.read_csv(csv_path) for csv_path in (movies_path, ratings_path)
)

In [4]:
# Sanity check of the movie table: (rows, columns) followed by the first rows.
print(movies_df.shape)
movies_df.head()

(9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Sanity check of the ratings table: (rows, columns) followed by the first rows.
print(ratings_df.shape)
ratings_df.head()

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Preprocessing

### Generating features

In [6]:
# Split the '|'-separated genre strings into one 0/1 indicator column per genre.
genres = movies_df['genres'].str.get_dummies('|')
print(genres.shape)
genres.head()

(9742, 20)


Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Turn the genre indicator table into a float tensor (one row per movie, one column per genre).
movie_feats = torch.from_numpy(genres.values).to(torch.float)
print(movie_feats.shape)
movie_feats

torch.Size([9742, 20])


tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [8]:
def generate_movie_feats(movies_df: pd.DataFrame) -> torch.Tensor:
    """Encode each movie's genres as a multi-hot float feature vector.

    Args:
        movies_df: movie table with a ``genres`` column holding
            ``'|'``-separated genre names.

    Returns:
        A ``torch.float`` tensor with one row per movie and one 0/1 column
        per distinct genre (columns in the order produced by
        ``Series.str.get_dummies``).
    """
    genre_indicators = movies_df['genres'].str.get_dummies(sep='|')
    genre_matrix = genre_indicators.to_numpy()
    return torch.from_numpy(genre_matrix).to(torch.float)

### Generating edges

In [9]:
# Dict form of the raw movieId -> contiguous index mapping (order of first appearance in movies_df).
# Only the exploratory `ratings_df.replace(...)` cell further down reads it.
movie_ids_to_idx_mp = {
    movie_id: idx for idx, movie_id in enumerate(movies_df['movieId'].unique())
}

In [10]:
# Table form of the mapping: raw movieId -> contiguous movie index (0..n-1, order of first appearance).
unique_movie_ids = movies_df['movieId'].unique()
movie_ids_to_idx_df = pd.DataFrame(
    data={
        'movieId': unique_movie_ids,
        'movie_idx': np.arange(unique_movie_ids.size),
    }
)
print(movie_ids_to_idx_df.shape)
movie_ids_to_idx_df.head()

(9742, 2)


Unnamed: 0,movieId,movie_idx
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [11]:
# Dict form of the raw userId -> contiguous index mapping (order of first appearance in ratings_df).
# Only the exploratory `ratings_df.replace(...)` cell further down reads it.
user_ids_to_idx_mp = {
    user_id: idx for idx, user_id in enumerate(ratings_df['userId'].unique())
}

In [12]:
# Table form of the mapping: raw userId -> contiguous user index (0..n-1, order of first appearance).
unique_user_ids = ratings_df['userId'].unique()
user_ids_to_idx_df = pd.DataFrame(
    data={
        'userId': unique_user_ids,
        'user_idx': np.arange(unique_user_ids.size),
    }
)
print(user_ids_to_idx_df.shape)
user_ids_to_idx_df.head()

(610, 2)


Unnamed: 0,userId,user_idx
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [13]:
# Exploration only: preview the ratings with raw ids swapped for contiguous indices.
# With inplace=False and no assignment, the result is displayed and then discarded;
# ratings_df itself is left unchanged.
ratings_df.replace({'movieId': movie_ids_to_idx_mp, 'userId': user_ids_to_idx_mp}, inplace=False) # not using this for now

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,964982703
1,0,2,4.0,964981247
2,0,5,4.0,964982224
3,0,43,5.0,964983815
4,0,46,5.0,964982931
...,...,...,...,...
100831,609,9434,4.0,1493848402
100832,609,9461,5.0,1493850091
100833,609,9462,5.0,1494273047
100834,609,9463,5.0,1493846352


In [14]:
# Attach the contiguous user / movie indices to every rating (left joins keep every rating row).
rated_pairs = (
    ratings_df
    .merge(user_ids_to_idx_df, on='userId', how='left')
    .merge(movie_ids_to_idx_df, on='movieId', how='left')
    [['user_idx', 'movie_idx']]
    .values
)
# Edge index of shape [2, num_ratings]: row 0 holds user indices, row 1 movie indices.
user_rates_movies = torch.from_numpy(rated_pairs).t().to(torch.long).contiguous()
print(user_rates_movies.shape)
user_rates_movies

torch.Size([2, 100836])


tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    2,    5,  ..., 9462, 9463, 9503]])

In [15]:
def generate_edges(movies_df: pd.DataFrame, ratings_df: pd.DataFrame) -> tuple[torch.Tensor, np.ndarray, np.ndarray]:
    """Build the user -> movie edge index from the ratings table.

    Raw ids are mapped to contiguous indices in order of first appearance:
    movie indices follow ``movies_df['movieId']`` and user indices follow
    ``ratings_df['userId']``.

    Args:
        movies_df: movie table with a ``movieId`` column.
        ratings_df: ratings table with ``userId`` and ``movieId`` columns;
            every row becomes one edge.

    Returns:
        ``(edge_index, unique_user_ids, unique_movie_ids)`` where
        ``edge_index`` is a contiguous ``torch.long`` tensor of shape
        ``[2, len(ratings_df)]`` (row 0: user indices, row 1: movie indices)
        and the two arrays hold the raw ids in index order.

    Raises:
        ValueError: if a rating references a ``movieId`` that is not present
            in ``movies_df``. Without this check the missing index would be
            cast to an arbitrary integer and corrupt the graph silently.
    """
    unique_movie_ids = movies_df['movieId'].unique()
    unique_user_ids = ratings_df['userId'].unique()

    # get_indexer gives the position of every id inside the unique-id array,
    # or -1 when the id is unknown.
    movie_idx = pd.Index(unique_movie_ids).get_indexer(ratings_df['movieId'].to_numpy())
    user_idx = pd.Index(unique_user_ids).get_indexer(ratings_df['userId'].to_numpy())

    unmapped = movie_idx < 0
    if unmapped.any():
        missing = pd.unique(ratings_df['movieId'].to_numpy()[unmapped])
        raise ValueError(
            f"{int(unmapped.sum())} rating(s) reference movieId values missing from movies_df, "
            f"e.g. {missing[:5].tolist()}"
        )

    edge_index = torch.from_numpy(np.stack([user_idx, movie_idx])).to(torch.long).contiguous()
    return edge_index, unique_user_ids, unique_movie_ids

### Building Graph

In [16]:
# Assemble the heterogeneous graph step by step from the pieces computed above.
data = HeteroData()

# Users have no input features; node_id enumerates them (Model.forward reads it for the embedding lookup).
data['user'].node_id = torch.arange(len(unique_user_ids))

# Movies get both an enumerating node_id and the genre feature matrix.
data['movie'].node_id = torch.arange(len(unique_movie_ids))
data['movie'].x = movie_feats


# One ('user', 'rates', 'movie') edge per rating row.
data['user', 'rates', 'movie'].edge_index = user_rates_movies

data

HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9742],
    x=[9742, 20],
  },
  (user, rates, movie)={ edge_index=[2, 100836] }
)

In [17]:
# Adds the reverse ('movie', 'rev_rates', 'user') edge type, as the output below shows.
data = T.ToUndirected()(data)
data

HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9742],
    x=[9742, 20],
  },
  (user, rates, movie)={ edge_index=[2, 100836] },
  (movie, rev_rates, user)={ edge_index=[2, 100836] }
)

In [18]:
def build_graph(user_num_nodes: int, movie_num_nodes: int, edge_index: torch.Tensor, movie_feats: torch.Tensor) -> HeteroData:
    """Assemble the user/movie heterogeneous graph.

    Args:
        user_num_nodes: number of user nodes; stored as ``node_id = arange(n)``.
        movie_num_nodes: number of movie nodes; stored as ``node_id = arange(n)``.
        edge_index: edge index assigned to the ``('user', 'rates', 'movie')`` edge type.
        movie_feats: feature matrix assigned to ``graph['movie'].x``.

    Returns:
        The graph after applying ``T.ToUndirected()``.
    """
    hetero = HeteroData()

    # Node stores: users only get ids, movies get ids plus features.
    hetero['user'].node_id = torch.arange(user_num_nodes)
    hetero['movie'].node_id = torch.arange(movie_num_nodes)
    hetero['movie'].x = movie_feats

    # Edge store for the rating relation.
    hetero['user', 'rates', 'movie'].edge_index = edge_index

    to_undirected = T.ToUndirected()
    return to_undirected(hetero)

In [19]:
def prepare_data(movies_df: pd.DataFrame, ratings_df: pd.DataFrame) -> HeteroData:
    """Run the full preprocessing pipeline: features, edges, then graph assembly.

    Args:
        movies_df: movie table passed to ``generate_movie_feats`` and ``generate_edges``.
        ratings_df: ratings table passed to ``generate_edges``.

    Returns:
        The graph produced by ``build_graph``.
    """
    features = generate_movie_feats(movies_df)
    edges, user_ids, movie_ids = generate_edges(movies_df, ratings_df)
    return build_graph(
        user_num_nodes=len(user_ids),
        movie_num_nodes=len(movie_ids),
        edge_index=edges,
        movie_feats=features,
    )


In [20]:
# Rebuild the graph through the helper pipeline; this replaces the `data` assembled step by step above.
data = prepare_data(movies_df, ratings_df)
data

HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9742],
    x=[9742, 20],
  },
  (user, rates, movie)={ edge_index=[2, 100836] },
  (movie, rev_rates, user)={ edge_index=[2, 100836] }
)

## Train-val-test split

In [21]:
# Edge-level train/val/test split of the ('user', 'rates', 'movie') edges.
# The numbers quoted below refer to the split output printed by the next cell.
transform = T.RandomLinkSplit(
    num_val=0.1,  # share of edges held out for validation
    num_test=0.1,  # share of edges held out for testing
    disjoint_train_ratio=0.3,  # 30% of training edges become supervision-only labels (24201 of 80670)
    neg_sampling_ratio=2.0,  # val/test labels hold 2 negatives per positive (30249 = 3 x 10083)
    add_negative_train_samples=False,  # train labels stay positives only; the training loader samples negatives
    edge_types=("user", "rates", "movie"),  # edge type that is split
    rev_edge_types=("movie", "rev_rates", "user"),  # its reverse edge type; presumably kept in sync with the split - confirm in PyG docs
)
transform

RandomLinkSplit(num_val=0.1, num_test=0.1)

In [22]:
# Apply the split; each result is a full HeteroData whose ('user', 'rates', 'movie') store
# additionally carries edge_label / edge_label_index (see the output below).
train_data, val_data, test_data = transform(data)
train_data, val_data, test_data

(HeteroData(
   user={ node_id=[610] },
   movie={
     node_id=[9742],
     x=[9742, 20],
   },
   (user, rates, movie)={
     edge_index=[2, 56469],
     edge_label=[24201],
     edge_label_index=[2, 24201],
   },
   (movie, rev_rates, user)={ edge_index=[2, 56469] }
 ),
 HeteroData(
   user={ node_id=[610] },
   movie={
     node_id=[9742],
     x=[9742, 20],
   },
   (user, rates, movie)={
     edge_index=[2, 80670],
     edge_label=[30249],
     edge_label_index=[2, 30249],
   },
   (movie, rev_rates, user)={ edge_index=[2, 80670] }
 ),
 HeteroData(
   user={ node_id=[610] },
   movie={
     node_id=[9742],
     x=[9742, 20],
   },
   (user, rates, movie)={
     edge_index=[2, 90753],
     edge_label=[30249],
     edge_label_index=[2, 30249],
   },
   (movie, rev_rates, user)={ edge_index=[2, 90753] }
 ))

## Defining batches

In [23]:
def LinkNeighborLoaderWrapper(
    data: HeteroData,
    num_neighbors: tuple[int, ...] = (20, 10),
    neg_sampling_ratio: float = 2.0,
    batch_size: int = 128,
    shuffle: bool = True,
) -> LinkNeighborLoader:
    """Create a mini-batch loader over the ``('user', 'rates', 'movie')`` supervision edges.

    The defaults reproduce the training configuration used in this notebook,
    so ``LinkNeighborLoaderWrapper(train_data)`` behaves as before. The extra
    parameters let the same helper build evaluation loaders, e.g.
    ``neg_sampling_ratio=0.0, shuffle=False`` for splits whose negatives were
    already generated by the link split.

    Args:
        data: graph whose ``('user', 'rates', 'movie')`` store provides
            ``edge_label_index`` and ``edge_label``.
        num_neighbors: neighbours sampled per hop; passed on as a list.
        neg_sampling_ratio: negatives sampled on the fly per positive
            supervision edge (``0.0`` disables on-the-fly sampling).
        batch_size: number of supervision edges per batch before negatives
            are added.
        shuffle: whether the supervision edges are reshuffled every epoch.

    Returns:
        The configured ``LinkNeighborLoader``.
    """
    supervision = data['user', 'rates', 'movie']

    return LinkNeighborLoader(
        data=data,
        num_neighbors=list(num_neighbors),
        neg_sampling_ratio=neg_sampling_ratio,
        edge_label_index=(("user", "rates", "movie"), supervision.edge_label_index),
        edge_label=supervision.edge_label,
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [24]:
# Throwaway loader over the training split, used only to inspect one sampled batch below.
loader = LinkNeighborLoaderWrapper(train_data)
loader

LinkNeighborLoader()

In [25]:
# Draw one mini-batch to inspect its structure: edge_label has 384 entries for 128 seed edges
# (input_id), i.e. the positives plus the negatives sampled at ratio 2.0.
sampled_data = next(iter(loader))
sampled_data

HeteroData(
  user={
    node_id=[608],
    n_id=[608],
  },
  movie={
    node_id=[2779],
    x=[2779, 20],
    n_id=[2779],
  },
  (user, rates, movie)={
    edge_index=[2, 16734],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[16734],
    input_id=[128],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 7707],
    e_id=[7707],
  }
)

## Model

In [26]:
# copy-pasted

class GNN(torch.nn.Module):
    """Two-layer GraphSAGE encoder with a ReLU between the layers.

    Written for a homogeneous graph; ``Model`` converts it to a heterogeneous
    variant with ``to_hetero``.
    """

    def __init__(self, hidden_channels):
        """Create both SAGEConv layers, each mapping ``hidden_channels`` to ``hidden_channels``."""
        super().__init__()

        self.conv1 = SAGEConv(in_channels=hidden_channels, out_channels=hidden_channels)
        self.conv2 = SAGEConv(in_channels=hidden_channels, out_channels=hidden_channels)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Return node embeddings after two rounds of message passing."""
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)

class Classifier(torch.nn.Module):
    """Dot-product decoder: scores an edge by the inner product of its endpoint embeddings."""

    def forward(self, x_user: torch.Tensor, x_movie: torch.Tensor, edge_label_index: torch.Tensor) -> torch.Tensor:
        """Score every supervision edge.

        Args:
            x_user: user embeddings, indexed by row 0 of ``edge_label_index``.
            x_movie: movie embeddings, indexed by row 1 of ``edge_label_index``.
            edge_label_index: pairs of (user row, movie row) to score.

        Returns:
            One raw score (logit) per column of ``edge_label_index``.
        """
        user_rows = edge_label_index[0]
        movie_rows = edge_label_index[1]

        # Gather the endpoint embeddings, multiply element-wise and reduce
        # over the feature dimension.
        return torch.sum(x_user[user_rows] * x_movie[movie_rows], dim=-1)


class Model(torch.nn.Module):
    """Link-prediction model: learned node embeddings -> heterogeneous GraphSAGE -> dot-product scorer."""

    def __init__(self, hidden_channels, graph=None):
        """Build the embedding tables, the heterogeneous GNN and the classifier.

        Args:
            hidden_channels: embedding / hidden size used throughout the model.
            graph: ``HeteroData`` that defines the number of user and movie
                nodes, the movie feature width and the graph metadata. When
                omitted, the notebook-level ``data`` graph is used, so
                existing ``Model(hidden_channels=...)`` calls keep working.
        """
        super().__init__()
        if graph is None:
            # Backward-compatible fallback to the graph built earlier in the notebook.
            graph = data

        # Input width of the movie projection comes from the feature matrix
        # rather than a hard-coded genre count.
        movie_feat_dim = graph["movie"].x.size(-1)

        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for users and movies:
        self.movie_lin = torch.nn.Linear(movie_feat_dim, hidden_channels)
        self.user_emb = torch.nn.Embedding(graph["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(graph["movie"].num_nodes, hidden_channels)

        # Instantiate the homogeneous GNN and convert it into a heterogeneous variant:
        self.gnn = to_hetero(GNN(hidden_channels), metadata=graph.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> torch.Tensor:
        """Return one raw score (logit) per entry of the batch's ``edge_label_index``.

        Args:
            data: (sub)graph providing ``node_id`` for both node types,
                ``x`` for movies, ``edge_index_dict`` and the
                ``('user', 'rates', 'movie')`` ``edge_label_index``.
        """
        # Users: learned embedding only. Movies: projected genre features plus learned embedding.
        x_dict = {
            "user": self.user_emb(data["user"].node_id),
            "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
        }

        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)

        return self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )

## Training

In [27]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [28]:
# Loader over the training split that feeds the training loop below.
train_loader = LinkNeighborLoaderWrapper(train_data)
train_loader

LinkNeighborLoader()

In [29]:
# Instantiate the model with a hidden/embedding size of 64 and move it to the selected device.
model = Model(hidden_channels=64)
model = model.to(device)
model

  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,


Model(
  (movie_lin): Linear(in_features=20, out_features=64, bias=True)
  (user_emb): Embedding(610, 64)
  (movie_emb): Embedding(9742, 64)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
  )
  (classifier): Classifier()
)

In [30]:
# Training loop (adapted from the PyG tutorial): Adam on binary cross-entropy
# over the sampled supervision edges.

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 6):
    # Make the mode explicit so re-running this cell after an evaluation cell still trains correctly.
    model.train()
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()

        # Keep the returned object instead of relying on `.to()` mutating the batch in place.
        sampled_data = sampled_data.to(device)
        pred = model(sampled_data)

        ground_truth = sampled_data["user", "rates", "movie"].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)

        loss.backward()
        optimizer.step()
        # Weight each batch loss by its number of supervision edges so the
        # epoch figure is a per-edge mean.
        total_loss += loss.item() * pred.numel()
        total_examples += pred.numel()
    # max(..., 1) avoids a ZeroDivisionError if the loader yields no batches.
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / max(total_examples, 1):.4f}")

100%|██████████| 190/190 [00:19<00:00,  9.76it/s]


Epoch: 001, Loss: 0.4379


100%|██████████| 190/190 [00:20<00:00,  9.42it/s]


Epoch: 002, Loss: 0.3508


100%|██████████| 190/190 [00:19<00:00,  9.71it/s]


Epoch: 003, Loss: 0.3298


100%|██████████| 190/190 [00:19<00:00,  9.54it/s]


Epoch: 004, Loss: 0.3143


100%|██████████| 190/190 [00:18<00:00, 10.08it/s]

Epoch: 005, Loss: 0.3037





## Evaluation

In [31]:
# Validation loader (adapted from the PyG tutorial). No negative sampling is
# requested here: the validation labels, negatives included, were already
# produced by RandomLinkSplit.
val_edges = val_data["user", "rates", "movie"]
edge_label_index = val_edges.edge_label_index
edge_label = val_edges.edge_label

# Three times the training batch size of 128.
val_batch_size = 3 * 128

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=val_batch_size,
    shuffle=False,
)

sampled_data = next(iter(val_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

# Sanity checks on the first batch: full batch size and binary labels.
sampled_edges = sampled_data["user", "rates", "movie"]
assert sampled_edges.edge_label_index.size(1) == val_batch_size
assert sampled_edges.edge_label.min() >= 0
assert sampled_edges.edge_label.max() <= 1

Sampled mini-batch:
HeteroData(
  user={
    node_id=[609],
    n_id=[609],
  },
  movie={
    node_id=[2709],
    x=[2709, 20],
    n_id=[2709],
  },
  (user, rates, movie)={
    edge_index=[2, 19067],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[19067],
    input_id=[384],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 7961],
    e_id=[7961],
  }
)


In [32]:
# Score every validation supervision edge and report the ROC-AUC.
# `roc_auc_score` is imported in the notebook's import cell.

# Switch to inference mode; none of the layers used here changes behaviour with it,
# but the evaluation stays correct if dropout or normalisation is added later.
model.eval()

preds = []
ground_truths = []
# A single no_grad context around the whole loop instead of re-entering it per batch.
with torch.no_grad():
    for sampled_data in tqdm.tqdm(val_loader):
        sampled_data = sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user", "rates", "movie"].edge_label)

pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
print()
print(f"Validation AUC: {auc:.4f}")

100%|██████████| 79/79 [00:06<00:00, 12.19it/s]


Validation AUC: 0.9286



