In [1]:
import os
import numpy as np
import pandas as pd
import modin.pandas as mpd
import random

from tqdm import tqdm
import torch
from torch import Tensor

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from torch_geometric.loader import LinkNeighborLoader

from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

from torch_geometric.nn import GATConv, to_hetero
import torch.nn.functional as F

In [2]:
savefd = "model/"
os.makedirs(savefd, exist_ok=True)

In [3]:
playlist_song = mpd.DataFrame(pd.read_parquet("data/gnn_playlists2songs.parquet"))
song_artist = mpd.DataFrame(pd.read_parquet("data/gnn_songs2artists.parquet"))
playlist_dj = mpd.DataFrame(pd.read_parquet("data/gnn_playlists2djs.parquet"))

2025-03-12 06:14:44,946	INFO worker.py:1841 -- Started a local Ray instance.


In [4]:
unique_playlist_id = np.unique(playlist_song['playlist_id'].values)
unique_playlist_id = mpd.DataFrame(data={
    'playlist_id': unique_playlist_id,
    'mappedID': pd.RangeIndex(len(unique_playlist_id)),
})
unique_song_id = np.unique(playlist_song['song_id'].values)
unique_song_id = mpd.DataFrame(data={
    'song_id': unique_song_id,
    'mappedID': pd.RangeIndex(len(unique_song_id)),
})
unique_artist = np.unique(song_artist['artist_id'].values)
unique_artist = mpd.DataFrame(data={
    'artist': unique_artist,
    'mappedID': pd.RangeIndex(len(unique_artist)),
})
unique_dj = np.unique(playlist_dj['dj_id'].values)
unique_dj = mpd.DataFrame(data={
    'dj': unique_dj,
    'mappedID': pd.RangeIndex(len(unique_dj)),
})

In [5]:
def make_edge(edge_df, u0, u1):
    edge_df.drop_duplicates(inplace=True)
    ekey0, ekey1 = list(edge_df.columns)
    ukey0 = u0.columns[0]
    ukey1 = u1.columns[0]
    temp0 = mpd.merge(edge_df[ekey0], u0, left_on=ekey0, right_on=ukey0, how='left')
    temp0 = torch.from_numpy(temp0['mappedID'].values)
    temp1 = mpd.merge(edge_df[ekey1], u1, left_on=ekey1, right_on=ukey1, how='left')
    temp1 = torch.from_numpy(temp1['mappedID'].values)
    return torch.stack([temp0, temp1], dim=0)

edge_playlist_song = make_edge(playlist_song, unique_playlist_id, unique_song_id)
edge_song_artist = make_edge(song_artist, unique_song_id, unique_artist)
edge_playlist_dj = make_edge(playlist_dj, unique_playlist_id, unique_dj)

In [6]:
data = HeteroData()

data["playlist"].node_id = torch.arange(len(unique_playlist_id))
data["song"].node_id = torch.arange(len(unique_song_id))
data["artist"].node_id = torch.arange(len(unique_artist))
data["dj"].node_id = torch.arange(len(unique_dj))

data["playlist", "playlist2song", "song"].edge_index = edge_playlist_song
data["song", "song2artist", "artist"].edge_index = edge_song_artist
data["playlist", "playlist2dj", "dj"].edge_index = edge_playlist_dj

data = T.ToUndirected()(data)

In [7]:
from torch_geometric.nn import GATConv, to_hetero
import torch.nn.functional as F

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, heads=8):
        super().__init__()
        self.conv1 = GATConv(hidden_channels, hidden_channels, heads, dropout=0.6, add_self_loops=False)
        self.conv2 = GATConv(hidden_channels * heads, hidden_channels, heads=1, dropout=0.6, add_self_loops=False)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return x

    
class Classifier(torch.nn.Module):
    def forward(self, x_playlist: Tensor, x_song: Tensor, edge_label_index: Tensor) -> Tensor:
        edge_feat_playlist = x_playlist[edge_label_index[0]]
        edge_feat_song = x_song[edge_label_index[1]]
        return (edge_feat_playlist * edge_feat_song).sum(dim=-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.playlist_emb = torch.nn.Embedding(data["playlist"].num_nodes, hidden_channels)
        self.song_emb = torch.nn.Embedding(data["song"].num_nodes, hidden_channels)
        self.dj_emb = torch.nn.Embedding(data["dj"].num_nodes, hidden_channels)
        self.artist_emb = torch.nn.Embedding(data["artist"].num_nodes, hidden_channels)
        self.gnn = GNN(hidden_channels)
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())
        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        x_dict = {
          "playlist": self.playlist_emb(data["playlist"].node_id),
          "song": self.song_emb(data["song"].node_id),
          "artist": self.artist_emb(data["artist"].node_id),
          "dj": self.dj_emb(data["dj"].node_id),
        } 

        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["playlist"],
            x_dict["song"],
            data["playlist", "playlist2song", "song"].edge_label_index,
        )

        return pred

        
model = Model(hidden_channels=64)
batch_size = 512
epochs = 1000



In [8]:
from torch_geometric.loader import LinkNeighborLoader

transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=("playlist", "playlist2song", "song"),
    rev_edge_types=("song", "rev_playlist2song", "playlist"), 
)

train_data, val_data, test_data = transform(data)

In [9]:
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[10, 5],
    neg_sampling_ratio=2.0,
    edge_label_index=(("playlist", "playlist2song", "song"), train_data["playlist", "playlist2song", "song"].edge_label_index),
    edge_label=train_data["playlist", "playlist2song", "song"].edge_label,
    batch_size=batch_size,
    shuffle=True,
)

In [10]:
val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(("playlist", "playlist2song", "song"), val_data["playlist", "playlist2song", "song"].edge_label_index),
    edge_label=val_data["playlist", "playlist2song", "song"].edge_label,
    batch_size=3 * batch_size,
    shuffle=False,
)

test_loader = LinkNeighborLoader(
    data=test_data,
    num_neighbors=[20, 10],
    edge_label_index=(("playlist", "playlist2song", "song"), test_data["playlist", "playlist2song", "song"].edge_label_index),
    edge_label=test_data["playlist", "playlist2song", "song"].edge_label,
    batch_size=3 * batch_size,
    shuffle=False,
)

In [None]:
from tqdm import tqdm
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

best_f1 = 0
pbar = tqdm(range(epochs))
for epoch in pbar:
    total_loss = total_examples = 0
    model.train()
    for sampled_data in train_loader:
        optimizer.zero_grad()

        sampled_data.to(device)
        pred = model(sampled_data)

        ground_truth = sampled_data["playlist", "playlist2song", "song"].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)

        loss.backward()
        optimizer.step()
    # if epoch % 10 == 0:
    preds = []
    ground_truths = []
    model.eval()
    # Generate predictions and ground truths
    for sampled_data in val_loader:
        with torch.no_grad():
            sampled_data.to(device)
            preds.append(model(sampled_data))
            ground_truths.append(sampled_data["playlist", "playlist2song", "song"].edge_label)

    # Concatenate all predictions and ground truths
    pred = torch.cat(preds, dim=0).cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()

    predicted = np.where(pred > 0, 1, 0)        
    cm = confusion_matrix(ground_truth, predicted, labels=[0, 1])

    tn, fp, fn, tp = cm.ravel()
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    pbar.set_description(f"Acc: {accuracy:.4f}, Prec: {precision:.4f}, Rec: {recall:.4f}, F1: {f1_score:.4f}")
        
        # if best_f1 <= f1_score:
            # best_f1 = f1_score
            # best_state_dict = model.state_dict().copy()
    filename = f"{savefd}/epoch_{str(epoch).zfill(4)}__f1_{str(int(f1_score*1000)).zfill(4)}.pth"
    torch.save(model, filename)
    # print(filename, " saved")

Device: 'cuda'


Acc: 0.8018, Prec: 0.7692, Rec: 0.5792, F1: 0.6608:   2%|▏         | 18/1000 [22:34<20:30:49, 75.20s/it]