In [None]:
%conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
%conda install pyg -c pyg
%conda install matplotlib
# This pip install is needed due to a compatibility issue with the LinkNeighborLoader class used below
!pip install --verbose git+https://github.com/pyg-team/pyg-lib.git

In [2]:
import os
import json
import torch
from torch_geometric.data import Data
import torch_geometric.transforms as T

In [None]:
# Build the playlist-song edge list from up to MAX_SLICE_FILES slice files of
# the dataset directory.
data_dir = 'spotify_million_playlist_dataset/data'
MAX_SLICE_FILES = 10  # number of mpd.slice.*.json files to read

playlists_set = set()
songs_set = set()

# Raw (playlist pid, track URI) pairs; converted to node-index pairs below.
edges = []

file_count = 0
for filename in os.listdir(data_dir):
    if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
        continue
    file_count += 1
    with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
        data = json.load(f)
    for playlist in data['playlists']:
        playlist_id = playlist['pid']
        playlists_set.add(playlist_id)
        for track in playlist['tracks']:
            song_id = track['track_uri']
            songs_set.add(song_id)
            edges.append((playlist_id, song_id))
    if file_count == MAX_SLICE_FILES:
        break

# Node layout: playlists occupy [0, playlist_count), songs follow directly after.
playlist_count = len(playlists_set)

# Playlist pids are remapped instead of being used as node indices directly, so
# the indices stay inside [0, playlist_count) even when the slices that were read
# do not start at pid 0. When the pids already are 0..playlist_count-1 this
# mapping is the identity.
playlist_id_map = {pid: i for i, pid in enumerate(sorted(playlists_set))}

# Sorting makes the song -> node assignment identical across runs (iteration
# order of a set of strings is not stable between interpreter sessions).
song_id_map = {song_id: i + playlist_count for i, song_id in enumerate(sorted(songs_set))}

edges = [(playlist_id_map[pid], song_id_map[uri]) for pid, uri in edges]
num_nodes = playlist_count + len(songs_set)

In [4]:
# `edges` is a list of (playlist node, song node) pairs; transpose it into the
# 2 x num_edges layout expected for `edge_index`.
edge_pairs = torch.tensor(edges, dtype=torch.long)
edge_index = edge_pairs.t().contiguous()

# Wrap the edges in a Data object and pass it through the ToUndirected transform.
to_undirected = T.ToUndirected()
graph_data = to_undirected(Data(edge_index=edge_index, num_nodes=num_nodes))
print("Graph Data:", graph_data)

Graph Data: Data(edge_index=[2, 1323680], num_nodes=179657)


In [5]:
from torch_geometric import seed_everything
from torch_geometric.transforms import RandomLinkSplit

# Seed the random number generators before splitting so that the train/val/test
# edges (and the sampled negative edges) are the same on every run; otherwise
# metrics from different runs are not comparable.
SEED = 42
seed_everything(SEED)

# Hold out 15% of the links for validation and 15% for testing. Negative edges
# are not added to the training split here (add_negative_train_samples=False).
transform = RandomLinkSplit(
    num_val=0.15,
    num_test=0.15,
    is_undirected=True,
    add_negative_train_samples=False,
    neg_sampling_ratio=1,
)
train_data, val_data, test_data = transform(graph_data)
print(train_data)
print(val_data)
print(test_data)

Data(edge_index=[2, 926580], num_nodes=179657, edge_label=[463290], edge_label_index=[2, 463290])
Data(edge_index=[2, 926580], num_nodes=179657, edge_label=[198552], edge_label_index=[2, 198552])
Data(edge_index=[2, 1125132], num_nodes=179657, edge_label=[198552], edge_label_index=[2, 198552])


In [None]:
from torch_geometric.loader import LinkNeighborLoader

# Training loader: iterates over the training supervision edges in shuffled
# mini-batches of 128, with neg_sampling_ratio=1 and [20, 10] neighbours sampled
# over two hops.
edge_label_index, edge_label = train_data.edge_label_index, train_data.edge_label
train_loader = LinkNeighborLoader(
    data=train_data,
    edge_label_index=edge_label_index,
    edge_label=edge_label,
    num_neighbors=[20, 10],
    neg_sampling_ratio=1,
    batch_size=128,
    shuffle=True,
)

# Validation loader: same neighbourhood sizes, larger unshuffled batches and no
# loader-side negative sampling.
edge_label_index, edge_label = val_data.edge_label_index, val_data.edge_label
val_loader = LinkNeighborLoader(
    data=val_data,
    edge_label_index=edge_label_index,
    edge_label=edge_label,
    num_neighbors=[20, 10],
    batch_size=3 * 128,
    shuffle=False,
)

In [7]:
from torch_geometric.nn import LightGCN

# Run on the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# LightGCN with one 64-dimensional embedding per graph node and 3 layers.
model = LightGCN(num_nodes=graph_data.num_nodes, embedding_dim=64, num_layers=3)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

cuda


In [8]:
from tqdm import tqdm
def train():
    """Run one training epoch over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``train_loader`` and
    ``device``.

    Returns:
        float: loss averaged over the positive examples seen in the epoch, or
        ``nan`` when the loader yields no batches.
    """
    model.train()
    total_loss = 0.0
    total_examples = 0
    for sampled_data in tqdm(train_loader):
        sampled_data = sampled_data.to(device)

        # The loader relabels the nodes of every sampled subgraph to batch-local
        # indices, while LightGCN holds one embedding row per *global* node.
        # Map both index tensors back through `n_id` so the rows that are
        # read and updated belong to the nodes that were actually sampled.
        n_id = sampled_data.n_id
        edge_index = n_id[sampled_data.edge_index]
        edge_label_index = n_id[sampled_data.edge_label_index]

        optimizer.zero_grad()

        # The loader samples one negative per positive (neg_sampling_ratio=1);
        # splitting the scores in two halves assumes the positives come first
        # and the sampled negatives second.
        pos_pred, neg_pred = model(edge_index, edge_label_index).chunk(2)
        loss = model.recommendation_loss(
            pos_pred,
            neg_pred,
            node_id=edge_label_index.unique(),
        )
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * pos_pred.numel()
        total_examples += pos_pred.numel()

    if total_examples == 0:
        # Empty loader: nothing was trained, avoid a ZeroDivisionError.
        return float('nan')
    return total_loss / total_examples

In [9]:
from sklearn.metrics import roc_auc_score

def test():
    """Score the validation edges from ``val_loader`` and report the ROC AUC.

    Uses the notebook-level ``model``, ``val_loader`` and ``device``. Prints the
    AUC and returns it.

    Returns:
        float: ROC AUC of the predicted scores against ``edge_label``.
    """
    preds = []
    ground_truths = []

    # Evaluate in eval mode, then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for sampled_data in tqdm(val_loader):
                sampled_data = sampled_data.to(device)

                # Sampled subgraphs use batch-local node indices; LightGCN
                # stores one embedding row per global node, so translate back
                # through `n_id` before scoring.
                n_id = sampled_data.n_id
                edge_index = n_id[sampled_data.edge_index]
                edge_label_index = n_id[sampled_data.edge_label_index]

                preds.append(model(edge_index, edge_label_index))
                ground_truths.append(sampled_data.edge_label)
    finally:
        model.train(was_training)

    pred = torch.cat(preds, dim=0).cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    auc = roc_auc_score(ground_truth, pred)
    print(f"Validation AUC: {auc:.4f}")
    return auc

In [None]:
epochs = 10

# (epoch, value) pairs, kept for the plots further down.
train_losses = []
val_auc_scores = []

for epoch in range(epochs):
    train_loss = train()
    train_losses.append((epoch, train_loss))

    # Validate after every epoch up to epoch 10 and only every 5th epoch after
    # that, which keeps longer training runs cheaper.
    run_validation = epoch <= 10 or epoch % 5 == 0
    if not run_validation:
        print(f"Epoch {epoch}: train loss={train_loss}")
        continue

    val_auc_score = test()
    val_auc_scores.append((epoch, val_auc_score))
    print(f"Epoch {epoch}: train loss={train_loss}, val_auc={val_auc_score}")

In [None]:
# Test loader: unshuffled batches over the held-out test edges, with the same
# neighbourhood sizes and batch size as the validation loader.
edge_label_index, edge_label = test_data.edge_label_index, test_data.edge_label
test_loader = LinkNeighborLoader(
    data=test_data,
    edge_label_index=edge_label_index,
    edge_label=edge_label,
    num_neighbors=[20, 10],
    batch_size=3 * 128,
    shuffle=False,
)

In [None]:
# Score the held-out test edges and report the ROC AUC.
preds = []
ground_truths = []
with torch.no_grad():
    for sampled_data in tqdm(test_loader):
        sampled_data = sampled_data.to(device)

        # Sampled subgraphs use batch-local node indices; LightGCN stores one
        # embedding row per global node, so translate back through `n_id`
        # before scoring.
        n_id = sampled_data.n_id
        edge_index = n_id[sampled_data.edge_index]
        edge_label_index = n_id[sampled_data.edge_label_index]

        preds.append(model(edge_index, edge_label_index))
        ground_truths.append(sampled_data.edge_label)

pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
print(f"Testing AUC: {auc:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Training loss per epoch, taken from the (epoch, loss) pairs in `train_losses`.
loss_epochs = [epoch_idx for epoch_idx, _ in train_losses]
loss_values = [loss_value for _, loss_value in train_losses]

fig, ax = plt.subplots()
ax.plot(loss_epochs, loss_values)
ax.set_xlabel("Epoch")
ax.set_ylabel("Training loss")

plt.show()

In [None]:
# Validation AUC per evaluated epoch, from the (epoch, auc) pairs in `val_auc_scores`.
auc_epochs = [epoch_idx for epoch_idx, _ in val_auc_scores]
auc_values = [auc_value for _, auc_value in val_auc_scores]

fig, ax = plt.subplots()
ax.plot(auc_epochs, auc_values)
ax.set_xlabel("Epoch")
ax.set_ylabel("Validation AUC Scores")
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Final node embeddings computed over the full graph; no gradients are needed
# for visualisation.
with torch.no_grad():
    embs = model.get_embedding(graph_data.edge_index.to(device)).cpu().numpy()
print(embs.shape)

# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2).fit_transform(embs)

plt.figure(figsize=(10, 7))
plt.scatter(pca[:, 0], pca[:, 1], alpha=0.5)
plt.title('PCA of Embeddings')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
# No colorbar: the points are drawn in a single colour, so no data is mapped to
# a colour scale and a colorbar would not describe anything in the plot.
plt.show()

In [None]:
# Group playlist pids by genre keyword found in the playlist name.
data_dir = 'spotify_million_playlist_dataset/data'
country_playlists = []
rap_playlists = []
folk_playlists = []
metal_playlists = []
pop_playlists = []
rock_playlists = []
jazz_playlists = []

# Keyword -> list that collects the pids of playlists whose lower-cased name
# contains that keyword. This is a plain substring match, so e.g. 'rap' also
# matches names such as 'trap', and one playlist can land in several lists.
genre_playlists = {
    'country': country_playlists,
    'rap': rap_playlists,
    'folk': folk_playlists,
    'metal': metal_playlists,
    'pop': pop_playlists,
    'rock': rock_playlists,
    'jazz': jazz_playlists,
}

file_count = 0
for filename in os.listdir(data_dir):
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
        file_count += 1
        with open(os.path.join(data_dir, filename), 'r') as f:
            data = json.load(f)
        for playlist in data['playlists']:
            playlist_name = playlist['name'].lower()
            for keyword, matching_pids in genre_playlists.items():
                if keyword in playlist_name:
                    matching_pids.append(playlist['pid'])
    # Stop after 10 slice files, the same limit used when the graph was built.
    if file_count == 10:
        break

In [None]:
# 2-D PCA projection of all node embeddings, with the playlists of one genre
# highlighted on top.
pca = PCA(n_components=2).fit_transform(embs)

plt.figure(figsize=(10, 7))
plt.scatter(pca[:, 0], pca[:, 1], color='blue', alpha=0.5, label='All Playlists and Songs')

# Replace rap_playlists with another genre list to get its visualisation.
# NOTE: the pids in the genre lists are used directly as row indices into `pca`,
# which assumes a playlist's pid equals its node index in the graph.
plt.scatter(pca[rap_playlists, 0], pca[rap_playlists, 1], color='red', alpha=0.7, label='Rap Playlists')

plt.title('Rap playlists (red)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.show()


In [4]:
# Collect the graph node ids of all tracks by Eminem in the slice files.
eminem_node_set = set()

# Reset the counter: a value left over from an earlier cell would keep the
# 10-file limit below from ever triggering.
file_count = 0
for filename in os.listdir(data_dir):
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
        file_count += 1
        with open(os.path.join(data_dir, filename), 'r') as f:
            data = json.load(f)
        for playlist in data['playlists']:
            for track in playlist['tracks']:
                # The dataset stores the artist under 'artist_name'; fall back
                # to 'artist' in case a differently shaped record is loaded.
                artist = track.get('artist_name', track.get('artist'))
                if artist == 'Eminem':
                    node = song_id_map.get(track['track_uri'])
                    # Skip tracks that are not part of the graph.
                    if node is not None:
                        eminem_node_set.add(node)

    if file_count == 10:
        break

# Sorted list of unique song node ids.
eminem_nodes = sorted(eminem_node_set)

NameError: name 'PCA' is not defined