In [2]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch_geometric.nn import GCNConv
from torch.utils.data import DataLoader
from sklearn.decomposition import PCA
from tqdm import tqdm
from joblib import Parallel, delayed

# Debugging aid: ask the CUDA runtime to launch kernels synchronously so that a
# device-side failure is reported at the call that triggered it.
# NOTE(review): synchronous launches slow GPU execution - confirm this should
# stay enabled for full training runs.
os.environ.update({'CUDA_LAUNCH_BLOCKING': '1'})

def normalize_features(df):
    """Standardise every column of ``df`` with a freshly fitted ``StandardScaler``.

    Args:
        df: DataFrame of numeric feature columns.

    Returns:
        A new DataFrame holding the scaled values, carrying the same index and
        column labels as ``df``.
    """
    scaled_values = StandardScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

def load_and_merge_numeric_features(file_list, merge_on='id'):
    """Read several tab-separated files and inner-join them on one key column.

    Args:
        file_list: Sequence of paths to TSV files; the first file seeds the
            result and every following file is merged into it in order.
        merge_on: Name of the column shared by all files and used as join key.

    Returns:
        A DataFrame indexed by ``merge_on`` that contains only the keys present
        in every file.
    """
    merged = pd.read_csv(file_list[0], sep='\t')
    # Files are read lazily, one at a time, right before they are merged.
    remaining_frames = (pd.read_csv(path, sep='\t') for path in file_list[1:])
    for frame in remaining_frames:
        merged = merged.merge(frame, on=merge_on, how='inner')
    return merged.set_index(merge_on)

def create_item_item_edge_index(feature_matrix, top_k=10):
    """Build a symmetrised k-nearest-neighbour item graph from cosine similarity.

    For every row of ``feature_matrix`` the ``top_k`` most cosine-similar other
    rows are connected to it; the reversed edges are then appended so that each
    connection exists in both directions.

    Args:
        feature_matrix: 2-D array-like accepted by ``cosine_similarity``, one
            row per item.
        top_k: Number of neighbours per item. Capped at ``num_items - 1``.

    Returns:
        A ``torch.long`` tensor of shape ``(2, E)``. ``E`` is
        ``2 * num_items * k`` and is 0 when no neighbour can be selected.
    """
    similarity = cosine_similarity(feature_matrix)
    num_items = similarity.shape[0]
    k = max(0, min(top_k, num_items - 1))
    if k == 0:
        return torch.empty((2, 0), dtype=torch.long)

    # Exclude each item from its own neighbour list explicitly. Dropping the
    # first entry of the descending sort is not reliable: when another row ties
    # with the self-similarity (e.g. duplicate feature rows) the item itself may
    # not be ranked first, which would keep a self-loop and discard a neighbour.
    np.fill_diagonal(similarity, -np.inf)

    neighbours = np.empty((num_items, k), dtype=np.int64)
    for i in range(num_items):
        # -inf on the diagonal sorts last in descending order, and k is at most
        # num_items - 1, so the item itself is never selected.
        neighbours[i] = similarity[i].argsort()[::-1][:k]

    sources = np.repeat(np.arange(num_items, dtype=np.int64), k)
    edge_index = torch.from_numpy(np.stack([sources, neighbours.reshape(-1)]))
    # Append the reversed edges so the graph is symmetric.
    return torch.cat([edge_index, edge_index.flip(0)], dim=1)

class MKGCN(nn.Module):
    """Four-layer graph-convolutional autoencoder.

    ``conv1`` and ``conv2`` map ``input_dim -> hidden_dim -> latent_dim``;
    ``conv3`` and ``conv4`` map the latent code back through ``hidden_dim`` to
    ``input_dim``. LeakyReLU follows every convolution except the last one and
    dropout is applied after the first and third convolution.
    """

    def __init__(self, input_dim, hidden_dim=64, latent_dim=32, dropout=0.3):
        super().__init__()
        # Encoder layers.
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, latent_dim)
        # Decoder layers.
        self.conv3 = GCNConv(latent_dim, hidden_dim)
        self.conv4 = GCNConv(hidden_dim, input_dim)
        self.act = nn.LeakyReLU()
        self.dropout = nn.Dropout(dropout)

    def _encode(self, x, edge_index):
        """Return the activated latent representation of ``x``."""
        hidden = self.dropout(self.act(self.conv1(x, edge_index)))
        return self.act(self.conv2(hidden, edge_index))

    def forward(self, x, edge_index, reconstruct=False):
        """Encode ``x`` and optionally decode it again.

        Args:
            x: Node feature tensor passed to the first convolution.
            edge_index: Graph connectivity passed to every convolution.
            reconstruct: When true, return the decoder output (no activation
                on the final layer); otherwise return the latent code.
        """
        latent = self._encode(x, edge_index)
        if not reconstruct:
            return latent
        hidden = self.dropout(self.act(self.conv3(latent, edge_index)))
        return self.conv4(hidden, edge_index)

class FineTuneClassifier(nn.Module):
    """Classification head trained on top of a frozen, pre-trained encoder.

    Constructing this module sets ``requires_grad = False`` on every parameter
    of the ``mkgcn`` instance that is passed in (the instance itself is
    modified, not a copy).

    Args:
        mkgcn: Encoder module called as ``mkgcn(x, edge_index, reconstruct=False)``.
        latent_dim: Width of the encoder output fed to the head.
        num_classes: Number of output logits.
    """

    def __init__(self, mkgcn, latent_dim, num_classes):
        super(FineTuneClassifier, self).__init__()
        self.mkgcn = mkgcn
        for param in self.mkgcn.parameters():
            param.requires_grad = False
        self.classifier = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.LeakyReLU(),
            nn.Linear(64, num_classes)
        )

    def train(self, mode=True):
        """Switch the head between train/eval while keeping the encoder in eval.

        The encoder is frozen and only used as a feature extractor. Leaving it
        in training mode would keep its dropout active, so the head would be
        fitted on randomly perturbed features that differ from the ones the
        encoder produces at inference time.
        """
        super(FineTuneClassifier, self).train(mode)
        self.mkgcn.eval()
        return self

    def forward(self, x, edge_index):
        """Return class logits for every node; no gradient reaches the encoder."""
        with torch.no_grad():
            z = self.mkgcn(x, edge_index, reconstruct=False)
        logits = self.classifier(z)
        return logits

def train_mkgcn_full_batch(model, features, edge_index, num_epochs=50, lr=1e-3, patience=5, device='cuda'):
    """Train ``model`` as an autoencoder on the whole graph in one batch.

    Each epoch runs a single forward/backward pass with
    ``model(features, edge_index, reconstruct=True)`` and an MSE loss against
    ``features``, optimised with Adam. The learning rate is driven by
    ``ReduceLROnPlateau`` (patience 3) on the training loss. Training stops
    early once the loss has not improved for ``patience`` consecutive epochs.
    The parameters recorded at the best epoch are loaded back into ``model``
    before returning.

    Args:
        model: Module to train; it is moved to ``device`` in place.
        features: Input tensor, also used as the reconstruction target.
        edge_index: Graph connectivity tensor.
        num_epochs: Maximum number of epochs.
        lr: Initial Adam learning rate.
        patience: Number of non-improving epochs tolerated before stopping.
        device: Device to train on.

    Returns:
        None. ``model`` is updated in place.
    """
    model.to(device)
    features = features.to(device)
    edge_index = edge_index.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)

    best_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        x_recon = model(features, edge_index, reconstruct=True)
        loss = criterion(x_recon, features)
        loss.backward()
        optimizer.step()

        avg_loss = loss.item()
        print(f"[Epoch {epoch+1}/{num_epochs}] Loss: {avg_loss:.4f}")

        scheduler.step(avg_loss)

        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
            # state_dict() hands back references to the live parameter tensors,
            # which later optimizer steps overwrite in place. Clone them so the
            # snapshot really is the state of the best epoch.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)

def train_classifier(model, features, edge_index, labels, num_epochs=20, lr=1e-3, patience=5, device='cuda'):
    """Fit the classification head of ``model`` on the whole graph in one batch.

    Only ``model.classifier`` parameters are handed to the Adam optimiser. Each
    epoch runs one forward/backward pass with a cross-entropy loss. Training
    stops early once the loss has not improved for ``patience`` consecutive
    epochs, and the parameters recorded at the best epoch are loaded back into
    ``model`` before returning.

    Args:
        model: Module exposing a ``classifier`` sub-module and callable as
            ``model(features, edge_index)``; it is moved to ``device`` in place.
        features: Input feature tensor.
        edge_index: Graph connectivity tensor.
        labels: Integer class labels, one per row of ``features``; flattened
            to 1-D before the loss is computed.
        num_epochs: Maximum number of epochs.
        lr: Adam learning rate.
        patience: Number of non-improving epochs tolerated before stopping.
        device: Device to train on.

    Returns:
        None. ``model`` is updated in place.
    """
    model.to(device)
    features = features.to(device)
    edge_index = edge_index.to(device)
    # reshape(-1) instead of squeeze(): squeeze() would collapse a single-sample
    # batch into a 0-d target that no longer matches the (1, C) logits.
    targets = labels.to(device).reshape(-1).long()
    optimizer = optim.Adam(model.classifier.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    best_loss = float("inf")
    patience_counter = 0
    best_model_state = None

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        logits = model(features, edge_index)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        avg_loss = loss.item()
        print(f"[Classifier Epoch {epoch+1}/{num_epochs}] Loss: {avg_loss:.4f}")

        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
            # state_dict() returns references to the live tensors; clone them so
            # later optimizer steps cannot overwrite the recorded best state.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)

def compute_all_embeddings(features, model, edge_index, device='cuda'):
    """Run ``model`` in eval mode and return its latent output as a NumPy array.

    Args:
        features: Input feature tensor; moved to ``device`` for the call.
        model: Module called as ``model(features, edge_index, reconstruct=False)``.
            It is switched to eval mode but not moved to ``device`` here.
        edge_index: Graph connectivity tensor; moved to ``device`` for the call.
        device: Device on which the inputs are placed.

    Returns:
        The model output copied to the CPU and converted with ``.numpy()``.
    """
    model.eval()
    inputs, graph = features.to(device), edge_index.to(device)
    with torch.no_grad():
        latent = model(inputs, graph, reconstruct=False)
    return latent.cpu().numpy()

def compute_similarities(i, embeddings, topK):
    """Return the ``topK`` items most cosine-similar to item ``i``.

    The similarity is a true cosine similarity: the dot product is divided by
    the norms of both vectors, so the values are correct whether or not
    ``embeddings`` has been normalised beforehand. Rows with zero norm get
    similarity 0 instead of NaN. Item ``i`` itself is never returned, so at
    most ``len(embeddings) - 1`` results are produced.

    Args:
        i: Row index of the query item.
        embeddings: 2-D array with one embedding per row.
        topK: Maximum number of neighbours to return.

    Returns:
        Tuple ``(i, topk_idx, topk_vals)`` with the neighbour indices ordered
        by decreasing similarity and their similarity values.
    """
    embeddings = np.asarray(embeddings)
    num_items = embeddings.shape[0]

    dots = embeddings @ embeddings[i]
    row_norms = np.linalg.norm(embeddings, axis=1)
    # Product of both norms; previously only the candidate norm was divided
    # out, which is not a cosine similarity for un-normalised input.
    denom = row_norms * row_norms[i]

    sims = np.zeros(num_items, dtype=np.result_type(dots.dtype, np.float32))
    # Entries with a zero denominator keep similarity 0 rather than NaN/inf.
    np.divide(dots, denom, out=sims, where=denom > 0)

    # -inf sorts last, and the cap below keeps the query out of the result
    # even when topK is at least the number of items.
    sims[i] = -np.inf
    k = max(0, min(int(topK), num_items - 1))
    topk_idx = np.argsort(-sims)[:k]
    topk_vals = sims[topk_idx]
    return i, topk_idx, topk_vals


def build_recommendation_matrix(embeddings, topK=10, n_jobs=-1):
    """Build a dense ``N x N`` matrix holding each item's top-K similarities.

    Row ``i`` contains the similarity value at the columns of the ``topK``
    nearest neighbours of item ``i`` (as returned by ``compute_similarities``)
    and 0 everywhere else.

    Args:
        embeddings: 2-D array with one embedding per row.
        topK: Number of neighbours kept per item.
        n_jobs: Number of joblib workers (threading backend).

    Returns:
        ``float32`` array of shape ``(N, N)``.
    """
    N = len(embeddings)
    rec_matrix = np.zeros((N, N), dtype=np.float32)
    if N == 0:
        return rec_matrix

    embeddings = np.asarray(embeddings)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Zero-norm rows are left as zero vectors instead of being divided by 0.
    norms[norms == 0] = 1
    norm_embeddings = embeddings / norms

    results = Parallel(n_jobs=n_jobs, backend="threading")(
        delayed(compute_similarities)(i, norm_embeddings, topK) for i in range(N)
    )

    for i, topk_idx, topk_vals in results:
        topk_idx = np.asarray(topk_idx)
        topk_vals = np.asarray(topk_vals)
        # Never write a self-match or a non-finite score into the matrix: the
        # self entry only carries a masking sentinel, not a similarity.
        keep = (topk_idx != i) & np.isfinite(topk_vals)
        rec_matrix[i, topk_idx[keep]] = topk_vals[keep]

    return rec_matrix

def reduce_dimensions(features, n_components=100):
    """Project ``features`` onto ``n_components`` principal components.

    A new ``PCA`` is fitted on ``features`` and immediately applied to the same
    data; the fitted estimator is not returned.
    """
    return PCA(n_components=n_components).fit_transform(features)

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the global NumPy and PyTorch generators so that re-running this cell
# starts weight initialisation and dropout from the same random state.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

numeric_files = [
    "./dataset/id_blf_correlation_mmsr.tsv",
    "./dataset/id_blf_deltaspectral_mmsr.tsv",
    "./dataset/id_blf_logfluc_mmsr.tsv",
    "./dataset/id_blf_spectral_mmsr.tsv",
    "./dataset/id_blf_spectralcontrast_mmsr.tsv",
    "./dataset/id_blf_vardeltaspectral_mmsr.tsv",
    "./dataset/id_incp_mmsr.tsv",
    "./dataset/id_ivec256_mmsr.tsv",
    "./dataset/id_ivec512_mmsr.tsv",
    "./dataset/id_ivec1024_mmsr.tsv",
    "./dataset/id_lyrics_tf-idf_mmsr.tsv",
    "./dataset/id_lyrics_word2vec_mmsr.tsv",
    "./dataset/id_lyrics_bert_mmsr.tsv",
    "./dataset/id_mfcc_bow_mmsr.tsv",
    "./dataset/id_mfcc_stats_mmsr.tsv",
    "./dataset/id_musicnn_mmsr.tsv",
    "./dataset/id_resnet_mmsr.tsv",
    "./dataset/id_vgg19_mmsr.tsv",
]

fused_df = load_and_merge_numeric_features(numeric_files, merge_on='id')
fused_df_normalized = normalize_features(fused_df)

labels_path = "./dataset/id_genres_mmsr.tsv"
if os.path.exists(labels_path):
    labels_df = pd.read_csv(labels_path, sep='\t').set_index('id')
    # Put the labels in exactly the row order of the feature table (and drop
    # ids that did not survive the inner merges). Without this, label k is not
    # guaranteed to belong to feature row k.
    labels_df = labels_df.reindex(fused_df_normalized.index)
    if labels_df['genre'].isna().any():
        raise ValueError(
            f"{int(labels_df['genre'].isna().sum())} feature ids have no genre in {labels_path}"
        )
    unique_genres = labels_df['genre'].unique()
    genre_to_idx = {g: i for i, g in enumerate(unique_genres)}
    labels_df['genre_id'] = labels_df['genre'].map(genre_to_idx)
    final_labels_df = labels_df[['genre_id']]
else:
    final_labels_df = None

if final_labels_df is not None:
    # NOTE(review): the split below is not consumed anywhere later in this
    # cell; both models are trained on the full feature set.
    feature_train, feature_test, label_train, label_test = train_test_split(
        fused_df_normalized, final_labels_df, test_size=0.2, random_state=SEED
    )
    labels = torch.tensor(final_labels_df['genre_id'].values, dtype=torch.long)
else:
    # No label file: skip everything that needs labels instead of failing on None.
    feature_train = feature_test = label_train = label_test = None
    labels = None

features = torch.tensor(fused_df_normalized.values, dtype=torch.float32)

# The item graph is built from the full normalised features, before PCA.
edge_index = create_item_item_edge_index(features.numpy(), top_k=10)

input_dim = features.shape[1]
latent_dim = 128

n_components = 100
reduced_feature_matrix = reduce_dimensions(fused_df_normalized.values, n_components=n_components)

# From here on `features` holds the PCA-reduced matrix that the model consumes.
features = torch.tensor(reduced_feature_matrix, dtype=torch.float32)

mkgcn_model = MKGCN(input_dim=n_components, hidden_dim=64, latent_dim=latent_dim, dropout=0.3)
train_mkgcn_full_batch(
    model=mkgcn_model,
    features=features,
    edge_index=edge_index,
    num_epochs=10000,
    lr=1e-3,
    patience=1000,
    device=device
)

if final_labels_df is not None:
    num_classes = len(np.unique(final_labels_df['genre_id']))
    classifier_model = FineTuneClassifier(mkgcn_model, latent_dim, num_classes)
    train_classifier(
        model=classifier_model,
        features=features,
        edge_index=edge_index,
        labels=labels,
        num_epochs=10000,
        lr=1e-3,
        patience=1000,
        device=device
    )

embeddings = compute_all_embeddings(features, mkgcn_model, edge_index, device=device)
recommendation_matrix = build_recommendation_matrix(embeddings, topK=100)
# Make sure the output directory exists so hours of training are not lost to a
# missing folder at the very last step.
os.makedirs("./predictions", exist_ok=True)
np.savetxt("./predictions/rets_MKGCN_100_matrix.csv", recommendation_matrix, delimiter="\t")

[Epoch 1/10000] Loss: 169.3459
[Epoch 2/10000] Loss: 166.8094
[Epoch 3/10000] Loss: 164.6617
[Epoch 4/10000] Loss: 162.8308
[Epoch 5/10000] Loss: 161.4374
[Epoch 6/10000] Loss: 160.2400
[Epoch 7/10000] Loss: 159.0657
[Epoch 8/10000] Loss: 158.0350
[Epoch 9/10000] Loss: 156.9487
[Epoch 10/10000] Loss: 156.1604
[Epoch 11/10000] Loss: 155.0868
[Epoch 12/10000] Loss: 154.1494
[Epoch 13/10000] Loss: 153.1001
[Epoch 14/10000] Loss: 151.9899
[Epoch 15/10000] Loss: 151.0607
[Epoch 16/10000] Loss: 149.8577
[Epoch 17/10000] Loss: 148.6460
[Epoch 18/10000] Loss: 147.6561
[Epoch 19/10000] Loss: 146.1983
[Epoch 20/10000] Loss: 144.8682
[Epoch 21/10000] Loss: 143.4345
[Epoch 22/10000] Loss: 141.8285
[Epoch 23/10000] Loss: 140.2762
[Epoch 24/10000] Loss: 138.5407
[Epoch 25/10000] Loss: 136.6371
[Epoch 26/10000] Loss: 134.7560
[Epoch 27/10000] Loss: 132.8145
[Epoch 28/10000] Loss: 130.9146
[Epoch 29/10000] Loss: 129.0535
[Epoch 30/10000] Loss: 127.1178
[Epoch 31/10000] Loss: 125.0402
[Epoch 32/10000] 

In [3]:
# NOTE(review): this redefines the compute_all_embeddings function from the
# previous cell with the same behaviour; the later definition shadows the
# earlier one once this cell has run.
def compute_all_embeddings(features, model, edge_index, device='cuda'):
    """Return the latent output of ``model`` for ``features`` as a NumPy array.

    ``model`` is put in eval mode and called without gradient tracking as
    ``model(features, edge_index, reconstruct=False)`` after both tensors have
    been moved to ``device``. The result is copied to the CPU before conversion.
    """
    model.eval()
    on_device_features = features.to(device)
    on_device_edges = edge_index.to(device)
    with torch.no_grad():
        return model(on_device_features, on_device_edges, reconstruct=False).cpu().numpy()

def compute_similarities_with_metadata(i, embeddings, ids, topK):
    """Return the ``topK`` nearest neighbours of item ``i`` as id-labelled records.

    The similarity is a true cosine similarity (dot product divided by both
    vector norms), so the values are correct whether or not ``embeddings`` has
    been normalised beforehand. Rows with zero norm get similarity 0 instead
    of NaN. Item ``i`` itself is never returned, so at most
    ``len(embeddings) - 1`` records are produced.

    Args:
        i: Row index of the query item.
        embeddings: 2-D array with one embedding per row.
        ids: Sequence mapping a row index to the item identifier.
        topK: Maximum number of neighbours to return.

    Returns:
        List of dicts with keys ``source_id``, ``target_id`` and
        ``similarity``, ordered by decreasing similarity.
    """
    embeddings = np.asarray(embeddings)
    num_items = embeddings.shape[0]

    dots = embeddings @ embeddings[i]
    row_norms = np.linalg.norm(embeddings, axis=1)
    # Product of both norms; dividing by the candidate norm alone is not a
    # cosine similarity for un-normalised input.
    denom = row_norms * row_norms[i]

    sims = np.zeros(num_items, dtype=np.result_type(dots.dtype, np.float32))
    # Entries with a zero denominator keep similarity 0 rather than NaN/inf.
    np.divide(dots, denom, out=sims, where=denom > 0)

    # -inf sorts last, and the cap below keeps the query out of the result
    # even when topK is at least the number of items.
    sims[i] = -np.inf
    k = max(0, min(int(topK), num_items - 1))
    topk_idx = np.argsort(-sims)[:k]
    topk_vals = sims[topk_idx]

    recommendations = [
        {"source_id": ids[i], "target_id": ids[j], "similarity": sim}
        for j, sim in zip(topk_idx, topk_vals)
    ]
    return recommendations

def build_recommendations(embeddings, ids, topK=10, n_jobs=-1):
    """Collect the top-K neighbours of every item into one long-format table.

    Args:
        embeddings: 2-D array with one embedding per row.
        ids: Sequence mapping a row index to the item identifier.
        topK: Number of neighbours requested per item.
        n_jobs: Number of joblib workers (threading backend).

    Returns:
        DataFrame with the columns ``source_id``, ``target_id`` and
        ``similarity``; the columns are present even when there are no rows.
    """
    columns = ["source_id", "target_id", "similarity"]
    embeddings = np.asarray(embeddings)
    if len(embeddings) == 0:
        return pd.DataFrame(columns=columns)

    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Zero-norm rows are left as zero vectors instead of being divided by 0.
    norms[norms == 0] = 1
    norm_embeddings = embeddings / norms

    results = Parallel(n_jobs=n_jobs, backend="threading")(
        delayed(compute_similarities_with_metadata)(i, norm_embeddings, ids, topK) for i in range(len(embeddings))
    )

    # Flatten the per-item lists; records with a non-finite score carry no
    # usable similarity and are left out of the export.
    recommendations = [
        rec
        for sublist in results
        for rec in sublist
        if np.isfinite(rec["similarity"])
    ]
    return pd.DataFrame(recommendations, columns=columns)

# Recompute the latent embeddings from the trained encoder for this cell.
embeddings = compute_all_embeddings(
    features=features,
    model=mkgcn_model,
    edge_index=edge_index,
    device=device,
)

# Row order of the normalised feature table defines the id of each embedding row.
ids = fused_df_normalized.index.tolist()
recommendations = build_recommendations(embeddings=embeddings, ids=ids, topK=100)
# NOTE(review): the file name says "10" while topK is 100 - confirm which one
# downstream consumers expect before renaming.
recommendations.to_csv("recs_mkgcn_10.csv", index=False)