In [None]:
import copy
import os
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")


def cosine_similarity(vec_a, vec_b, eps=1e-9):
    """Return the cosine of the angle between ``vec_a`` and ``vec_b``.

    ``eps`` is added to the product of the two norms, so an all-zero vector
    yields a similarity of 0 instead of a division by zero.
    """
    magnitude = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / (magnitude + eps)


def load_and_merge_numeric_features(file_list, merge_on='id'):
    """Read each tab-separated file in ``file_list`` and inner-join them.

    The first file seeds the result; every following file is merged onto it
    on the ``merge_on`` column, so only keys present in all files survive.
    The returned DataFrame is indexed by ``merge_on``.
    """
    merged = pd.read_csv(file_list[0], sep='\t')
    for tsv_path in file_list[1:]:
        merged = merged.merge(
            pd.read_csv(tsv_path, sep='\t'), on=merge_on, how='inner'
        )
    return merged.set_index(merge_on)

def normalize_features(df):
    """Standardise the columns of ``df`` with a freshly fitted StandardScaler.

    Returns a new DataFrame that keeps the index and column labels of ``df``.
    """
    scaled_values = StandardScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

class Music4AllOnionDataset(Dataset):
    """Torch dataset over a feature DataFrame with optional per-row labels.

    Rows are sorted by index. When ``label_df`` is given, only index values
    present in both frames are kept, and items are ``(features, labels)``
    pairs; otherwise items are the feature tensor alone.

    Args:
        feature_df: DataFrame of numeric features, one row per sample.
        label_df: optional DataFrame of integer labels sharing the feature index.
        transform: optional callable applied once to the whole float32
            feature array at construction time.
    """

    def __init__(self, feature_df, label_df=None, transform=None):
        self.transform = transform
        feature_df = feature_df.sort_index()
        self.feature_df = feature_df

        if label_df is not None:
            label_df = label_df.sort_index()
            common_idx = feature_df.index.intersection(label_df.index)
            self.feature_df = feature_df.loc[common_idx]
            self.label_df = label_df.loc[common_idx]
            # Pull the labels out of pandas once; a per-sample
            # ``DataFrame.iloc[idx].values`` builds a Series on every access
            # and dominates the cost of iterating the dataset.
            self.label_data = self.label_df.values
            self.has_labels = True
        else:
            self.label_df = None
            self.label_data = None
            self.has_labels = False

        self.feature_data = self.feature_df.values.astype(np.float32)
        if self.transform:
            self.feature_data = self.transform(self.feature_data)

    def __len__(self):
        """Number of samples (rows of the stored feature array)."""
        return len(self.feature_data)

    def __getitem__(self, idx):
        """Return ``x`` (float32) or ``(x, y)`` with ``y`` as a long tensor of the row's labels."""
        x = torch.tensor(self.feature_data[idx], dtype=torch.float32)
        if not self.has_labels:
            return x
        y_val = torch.tensor(self.label_data[idx], dtype=torch.long)
        return x, y_val

class AutoEncoder(nn.Module):
    """Fully connected autoencoder: input -> 512 -> 256 -> latent and back.

    Every hidden Linear layer is followed by BatchNorm1d, LeakyReLU and
    Dropout(0.3); the final layer of the encoder and of the decoder is a
    plain Linear layer.
    """

    def __init__(self, input_dim, latent_dim=128):
        super().__init__()
        # The encoder is created before the decoder so that parameter
        # initialisation consumes the RNG in the same order as before.
        self.encoder = self._build_stack([input_dim, 512, 256, latent_dim])
        self.decoder = self._build_stack([latent_dim, 256, 512, input_dim])

    @staticmethod
    def _build_stack(widths):
        """Chain Linear layers over ``widths``; all but the last get BN/LeakyReLU/Dropout."""
        layers = []
        final_pos = len(widths) - 2
        for pos, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:])):
            layers.append(nn.Linear(n_in, n_out))
            if pos != final_pos:
                layers.extend([nn.BatchNorm1d(n_out), nn.LeakyReLU(), nn.Dropout(0.3)])
        return nn.Sequential(*layers)

    def forward(self, x, reconstruct=False):
        """Return the latent code, or the decoded reconstruction when ``reconstruct`` is true."""
        latent = self.encoder(x)
        return self.decoder(latent) if reconstruct else latent

class FineTuneClassifier(nn.Module):
    """Small classification head on top of a frozen autoencoder encoder.

    Args:
        autoenc: module exposing an ``encoder`` sub-module; its encoder
            parameters are frozen (``requires_grad = False``).
        latent_dim: width of the encoder output fed to the head.
        num_classes: number of output logits.
    """

    def __init__(self, autoenc, latent_dim, num_classes):
        super(FineTuneClassifier, self).__init__()
        self.autoenc = autoenc
        for param in self.autoenc.encoder.parameters():
            param.requires_grad = False
        # Freezing the weights is not enough: layers such as BatchNorm and
        # Dropout change behaviour (and BatchNorm updates its running
        # statistics) whenever the module is in training mode.
        self.autoenc.encoder.eval()

        self.classifier = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.LeakyReLU(),
            nn.Linear(64, num_classes)
        )

    def train(self, mode=True):
        """Switch the head's mode while keeping the frozen encoder in eval mode."""
        super().train(mode)
        self.autoenc.encoder.eval()
        return self

    def forward(self, x):
        """Encode ``x`` without tracking gradients and return the class logits."""
        with torch.no_grad():
            z = self.autoenc.encoder(x)
        logits = self.classifier(z)
        return logits

def train_autoenc(model, dataloader, num_epochs=50, lr=1e-3, patience=5, device='cuda'):
    """Train ``model`` to reconstruct its input with an MSE objective.

    Uses Adam with a ReduceLROnPlateau scheduler and early stopping, both
    driven by the mean training loss of each epoch. When training ends, the
    weights captured at the end of the epoch with the lowest loss are loaded
    back into ``model``.

    Args:
        model: module called as ``model(x, reconstruct=True)``.
        dataloader: sized iterable of batches; each batch is either a tensor
            or a ``(features, ...)`` list/tuple whose first item is used.
        num_epochs: maximum number of epochs.
        lr: Adam learning rate.
        patience: epochs without improvement tolerated before stopping.
        device: device the model and batches are moved to.
    """
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)

    best_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for batch in dataloader:
            # Labels are not needed for reconstruction, so accept both
            # labelled (x, y) batches and bare feature tensors.
            x = batch[0] if isinstance(batch, (list, tuple)) else batch
            x = x.to(device)
            optimizer.zero_grad()
            x_recon = model(x, reconstruct=True)
            loss = criterion(x_recon, x)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_loss = running_loss / len(dataloader)
        print(f"[Epoch {epoch+1}/{num_epochs}] Loss: {avg_loss:.4f}")

        scheduler.step(avg_loss)

        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
            # state_dict() returns references to the live tensors; without a
            # deep copy the snapshot would silently follow later updates.
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)


def train_classifier(
    model, dataloader, num_epochs=20, lr=1e-3, patience=5, device='cuda'
):
    """Train ``model.classifier`` with cross-entropy on ``(x, y)`` batches.

    Only the parameters of ``model.classifier`` are handed to Adam. Early
    stopping is driven by the mean training loss per epoch, and the weights
    captured at the end of the best epoch are loaded back when training ends.

    Args:
        model: module with a ``classifier`` sub-module; ``model(x)`` returns logits.
        dataloader: sized iterable of ``(features, labels)`` batches; labels
            are flattened to one class index per sample.
        num_epochs: maximum number of epochs.
        lr: Adam learning rate.
        patience: epochs without improvement tolerated before stopping.
        device: device the model and batches are moved to.
    """
    model.to(device)
    optimizer = optim.Adam(model.classifier.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    best_loss = float("inf")
    patience_counter = 0
    best_model_state = None

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            # reshape(-1) keeps the batch dimension even for a batch of one,
            # where squeeze() would collapse the target to a 0-d tensor.
            loss = criterion(logits, y.reshape(-1).long())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        avg_loss = running_loss / len(dataloader)
        print(f"[Classifier Epoch {epoch+1}/{num_epochs}] Loss: {avg_loss:.4f}")

        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
            # state_dict() returns references to the live tensors; without a
            # deep copy the snapshot would silently follow later updates.
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)

def compute_all_embeddings(dataset, model, device='cuda', batch_size=256):
    """Run ``model`` in eval mode over every item of ``dataset``.

    Args:
        dataset: indexable, sized collection whose items are either a feature
            tensor or a tuple whose first element is the feature tensor.
        model: module called as ``model(batch)``.
        device: device the model and the batches are moved to.
        batch_size: number of samples per forward pass.

    Returns:
        A 2-D numpy array with one flattened model output per dataset item,
        in dataset order.
    """
    model.to(device)
    model.eval()
    total = len(dataset)
    chunks = []

    with torch.no_grad():
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            items = [dataset[i] for i in range(start, stop)]
            features = [item[0] if isinstance(item, tuple) else item for item in items]
            batch = torch.stack(features).to(device)
            z = model(batch)
            # Flatten everything but the batch axis: one row per sample.
            chunks.append(z.reshape(z.shape[0], -1).cpu().numpy())

    return np.concatenate(chunks, axis=0)

def build_recommendation_matrix(embeddings, topK=10):
    """Build an N x N matrix holding each row's top-K cosine similarities.

    Row ``i`` contains the cosine similarity to the ``topK`` most similar
    other rows of ``embeddings`` and zeros elsewhere. A row is never
    recommended to itself, so at most ``N - 1`` entries per row are filled
    and the diagonal stays zero.

    Args:
        embeddings: 2-D array-like, one embedding per row.
        topK: number of neighbours to keep per row.

    Returns:
        float32 numpy array of shape ``(N, N)``.
    """
    embeddings = np.asarray(embeddings, dtype=np.float64)
    N = len(embeddings)
    rec_matrix = np.zeros((N, N), dtype=np.float32)

    # Clamp so the self-similarity sentinel can never be selected.
    k = max(0, min(topK, N - 1))
    if k == 0:
        return rec_matrix

    norms = np.linalg.norm(embeddings, axis=1)  # computed once, reused for every row

    for i in tqdm(range(N), desc="Building Recs"):
        # One matrix-vector product replaces N Python-level similarity calls;
        # 1e-9 is the same epsilon cosine_similarity() uses by default.
        sims = (embeddings @ embeddings[i]) / (norms * norms[i] + 1e-9)
        sims[i] = -np.inf
        topk_idx = sims.argsort()[::-1][:k]
        rec_matrix[i, topk_idx] = sims[topk_idx]

    return rec_matrix

# ---------------------------------------------------------------------------
# Pipeline: load features -> normalise -> train autoencoder (and, when labels
# are available, a genre classifier on the frozen encoder) -> embed every
# track -> save the top-10 cosine-similarity matrix.
# ---------------------------------------------------------------------------

# Seed the libraries that drive weight initialisation, dropout and shuffling
# so a fresh "Restart & Run All" reproduces the same model.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"

numeric_files = [
    "./dataset/id_blf_correlation_mmsr.tsv",
    "./dataset/id_blf_deltaspectral_mmsr.tsv",
    "./dataset/id_blf_logfluc_mmsr.tsv",
    "./dataset/id_blf_spectral_mmsr.tsv",
    "./dataset/id_blf_spectralcontrast_mmsr.tsv",
    "./dataset/id_blf_vardeltaspectral_mmsr.tsv",
    "./dataset/id_incp_mmsr.tsv",
    "./dataset/id_ivec256_mmsr.tsv",
    "./dataset/id_ivec512_mmsr.tsv",
    "./dataset/id_ivec1024_mmsr.tsv",
    "./dataset/id_lyrics_tf-idf_mmsr.tsv",
    "./dataset/id_lyrics_word2vec_mmsr.tsv",
    "./dataset/id_lyrics_bert_mmsr.tsv",
    "./dataset/id_mfcc_bow_mmsr.tsv",
    "./dataset/id_mfcc_stats_mmsr.tsv",
    "./dataset/id_musicnn_mmsr.tsv",
    "./dataset/id_resnet_mmsr.tsv",
    "./dataset/id_vgg19_mmsr.tsv",
    "./dataset/id_total_listens.tsv"
]

fused_df = load_and_merge_numeric_features(numeric_files, merge_on='id')
fused_df_normalized = normalize_features(fused_df)

# Every feature file is read from ./dataset/, so look for the genre file there
# first (presumed location — adjust if it lives elsewhere) and fall back to
# the bare filename that was used previously.
label_candidates = ["./dataset/id_genres_mmsr.tsv", "id_genres_mmsr.tsv"]
labels_path = next((p for p in label_candidates if os.path.exists(p)), None)

if labels_path is not None:
    labels_df = pd.read_csv(labels_path, sep='\t').set_index('id')
    unique_genres = labels_df['genre'].unique()
    genre_to_idx = {g: i for i, g in enumerate(unique_genres)}
    labels_df['genre_id'] = labels_df['genre'].map(genre_to_idx)
    final_labels_df = labels_df[['genre_id']]
else:
    final_labels_df = None

if final_labels_df is not None:
    # train_test_split pairs rows by position, so features and labels must be
    # put in the same id order before splitting.
    labelled_ids = fused_df_normalized.index.intersection(final_labels_df.index)
    split_features = fused_df_normalized.loc[labelled_ids]
    split_labels = final_labels_df.loc[labelled_ids]
else:
    # Passing None to train_test_split raises
    # "TypeError: 'NoneType' object is not subscriptable". The autoencoder
    # loop unpacks each batch as (x, _), so supply an all-zero placeholder
    # column that is never read as a target.
    split_features = fused_df_normalized
    split_labels = pd.DataFrame({"placeholder": 0}, index=split_features.index)

feature_train, feature_test, label_train, label_test = train_test_split(
    split_features, split_labels, test_size=0.2, random_state=SEED
)

train_dataset = Music4AllOnionDataset(feature_train, label_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = Music4AllOnionDataset(feature_test, label_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

input_dim = fused_df_normalized.shape[1]
latent_dim = 128

autoenc_model = AutoEncoder(input_dim, latent_dim=latent_dim)
train_autoenc(autoenc_model, train_loader, num_epochs=250, lr=1e-3, patience=100, device=device)

if final_labels_df is not None:
    num_classes = len(np.unique(final_labels_df['genre_id']))
    classifier_model = FineTuneClassifier(autoenc_model, latent_dim, num_classes)
    train_classifier(classifier_model, train_loader, num_epochs=250, patience=100, lr=1e-3, device=device)

# The dataset sorts its rows by index, so row/column i of rec_matrix refers to
# the i-th id of fused_df_normalized in sorted order.
full_dataset = Music4AllOnionDataset(fused_df_normalized)
embeddings = compute_all_embeddings(full_dataset, autoenc_model, device=device)
rec_matrix = build_recommendation_matrix(embeddings, topK=10)

# np.savetxt does not create missing parent directories.
os.makedirs("./predictions", exist_ok=True)
np.savetxt("./predictions/rets_auto_enc_10_matrix", rec_matrix, delimiter=",")

TypeError: 'NoneType' object is not subscriptable

In [None]:
def build_recommendations(embeddings, ids, infos, topK=10):
    """Return the top-K most similar items for every item as a long table.

    Args:
        embeddings: 2-D array-like, one embedding per row.
        ids: sequence of identifiers, ``ids[i]`` naming row ``i`` of ``embeddings``.
        infos: unused; kept so existing calls keep working.
        topK: number of neighbours per item (an item is never paired with itself).

    Returns:
        DataFrame with columns ``source_id``, ``target_id`` and ``similarity``.
        Rows are grouped by source in input order and sorted by descending
        cosine similarity within each source; ties keep the order of ``ids``.
    """
    columns = ["source_id", "target_id", "similarity"]
    embeddings = np.asarray(embeddings, dtype=np.float64)
    N = len(embeddings)

    # An item cannot be its own neighbour, so at most N - 1 rows per source.
    k = max(0, min(topK, N - 1))
    if k == 0:
        return pd.DataFrame(columns=columns)

    norms = np.linalg.norm(embeddings, axis=1)  # computed once, reused for every row
    source_ids, target_ids, score_chunks = [], [], []

    for i in tqdm(range(N), desc="Building Recommendations"):
        # One matrix-vector product replaces N Python-level similarity calls;
        # 1e-9 is the same epsilon cosine_similarity() uses by default.
        sims = (embeddings @ embeddings[i]) / (norms * norms[i] + 1e-9)
        sims[i] = -np.inf  # exclude the item itself
        # Stable sort on the negated scores: descending similarity, with ties
        # resolved in index order like sorted(..., reverse=True).
        order = np.argsort(-sims, kind="stable")[:k]
        source_ids.extend([ids[i]] * k)
        target_ids.extend(ids[j] for j in order)
        score_chunks.append(sims[order])

    return pd.DataFrame(
        {
            "source_id": source_ids,
            "target_id": target_ids,
            "similarity": np.concatenate(score_chunks),
        },
        columns=columns,
    )

# Embed every track with the trained autoencoder and export the top-100
# neighbours of each track as a long (source_id, target_id, similarity) table.
full_dataset = Music4AllOnionDataset(fused_df_normalized)
embeddings = compute_all_embeddings(full_dataset, autoenc_model, device=device)
# The dataset sorts its rows by index, so the ids must be read back from the
# dataset; the index order of fused_df_normalized is not guaranteed to match
# the row order of `embeddings`.
ids = full_dataset.feature_df.index.tolist()
recommendations = build_recommendations(embeddings, ids, fused_df_normalized, topK=100)
# DataFrame.to_csv does not create missing parent directories.
os.makedirs("./predictions/ui", exist_ok=True)
# NOTE(review): the file name says "10" while topK is 100 — confirm which is intended.
recommendations.to_csv("./predictions/ui/rets_auto_enc_10", index=False)