In [None]:
# Mount Google Drive; later cells write cleaned CSVs and model checkpoints under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import kagglehub

# Download data and create genre column for tracks using artists
def download_and_clean(output_dir='/content/drive/MyDrive/nlp-proj/data/cleaned_data'):
    """Download the Spotify Kaggle dataset, clean it, and attach genres to tracks.

    Steps:
      1. Download the dataset with ``kagglehub`` and read ``artists.csv`` / ``tracks.csv``.
      2. Drop rows containing NaNs and exact duplicate rows from both frames.
      3. Keep only artists whose ``genres`` cell parses to a non-empty list.
      4. Keep only tracks whose every listed artist name is one of those artists.
      5. Add a ``genres`` column to the tracks: the sorted union of the genres of
         all artists on the track.
      6. Write both cleaned frames to ``output_dir`` as CSV.

    The list-valued cells are stored in the CSVs as Python-literal strings. They are
    parsed with ``ast.literal_eval`` rather than ``eval`` so that file contents can
    never execute code. Cells that cannot be parsed are treated as missing: the
    corresponding artist/track row is dropped.

    Args:
        output_dir: Directory that receives ``cleaned_artists.csv`` and
            ``cleaned_tracks.csv``. Created if it does not exist.

    Returns:
        Tuple ``(artists_df, tracks_df)`` of the cleaned DataFrames. In
        ``artists_df`` the ``genres`` column keeps its original string form;
        in ``tracks_df`` the new ``genres`` column holds Python lists.
    """
    import ast
    import os

    def parse_list(value):
        """Safely parse a stringified list; return None when it is not a valid list literal."""
        if not isinstance(value, str):
            return None
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return None
        return parsed if isinstance(parsed, (list, tuple)) else None

    path = kagglehub.dataset_download("yamaerenay/spotify-dataset-19212020-600k-tracks")
    artists_df = pd.read_csv(os.path.join(path, 'artists.csv'))
    tracks_df = pd.read_csv(os.path.join(path, 'tracks.csv'))

    artists_df = artists_df.dropna().drop_duplicates()
    tracks_df = tracks_df.dropna().drop_duplicates()

    # Parse each artist's genre cell once and keep only artists with at least one genre.
    artist_genres = artists_df['genres'].apply(parse_list)
    has_genres = artist_genres.apply(lambda genres: bool(genres)).astype(bool)
    artists_df = artists_df[has_genres]
    artist_genres = artist_genres[has_genres]

    # Name -> parsed genre list. If a name occurs more than once the last row wins,
    # which matches building the dict from zip(name, genres).
    artist_to_genres = dict(zip(artists_df['name'], artist_genres))

    # Keep only tracks whose artist list parses and whose artists are all known.
    track_artists = tracks_df['artists'].apply(parse_list)
    all_known = track_artists.apply(
        lambda names: names is not None and all(name in artist_to_genres for name in names)
    ).astype(bool)
    # .copy() so the column assignment below writes to an independent frame, not a slice view.
    tracks_df = tracks_df[all_known].copy()
    track_artists = track_artists[all_known]

    def get_genres_for_track(artists_list):
        """Return the de-duplicated genres of all artists on a track, sorted for determinism."""
        genres = {genre for artist in artists_list for genre in artist_to_genres[artist]}
        return sorted(genres)

    tracks_df['genres'] = track_artists.apply(get_genres_for_track)

    os.makedirs(output_dir, exist_ok=True)
    artists_df.to_csv(os.path.join(output_dir, 'cleaned_artists.csv'), index=False)
    tracks_df.to_csv(os.path.join(output_dir, 'cleaned_tracks.csv'), index=False)

    return artists_df, tracks_df


In [None]:
# from data.cleaning import download_and_clean

# Run the download + cleaning pipeline and keep both cleaned frames for the cells below.
artists, tracks = download_and_clean()

Downloading from https://www.kaggle.com/api/v1/datasets/download/yamaerenay/spotify-dataset-19212020-600k-tracks?dataset_version_number=1...


100%|██████████| 193M/193M [00:01<00:00, 144MB/s]

Extracting files...





In [None]:
# Preview the first rows of the cleaned artists frame.
print(artists.head())

                         id  followers                      genres  \
45   0VLMVnVbJyJ4oyZs2L3Yl2       71.0          ['carnaval cadiz']   
46   0dt23bs4w8zx154C5xdVyl       63.0          ['carnaval cadiz']   
47   0pGhoB99qpEJEsBQxgaskQ       64.0          ['carnaval cadiz']   
48   3HDrX2OtSuXLW5dLR85uN3       53.0          ['carnaval cadiz']   
136  22mLrN5fkppmuUPsHx6i2G       59.0  ['classical harp', 'harp']   

                             name  popularity  
45   Las Viudas De Los Bisabuelos           6  
46              Los De Capuchinos           5  
47             Los “Pofesionales”           7  
48      Los Que No Paran De Rajar           6  
136                   Vera Dulova           3  


In [None]:
# Preview the first rows of the cleaned tracks frame.
print(tracks.head())

                       id                                name  popularity  \
2  07A5yehtSnoedViJAZkNnc  Vivo para Quererte - Remasterizado           0   
3  08FmqUhxtyLTn6pAh6bk45       El Prisionero - Remasterizado           0   
4  08y9GfoqCWfOGsKdwojr5e                 Lady of the Evening           0   
5  0BRXJHRNGQ3W4v9frnSfhu                           Ave Maria           0   
7  0IA0Hju8CAgYfV1hwhidBH                             La Java           0   

   duration_ms  explicit              artists                  id_artists  \
2       181640         0  ['Ignacio Corsini']  ['5LiOoJbxVSAMkBS2fUm3X2']   
3       176907         0  ['Ignacio Corsini']  ['5LiOoJbxVSAMkBS2fUm3X2']   
4       163080         0      ['Dick Haymes']  ['3BiJGZsyX9sJchTqcSA7Su']   
5       178933         0      ['Dick Haymes']  ['3BiJGZsyX9sJchTqcSA7Su']   
7       161427         0      ['Mistinguett']  ['4AxgXfD7ISvJSTObqm4aIE']   

  release_date  danceability  energy  ...  loudness  mode  speechiness  \


In [None]:
# The next few cells create a multi-hot vector for storing the genre labels for modeling
from itertools import chain
# Flatten every track's genre list into one set of distinct genre names.
all_genres = set(chain.from_iterable(tracks["genres"]))
# Displayed as the cell output: the number of distinct genres.
len(all_genres)
# all_genres

4706

In [None]:
# Assign each genre a column position; sorting makes the layout independent of set order.
genre_to_index = dict(zip(sorted(all_genres), range(len(all_genres))))
# genre_to_index

In [None]:
def multihot_vector(genres, index_dict):
    """Encode ``genres`` as a list of 0/1 flags with one slot per entry of ``index_dict``.

    Args:
        genres: Iterable of genre names to switch on.
        index_dict: Mapping from genre name to its position in the output list.

    Returns:
        A list of ``len(index_dict)`` ints, 1 at the position of every genre in
        ``genres`` and 0 elsewhere.

    Raises:
        KeyError: If a genre is not a key of ``index_dict``.
    """
    encoded = [0 for _ in range(len(index_dict))]
    for position in (index_dict[name] for name in genres):
        encoded[position] = 1
    return encoded

In [None]:
# Encode each track's genre list as a multi-hot vector laid out according to genre_to_index.
# NOTE(review): every row stores a Python list with len(genre_to_index) entries, so memory
# grows with rows x genres — confirm this fits in RAM or consider a sparse representation.
tracks["multi_hot_genres"] = tracks["genres"].apply(
    lambda genres: multihot_vector(genres, genre_to_index)
)
# Displayed as the cell output.
tracks["multi_hot_genres"]

Unnamed: 0,multi_hot_genres
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...
586667,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
586668,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
586669,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
586670,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Split the data into 80-10-10 (train / dev / test)

from sklearn.model_selection import train_test_split

# Feature columns fed to the model; id / artist columns and the label columns are left out.
X = tracks[["name", "popularity", "duration_ms", "explicit", "release_date", "danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "time_signature"]]
# One multi-hot list per track, in the same row order as X.
y = tracks["multi_hot_genres"].tolist()

# First hold out 20% of the rows, then cut that hold-out in half for dev and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=478)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=478)

In [None]:
# Report how many rows ended up in each split.
print(f"Train set size: {len(X_train)}")
print(f"Dev set size: {len(X_dev)}")
print(f"Test set size: {len(X_test)}")

Train set size: 399251
Dev set size: 49906
Test set size: 49907


In [None]:
# Drop genres that have fewer than 100 occurrences in the training split
import numpy as np

# Per-label positive counts, computed from the training labels only.
label_counts = np.sum(y_train, axis=0)

threshold =100
# Column positions (in the original multi-hot layout) of labels meeting the threshold.
selected_labels = np.where(label_counts >= threshold)[0]

def filter_labels(y, selected_labels):
    """Keep only the label columns listed in ``selected_labels``.

    Uses NumPy column indexing instead of a nested Python loop over every
    (sample, label) pair, which matters for hundreds of thousands of rows.

    Args:
        y: 2-D array-like of shape (n_samples, n_labels), e.g. a list of
            multi-hot lists or an ndarray.
        selected_labels: 1-D sequence of integer column positions to keep,
            in the order they should appear in the result.

    Returns:
        ndarray of shape (n_samples, len(selected_labels)). For empty ``y`` an
        empty array of shape (0, len(selected_labels)) is returned so callers
        can keep treating the result as 2-D.
    """
    columns = np.asarray(selected_labels, dtype=np.intp)
    matrix = np.asarray(y)
    if matrix.size == 0 and matrix.ndim < 2:
        # No samples at all: there is nothing to index, but preserve the column count.
        return np.empty((0, columns.size), dtype=int)
    return matrix[:, columns]

# Keep only the frequent label columns in every split.
y_train = filter_labels(y_train, selected_labels)
y_dev = filter_labels(y_dev, selected_labels)
y_test = filter_labels(y_test, selected_labels)

# selected_labels holds column positions in the original multi-hot layout (the layout
# defined by genre_to_index), not genre names. Translate each position back to its genre
# name so the lookups really map genre name <-> column of the filtered label matrix.
original_index_to_genre = {idx: genre for genre, idx in genre_to_index.items()}
filtered_genre_to_index = {
    original_index_to_genre[int(original_idx)]: new_idx
    for new_idx, original_idx in enumerate(selected_labels)
}
index_to_genre = {v: k for k, v in filtered_genre_to_index.items()}

print(f"Original number of labels: {len(label_counts)}")
print(f"Filtered number of labels: {len(selected_labels)}")


Original number of labels: 4706
Filtered number of labels: 1529


In [None]:
# Drop rows with no positive labels
def remove_empty_labels(X, y):
    """Drop samples that have no positive label.

    Args:
        X: DataFrame of features; rows are selected by position.
        y: Array-like of per-sample label vectors, aligned row-for-row with ``X``.

    Returns:
        Tuple ``(X_filtered, y_filtered)``: ``X`` restricted to rows whose label
        vector sums to more than 0 (index reset to 0..n-1), and the matching
        rows of ``y`` as an ndarray. When no row survives, ``y_filtered`` keeps
        its trailing dimensions (e.g. shape (0, n_labels)).
    """
    labels = np.asarray(y)
    # Sum over every axis except the sample axis, so each sample gets one total.
    row_totals = labels.sum(axis=tuple(range(1, labels.ndim)))
    keep_positions = np.flatnonzero(row_totals > 0)
    X_filtered = X.iloc[keep_positions].reset_index(drop=True)
    y_filtered = labels[keep_positions]
    return X_filtered, y_filtered

# Apply the empty-label filter to each split; the split variables are rebound to the
# filtered versions, so X_* and y_* stay aligned row-for-row.
X_train, y_train = remove_empty_labels(X_train, y_train)
X_dev, y_dev = remove_empty_labels(X_dev, y_dev)
X_test, y_test = remove_empty_labels(X_test, y_test)

# Report the split sizes after rows without any remaining label were removed.
print(f"Train set size after filtering: {len(X_train)}")
print(f"Dev set size after filtering: {len(X_dev)}")
print(f"Test set size after filtering: {len(X_test)}")

Train set size after filtering: 386974
Dev set size after filtering: 48297
Test set size after filtering: 48376


In [None]:
from transformers import RobertaModel, RobertaTokenizer
import torch
import torch.nn as nn
from transformers import TrainingArguments, Trainer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, jaccard_score, hamming_loss
import numpy as np

# Load the pretrained "roberta-base" tokenizer; tokenize_data below reads this global.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Tokenizing and preparing the feature for modeling
def tokenize_data(X):
    """Turn each feature row into one text string and tokenize the whole frame.

    Every value in a row is cast to ``str`` and the values are joined with single
    spaces, in column order. The resulting texts are passed to the module-level
    ``tokenizer`` with padding enabled, truncation at 32 tokens, and PyTorch
    tensors as the return type.

    Args:
        X: DataFrame whose rows are to be tokenized.

    Returns:
        Whatever the tokenizer call returns; callers index it with
        ``"input_ids"`` and ``"attention_mask"``.
    """
    def row_to_text(row):
        return " ".join(row.astype(str))

    texts = X.apply(row_to_text, axis=1).tolist()
    encoding_options = dict(
        padding=True,
        truncation=True,
        max_length=32,
        return_tensors="pt",
    )
    return tokenizer(texts, **encoding_options)

# Dataset preparation
class MultiLabelDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenized feature rows with their label vectors.

    The whole frame is tokenized once in the constructor (via ``tokenize_data``)
    and the resulting tensors are kept on the instance; items are then served by
    plain tensor indexing.
    """

    def __init__(self, X, y):
        """Tokenize ``X``, store the tensors, and print the observed token-length range.

        Args:
            X: Feature DataFrame handed to ``tokenize_data``.
            y: Label vectors, converted with ``torch.tensor``.
        """
        encoded = tokenize_data(X)
        self.input_ids, self.attention_mask = encoded["input_ids"], encoded["attention_mask"]
        self.labels = torch.tensor(y)

        # Sanity check: shows the shortest and longest encoded sequence in this split.
        lengths = list(map(len, self.input_ids))
        shortest, longest = min(lengths), max(lengths)
        print(f"Token length range: {shortest} to {longest}")

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` entries at ``idx``."""
        fields = ("input_ids", "attention_mask", "labels")
        return {field: getattr(self, field)[idx] for field in fields}

In [None]:
# Tokenize each split once up front and wrap it as a torch Dataset; each constructor
# prints the token-length range it observed.
train_dataset = MultiLabelDataset(X_train, y_train)
dev_dataset = MultiLabelDataset(X_dev, y_dev)
test_dataset = MultiLabelDataset(X_test, y_test)

Token length range: 32 to 32
Token length range: 32 to 32
Token length range: 32 to 32


In [None]:
# RoBERTa, following similar to the model described in “Punk or Funk: Understanding the Performance of RoBERTa on Music Genre Classification.”
class MultiLabelRoBERTa(nn.Module):
    """RoBERTa encoder followed by a small feed-forward head for multi-label output.

    The head maps the encoder's ``pooler_output`` through
    Linear(hidden_size, 128) -> ReLU -> Dropout(0.3) -> Linear(128, num_labels)
    and produces one logit per label.
    """

    def __init__(self, num_labels):
        """Load the pretrained "roberta-base" encoder and build the classification head.

        Args:
            num_labels: Number of output logits (one per label).
        """
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        hidden_size = self.roberta.config.hidden_size
        head_layers = [
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute logits and, when ``labels`` is given, the BCE-with-logits loss.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.
            labels: Optional multi-hot targets; cast to float before the loss.

        Returns:
            Dict with keys ``"loss"`` (``None`` when no labels were supplied)
            and ``"logits"``.
        """
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(encoded.pooler_output)

        if labels is None:
            return {"loss": None, "logits": logits}

        loss_fn = nn.BCEWithLogitsLoss()
        return {"loss": loss_fn(logits, labels.float()), "logits": logits}

# Metrics
def compute_metrics(pred, threshold_range=(0.05, 0.55, 0.05), prob=True):
    """Sweep decision thresholds and report the multi-label metrics at the best-F1 one.

    Args:
        pred: Pair ``(outputs, labels)``. ``outputs`` are raw logits when
            ``prob`` is True, otherwise values that are compared against the
            thresholds directly (e.g. already-binary predictions).
        threshold_range: ``(start, stop, step)`` passed to ``np.arange`` to
            generate the candidate thresholds.
        prob: When True, a sigmoid is applied to ``outputs`` first.

    Returns:
        Dict with the min / max / mean of the scored outputs, the threshold
        that maximizes micro-F1 (first one wins on ties), and the subset
        accuracy, micro precision / recall / F1, Hamming loss and
        sample-averaged Jaccard score at that threshold.

    Raises:
        ValueError: If ``threshold_range`` yields no thresholds.
    """
    logits, labels = pred
    labels = np.asarray(labels)
    if prob:
        # Work in NumPy from here on so thresholding and sklearn see one array type.
        scores = torch.sigmoid(torch.as_tensor(logits, dtype=torch.float32)).detach().cpu().numpy()
    else:
        scores = np.asarray(logits)

    # Summary of the score distribution, useful for judging the threshold sweep range.
    prob_min = float(scores.min())
    prob_max = float(scores.max())
    prob_mean = float(scores.mean())

    def calc_metrics(labels, predictions):
        # zero_division=0 pins the value used when a ratio has an empty denominator
        # (e.g. a threshold with no positive predictions) instead of emitting warnings.
        return {
            "accuracy": accuracy_score(labels, predictions),
            "precision": precision_score(labels, predictions, average="micro", zero_division=0),
            "recall": recall_score(labels, predictions, average="micro", zero_division=0),
            "f1": f1_score(labels, predictions, average="micro", zero_division=0),
            "hamming": hamming_loss(labels, predictions),
            "jaccard": jaccard_score(labels, predictions, average="samples", zero_division=0),
        }

    # np.arange accumulates float error (0.15 becomes 0.15000000000000002); round the
    # thresholds and convert to plain floats so the reported value is clean.
    thresholds = [round(float(t), 10) for t in np.arange(*threshold_range)]
    if not thresholds:
        raise ValueError(f"threshold_range {threshold_range!r} produced no thresholds")

    threshold_results = {}
    for threshold in thresholds:
        predictions = (scores > threshold).astype(int)
        threshold_results[threshold] = calc_metrics(labels, predictions)

    # Find optimal probability threshold
    optimal_threshold = max(threshold_results, key=lambda t: threshold_results[t]["f1"])
    optimal_threshold_metrics = threshold_results[optimal_threshold]

    return {
        "prob_min": prob_min,
        "prob_max": prob_max,
        "prob_mean": prob_mean,
        "optimal_threshold": optimal_threshold,
        "optimal_threshold_accuracy": optimal_threshold_metrics["accuracy"],
        "optimal_threshold_precision": optimal_threshold_metrics["precision"],
        "optimal_threshold_recall": optimal_threshold_metrics["recall"],
        "optimal_threshold_f1": optimal_threshold_metrics["f1"],
        "optimal_threshold_hamming": optimal_threshold_metrics["hamming"],
        "optimal_threshold_jaccard": optimal_threshold_metrics["jaccard"],
    }


In [None]:
# Initialize the model
def initialize_model():
    """Build a fresh MultiLabelRoBERTa sized to the current training labels.

    The number of outputs is the length of the first row of the global
    ``y_train``. Gradient checkpointing is switched on for the encoder.

    Returns:
        The newly constructed model.
    """
    classifier = MultiLabelRoBERTa(num_labels=len(y_train[0]))
    classifier.roberta.gradient_checkpointing_enable()
    return classifier

# Build one instance right away.
model = initialize_model()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Random chance baseline
# Predict each label independently with probability equal to its training-set frequency.
label_frequencies = y_train.mean(axis=0)
# Seeded generator so the baseline numbers are reproducible across runs.
baseline_rng = np.random.default_rng(478)
random_predictions = baseline_rng.random(y_dev.shape) < label_frequencies
# prob=False: the predictions are already binary, so no sigmoid is applied.
random_baseline_metrics = compute_metrics((random_predictions, y_dev), prob=False)
print("Random Baseline Metrics:", random_baseline_metrics)

In [None]:
from sklearn.model_selection import ParameterSampler

# Hyperparameters; we are very limited in processing power, so only searching with low # of epochs
param_grid = {
    "learning_rate": [1e-5],
    "num_train_epochs": [5, 10],
    "per_device_train_batch_size": [32, 64, 128],
    "weight_decay": [0.01]
}

# Random Search, like in the Stanford Paper
n_trials = 5
param_samples = list(ParameterSampler(param_grid, n_iter=n_trials, random_state=478))

best_score = -np.inf
best_params = None
best_model = None  # holds the Trainer of the best trial so far


# Begin Search
# The loop runs inside try/finally so that the best result found so far is still
# reported and saved if a trial fails or the run is interrupted; the exception
# itself is not swallowed and still stops the cell.
try:
    for i, params in enumerate(param_samples):
        print(f"Trial {i+1}/{n_trials} with parameters: {params}")

        # Fresh model per trial so trials do not share weights.
        model = initialize_model()

        training_args = TrainingArguments(
            output_dir=f"./results/trial_{i}",
            eval_strategy="no",
            save_strategy="no",
            learning_rate=params["learning_rate"],
            per_device_train_batch_size=params["per_device_train_batch_size"],
            per_device_eval_batch_size=16,
            num_train_epochs=params["num_train_epochs"],
            weight_decay=params["weight_decay"],
            fp16=True
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=dev_dataset,
            processing_class=tokenizer,
            compute_metrics=compute_metrics
        )

        trainer.train()

        # Evaluate on the dev split; the Trainer prefixes metric names with "eval_".
        metrics = trainer.evaluate(dev_dataset)
        f1 = metrics["eval_optimal_threshold_f1"]
        print(metrics)

        print(f"Trial {i+1} F1 Score: {f1}")

        # Track best model, saving it immediately so it is on disk before the next trial.
        if f1 > best_score:
            best_score = f1
            best_params = params
            best_model = trainer
            trainer.save_model("/content/drive/MyDrive/nlp-proj/models")
finally:
    # Output
    print(f"Best F1 Score: {best_score}")
    print(f"Best Parameters: {best_params}")

    # Save the best model (only if at least one trial finished evaluation)
    if best_model is not None:
        best_model.save_model("/content/drive/MyDrive/nlp-proj/best-model")
    else:
        print("No trial completed; nothing to save.")


Trial 1/5 with parameters: {'weight_decay': 0.01, 'per_device_train_batch_size': 64, 'num_train_epochs': 5, 'learning_rate': 1e-05}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.5197
1000,0.1786
1500,0.0651
2000,0.039
2500,0.0306
3000,0.0268
3500,0.0246
4000,0.0225
4500,0.0212
5000,0.02


{'eval_loss': 0.010325008071959019, 'eval_prob_min': 2.8012221875428622e-08, 'eval_prob_max': 0.9530133008956909, 'eval_prob_mean': 0.001776322373189032, 'eval_optimal_threshold': 0.1, 'eval_optimal_threshold_accuracy': 0.04132761869267242, 'eval_optimal_threshold_precision': 0.3318626541163993, 'eval_optimal_threshold_recall': 0.3418213796079331, 'eval_optimal_threshold_f1': 0.3367684094969542, 'eval_optimal_threshold_hamming': 0.0033379414296321867, 'eval_optimal_threshold_jaccard': 0.1995347845728584, 'eval_runtime': 1021.0637, 'eval_samples_per_second': 47.301, 'eval_steps_per_second': 2.957, 'epoch': 5.0}
Trial 1 F1 Score: 0.3367684094969542


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 2/5 with parameters: {'weight_decay': 0.01, 'per_device_train_batch_size': 64, 'num_train_epochs': 10, 'learning_rate': 1e-05}


Step,Training Loss
500,0.5177
1000,0.1752
1500,0.0631
2000,0.0379
2500,0.03
3000,0.0263
3500,0.0241
4000,0.0222
4500,0.0211
5000,0.0199


{'eval_loss': 0.0087470393627882, 'eval_prob_min': 2.0091785180031962e-10, 'eval_prob_max': 0.991455614566803, 'eval_prob_mean': 0.0019898591563105583, 'eval_optimal_threshold': 0.15000000000000002, 'eval_optimal_threshold_accuracy': 0.05836801457647473, 'eval_optimal_threshold_precision': 0.40594184229999725, 'eval_optimal_threshold_recall': 0.4045094794107526, 'eval_optimal_threshold_f1': 0.4052243951017192, 'eval_optimal_threshold_hamming': 0.002943959961711187, 'eval_optimal_threshold_jaccard': 0.26324235873827073, 'eval_runtime': 1022.8546, 'eval_samples_per_second': 47.218, 'eval_steps_per_second': 2.952, 'epoch': 10.0}
Trial 2 F1 Score: 0.4052243951017192
Trial 3/5 with parameters: {'weight_decay': 0.01, 'per_device_train_batch_size': 128, 'num_train_epochs': 10, 'learning_rate': 1e-05}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.5126
1000,0.1764
1500,0.065
2000,0.0388
2500,0.0306
3000,0.0266
3500,0.0241
4000,0.0224
4500,0.0211
5000,0.0199


{'eval_loss': 0.009633146226406097, 'eval_prob_min': 6.6534853182531606e-09, 'eval_prob_max': 0.9772841930389404, 'eval_prob_mean': 0.0018655636813491583, 'eval_optimal_threshold': 0.15000000000000002, 'eval_optimal_threshold_accuracy': 0.05321241484978363, 'eval_optimal_threshold_precision': 0.4097401202749141, 'eval_optimal_threshold_recall': 0.33344803666136846, 'eval_optimal_threshold_f1': 0.36767819124108253, 'eval_optimal_threshold_hamming': 0.00284344011444448, 'eval_optimal_threshold_jaccard': 0.22467171224014845, 'eval_runtime': 1026.0573, 'eval_samples_per_second': 47.07, 'eval_steps_per_second': 2.942, 'epoch': 10.0}
Trial 3 F1 Score: 0.36767819124108253
Trial 4/5 with parameters: {'weight_decay': 0.01, 'per_device_train_batch_size': 32, 'num_train_epochs': 10, 'learning_rate': 1e-05}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.5133
1000,0.1738
1500,0.0631
2000,0.0377
2500,0.0301
3000,0.0263
3500,0.0241
4000,0.0226
4500,0.0215
5000,0.0207


Step,Training Loss
500,0.5133
1000,0.1738
1500,0.0631
2000,0.0377
2500,0.0301
3000,0.0263
3500,0.0241
4000,0.0226
4500,0.0215
5000,0.0207


{'eval_loss': 0.008010685443878174, 'eval_prob_min': 7.371428631444677e-16, 'eval_prob_max': 0.9959456324577332, 'eval_prob_mean': 0.0020467969588935375, 'eval_optimal_threshold': 0.2, 'eval_optimal_threshold_accuracy': 0.07874195084580823, 'eval_optimal_threshold_precision': 0.47594430445576735, 'eval_optimal_threshold_recall': 0.4198961115571796, 'eval_optimal_threshold_f1': 0.44616688237444935, 'eval_optimal_threshold_hamming': 0.0025844555961936685, 'eval_optimal_threshold_jaccard': 0.296238329902899, 'eval_runtime': 1029.2215, 'eval_samples_per_second': 46.926, 'eval_steps_per_second': 2.933, 'epoch': 10.0}
Trial 4 F1 Score: 0.44616688237444935
Trial 5/5 with parameters: {'weight_decay': 0.01, 'per_device_train_batch_size': 32, 'num_train_epochs': 5, 'learning_rate': 1e-05}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.5137
1000,0.175
1500,0.0637
2000,0.038
2500,0.0302
3000,0.0264
3500,0.0242
4000,0.0226
4500,0.0215
5000,0.0206


KeyboardInterrupt: 

In [None]:
 #Evaluate on test
 # best_model is the Trainer kept from the best-scoring search trial.
 # NOTE(review): compute_metrics chooses its threshold by maximizing F1 on the data it is
 # given, so this call picks the threshold on the test set itself — confirm that is intended.
 best_model.evaluate(test_dataset)