In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import kagglehub

# Download data and create genre column for tracks using artists
def download_and_clean():

    path = kagglehub.dataset_download("yamaerenay/spotify-dataset-19212020-600k-tracks")
    artists_path = path + '/artists.csv'
    tracks_path = path + '/tracks.csv'

    artists_df = pd.read_csv(artists_path)
    tracks_df = pd.read_csv(tracks_path)

    artists_df = artists_df.dropna().drop_duplicates()
    tracks_df = tracks_df.dropna().drop_duplicates()

    artists_df = artists_df[artists_df['genres'].apply(lambda x: len(eval(x)) > 0 if isinstance(x, str) else False)]

    valid_artists = set(artists_df['name'])
    tracks_df = tracks_df[tracks_df['artists'].apply(lambda x: all(artist in valid_artists for artist in eval(x)) if isinstance(x, str) else False)]

    artist_to_genres = dict(zip(artists_df['name'], artists_df['genres']))

    def get_genres_for_track(artists_list):
        if isinstance(artists_list, str):
            try:
                artists_list = eval(artists_list)
                genres = [genre for artist in artists_list if artist in artist_to_genres for genre in eval(artist_to_genres[artist])]
                return list(set(genres))
            except:
                return []
        return []

    tracks_df['genres'] = tracks_df['artists'].apply(get_genres_for_track)

    artists_df.to_csv('/content/drive/MyDrive/nlp-proj/data/cleaned_data/cleaned_artists.csv', index=False)
    tracks_df.to_csv('/content/drive/MyDrive/nlp-proj/data/cleaned_data/cleaned_tracks.csv', index=False)

    return artists_df, tracks_df


In [None]:
# from data.cleaning import download_and_clean

artists, tracks = download_and_clean()

Downloading from https://www.kaggle.com/api/v1/datasets/download/yamaerenay/spotify-dataset-19212020-600k-tracks?dataset_version_number=1...


100%|██████████| 193M/193M [00:01<00:00, 178MB/s]

Extracting files...





In [None]:
print(artists.head())

                         id  followers                      genres  \
45   0VLMVnVbJyJ4oyZs2L3Yl2       71.0          ['carnaval cadiz']   
46   0dt23bs4w8zx154C5xdVyl       63.0          ['carnaval cadiz']   
47   0pGhoB99qpEJEsBQxgaskQ       64.0          ['carnaval cadiz']   
48   3HDrX2OtSuXLW5dLR85uN3       53.0          ['carnaval cadiz']   
136  22mLrN5fkppmuUPsHx6i2G       59.0  ['classical harp', 'harp']   

                             name  popularity  
45   Las Viudas De Los Bisabuelos           6  
46              Los De Capuchinos           5  
47             Los “Pofesionales”           7  
48      Los Que No Paran De Rajar           6  
136                   Vera Dulova           3  


In [None]:
print(tracks.head())

                       id                                name  popularity  \
2  07A5yehtSnoedViJAZkNnc  Vivo para Quererte - Remasterizado           0   
3  08FmqUhxtyLTn6pAh6bk45       El Prisionero - Remasterizado           0   
4  08y9GfoqCWfOGsKdwojr5e                 Lady of the Evening           0   
5  0BRXJHRNGQ3W4v9frnSfhu                           Ave Maria           0   
7  0IA0Hju8CAgYfV1hwhidBH                             La Java           0   

   duration_ms  explicit              artists                  id_artists  \
2       181640         0  ['Ignacio Corsini']  ['5LiOoJbxVSAMkBS2fUm3X2']   
3       176907         0  ['Ignacio Corsini']  ['5LiOoJbxVSAMkBS2fUm3X2']   
4       163080         0      ['Dick Haymes']  ['3BiJGZsyX9sJchTqcSA7Su']   
5       178933         0      ['Dick Haymes']  ['3BiJGZsyX9sJchTqcSA7Su']   
7       161427         0      ['Mistinguett']  ['4AxgXfD7ISvJSTObqm4aIE']   

  release_date  danceability  energy  ...  loudness  mode  speechiness  \


In [None]:
# The next few cells create a multi-hot vector for storing the genre labels for modeling

from itertools import chain
all_genres = set(chain.from_iterable(tracks["genres"]))
len(all_genres)
# all_genres

4706

In [None]:
genre_to_index = {genre: idx for idx, genre in enumerate(sorted(all_genres))}
# genre_to_index

In [None]:
def multihot_vector(genres, index_dict):
    multihot = [0] * len(index_dict)
    for genre in genres:
        multihot[index_dict[genre]] = 1
    return multihot

In [None]:
tracks["multi_hot_genres"] = tracks["genres"].apply(
    lambda genres: multihot_vector(genres, genre_to_index)
)
tracks["multi_hot_genres"]

Unnamed: 0,multi_hot_genres
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...
586667,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
586668,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
586669,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
586670,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Split the data into 80-10-10

from sklearn.model_selection import train_test_split

X = tracks[["name", "popularity", "duration_ms", "explicit", "release_date", "energy", "danceability", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "valence", "tempo", "time_signature"]]
y = tracks["multi_hot_genres"].tolist()

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=478)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=478)

In [None]:
print(f"Train set size: {len(X_train)}")
print(f"Dev set size: {len(X_dev)}")
print(f"Test set size: {len(X_test)}")

Train set size: 399251
Dev set size: 49906
Test set size: 49907


In [None]:
# Drop genres that have less than 100 occurences


label_counts = np.sum(y_train, axis=0)

threshold =100
selected_labels = np.where(label_counts >= threshold)[0]

def filter_labels(y, selected_labels):
    return np.array([[y_sample[i] for i in selected_labels] for y_sample in y])

y_train = filter_labels(y_train, selected_labels)
y_dev = filter_labels(y_dev, selected_labels)
y_test = filter_labels(y_test, selected_labels)

filtered_genre_to_index = {genre: idx for idx, genre in enumerate(selected_labels)}
index_to_genre = {v: k for k, v in filtered_genre_to_index.items()}

print(f"Original number of labels: {len(label_counts)}")
print(f"Filtered number of labels: {len(selected_labels)}")


Original number of labels: 4706
Filtered number of labels: 1529


In [None]:
# Drop rows with no positive labels
def remove_empty_labels(X, y):
    non_empty_indices = [i for i, labels in enumerate(y) if np.sum(labels) > 0]
    X_filtered = X.iloc[non_empty_indices].reset_index(drop=True)
    y_filtered = np.array([y[i] for i in non_empty_indices])
    return X_filtered, y_filtered

X_train, y_train = remove_empty_labels(X_train, y_train)
X_dev, y_dev = remove_empty_labels(X_dev, y_dev)
X_test, y_test = remove_empty_labels(X_test, y_test)

print(f"Train set size after filtering: {len(X_train)}")
print(f"Dev set size after filtering: {len(X_dev)}")
print(f"Test set size after filtering: {len(X_test)}")

Train set size after filtering: 386974
Dev set size after filtering: 48297
Test set size after filtering: 48376


In [None]:
!pip install numpy



In [None]:
from transformers import RobertaModel, RobertaTokenizer
import torch
import torch.nn as nn
from transformers import TrainingArguments, Trainer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, jaccard_score, hamming_loss
import numpy as np

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [None]:
# Tokenizing and preparing the feature for modeling
def tokenize_data(X):
    combined_text = X.apply(lambda row: " ".join(row.astype(str)), axis=1)

    return tokenizer(
        combined_text.tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

# Dataset preparation
class MultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        tokenized = tokenize_data(X)

        self.input_ids = tokenized["input_ids"]
        self.attention_mask = tokenized["attention_mask"]
        self.labels = torch.tensor(y)

        token_lengths = [len(ids) for ids in self.input_ids]
        print(f"Token length range: {min(token_lengths)} to {max(token_lengths)}")

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }

In [None]:
train_dataset = MultiLabelDataset(X_train, y_train)
dev_dataset = MultiLabelDataset(X_dev, y_dev)
test_dataset = MultiLabelDataset(X_test, y_test)

Token length range: 128 to 128
Token length range: 128 to 128
Token length range: 128 to 128


In [None]:
# RoBERTa, following similar to the model described in “Punk or Funk: Understanding the Performance of RoBERTa on Music Genre Classification.”
class MultiLabelRoBERTa(nn.Module):
    def __init__(self, num_labels):
        super(MultiLabelRoBERTa, self).__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.classifier = nn.Sequential(
            nn.Linear(self.roberta.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        logits = self.classifier(outputs.pooler_output)

        loss = None
        if labels is not None:
            criterion = nn.BCEWithLogitsLoss()
            loss = criterion(logits, labels.float())

        return {"loss": loss, "logits": logits}

# Metrics
def compute_metrics(pred, threshold_range=(0.05, 0.55, 0.05), prob=True):
    logits, labels = pred
    output = logits
    if prob:
        output = torch.sigmoid(torch.tensor(logits))

    # Find probability range
    prob_min = output.min().item()
    prob_max = output.max().item()
    prob_mean = output.mean().item()

    def prob_threshold_pred(output, threshold):
        if isinstance(output, np.ndarray):
            return (output > threshold).astype(int)
        return (output > threshold).int()

    def calc_metrics(labels, predictions):
        subset_accuracy = accuracy_score(labels, predictions)
        precision = precision_score(labels, predictions, average="micro")
        recall = recall_score(labels, predictions, average="micro")
        f1 = f1_score(labels, predictions, average="micro")
        jaccard = jaccard_score(labels, predictions, average='samples')
        hamming = hamming_loss(labels, predictions)

        return subset_accuracy, precision, recall, f1, jaccard, hamming

    threshold_results = {}
    for threshold in np.arange(*threshold_range):
        predictions = prob_threshold_pred(output, threshold)
        subset_accuracy, precision, recall, f1, jaccard, hamming = calc_metrics(labels, predictions)
        threshold_results[threshold] = {
            "accuracy": subset_accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "hamming": hamming,
            "jaccard": jaccard
        }

    # Find optimal probability threshold
    optimal_threshold = max(threshold_results, key=lambda t: threshold_results[t]["f1"])
    optimal_threshold_metrics = threshold_results[optimal_threshold]

    return {
        "prob_min": prob_min,
        "prob_max": prob_max,
        "prob_mean": prob_mean,
        "optimal_threshold": optimal_threshold,
        "optimal_threshold_accuracy": optimal_threshold_metrics["accuracy"],
        "optimal_threshold_precision": optimal_threshold_metrics["precision"],
        "optimal_threshold_recall": optimal_threshold_metrics["recall"],
        "optimal_threshold_f1": optimal_threshold_metrics["f1"],
        "optimal_threshold_hamming": optimal_threshold_metrics["hamming"],
        "optimal_threshold_jaccard": optimal_threshold_metrics["jaccard"],
    }


In [None]:
# Initialize the model
def initialize_model():
    num_labels = len(y_train[0])
    model = MultiLabelRoBERTa(num_labels=num_labels)
    model.roberta.gradient_checkpointing_enable()
    return model
model = initialize_model()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    save_strategy="no",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    # gradient_accumulation_steps=2,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=True
)


In [None]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Train the model
#NO Liveness
trainer.train()

Step,Training Loss
500,0.5273
1000,0.187
1500,0.0678
2000,0.0395
2500,0.0311
3000,0.027
3500,0.0245
4000,0.0229
4500,0.0215
5000,0.0205


TrainOutput(global_step=120930, training_loss=0.01324376447378411, metrics={'train_runtime': 9955.9934, 'train_samples_per_second': 388.684, 'train_steps_per_second': 12.146, 'total_flos': 0.0, 'train_loss': 0.01324376447378411, 'epoch': 10.0})

In [None]:
# Evaluate on the test set
metrics = trainer.evaluate(test_dataset)
print(metrics)

{'eval_loss': 0.007541674189269543, 'eval_prob_min': 1.9923529514555283e-14, 'eval_prob_max': 0.9971439242362976, 'eval_prob_mean': 0.0020563765428960323, 'eval_optimal_threshold': 0.2, 'eval_optimal_threshold_accuracy': 0.08549694063171821, 'eval_optimal_threshold_precision': 0.4941827602449364, 'eval_optimal_threshold_recall': 0.4583067136265596, 'eval_optimal_threshold_f1': 0.4755690912954175, 'eval_optimal_threshold_hamming': 0.002502619279563195, 'eval_optimal_threshold_jaccard': 0.32229010580074324, 'eval_runtime': 1020.1924, 'eval_samples_per_second': 47.419, 'eval_steps_per_second': 2.964, 'epoch': 10.0}
