In [2]:
# Download data and create genre column for tracks using artists


from data.cleaning import download_and_clean

artists, tracks = download_and_clean()

In [3]:
print(artists.head())

                         id  followers                      genres  \
45   0VLMVnVbJyJ4oyZs2L3Yl2       71.0          ['carnaval cadiz']   
46   0dt23bs4w8zx154C5xdVyl       63.0          ['carnaval cadiz']   
47   0pGhoB99qpEJEsBQxgaskQ       64.0          ['carnaval cadiz']   
48   3HDrX2OtSuXLW5dLR85uN3       53.0          ['carnaval cadiz']   
136  22mLrN5fkppmuUPsHx6i2G       59.0  ['classical harp', 'harp']   

                             name  popularity  
45   Las Viudas De Los Bisabuelos           6  
46              Los De Capuchinos           5  
47             Los “Pofesionales”           7  
48      Los Que No Paran De Rajar           6  
136                   Vera Dulova           3  


In [4]:
print(tracks.head())

                       id                                name  popularity  \
2  07A5yehtSnoedViJAZkNnc  Vivo para Quererte - Remasterizado           0   
3  08FmqUhxtyLTn6pAh6bk45       El Prisionero - Remasterizado           0   
4  08y9GfoqCWfOGsKdwojr5e                 Lady of the Evening           0   
5  0BRXJHRNGQ3W4v9frnSfhu                           Ave Maria           0   
7  0IA0Hju8CAgYfV1hwhidBH                             La Java           0   

   duration_ms  explicit              artists                  id_artists  \
2       181640         0  ['Ignacio Corsini']  ['5LiOoJbxVSAMkBS2fUm3X2']   
3       176907         0  ['Ignacio Corsini']  ['5LiOoJbxVSAMkBS2fUm3X2']   
4       163080         0      ['Dick Haymes']  ['3BiJGZsyX9sJchTqcSA7Su']   
5       178933         0      ['Dick Haymes']  ['3BiJGZsyX9sJchTqcSA7Su']   
7       161427         0      ['Mistinguett']  ['4AxgXfD7ISvJSTObqm4aIE']   

  release_date  danceability  energy  ...  loudness  mode  speechiness  \


In [5]:
# The next few cells create a multi-hot vector for storing the genre labels for modeling

from itertools import chain
all_genres = set(chain.from_iterable(tracks["genres"]))
len(all_genres)
# all_genres

4706

In [6]:
genre_to_index = {genre: idx for idx, genre in enumerate(sorted(all_genres))}
# genre_to_index

In [7]:
def multihot_vector(genres, index_dict):
    multihot = [0] * len(index_dict)
    for genre in genres:
        multihot[index_dict[genre]] = 1
    return multihot

In [8]:
tracks["multi_hot_genres"] = tracks["genres"].apply(
    lambda genres: multihot_vector(genres, genre_to_index)
)
tracks["multi_hot_genres"]

2         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                                ...                        
586667    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
586668    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
586669    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
586670    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
586671    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: multi_hot_genres, Length: 499064, dtype: object

In [57]:
# Split the data into 80-10-10

from sklearn.model_selection import train_test_split

X = tracks[["name", "popularity", "duration_ms", "explicit", "release_date", "danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "time_signature"]]
y = tracks["multi_hot_genres"].tolist()

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=478)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=478)

In [58]:
print(f"Train set size: {len(X_train)}")
print(f"Dev set size: {len(X_dev)}")
print(f"Test set size: {len(X_test)}")

Train set size: 399251
Dev set size: 49906
Test set size: 49907


In [59]:
# Drop genres that have less than 100 occurences
import numpy as np

label_counts = np.sum(y_train, axis=0)

threshold =100
selected_labels = np.where(label_counts >= threshold)[0]

def filter_labels(y, selected_labels):
    return np.array([[y_sample[i] for i in selected_labels] for y_sample in y])

y_train = filter_labels(y_train, selected_labels)
y_dev = filter_labels(y_dev, selected_labels)
y_test = filter_labels(y_test, selected_labels)

filtered_genre_to_index = {genre: idx for idx, genre in enumerate(selected_labels)}
index_to_genre = {v: k for k, v in filtered_genre_to_index.items()}

print(f"Original number of labels: {len(label_counts)}")
print(f"Filtered number of labels: {len(selected_labels)}")


KeyboardInterrupt: 

In [None]:
# Drop rows with no positive labels
def remove_empty_labels(X, y):
    non_empty_indices = [i for i, labels in enumerate(y) if np.sum(labels) > 0]
    X_filtered = X.iloc[non_empty_indices].reset_index(drop=True)
    y_filtered = np.array([y[i] for i in non_empty_indices])
    return X_filtered, y_filtered

X_train, y_train = remove_empty_labels(X_train, y_train)
X_dev, y_dev = remove_empty_labels(X_dev, y_dev)
X_test, y_test = remove_empty_labels(X_test, y_test)

print(f"Train set size after filtering: {len(X_train)}")
print(f"Dev set size after filtering: {len(X_dev)}")
print(f"Test set size after filtering: {len(X_test)}")

In [None]:
from transformers import RobertaModel, RobertaTokenizer
import torch
import torch.nn as nn
from transformers import TrainingArguments, Trainer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, jaccard_score, hamming_loss
import numpy as np

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [None]:
# Tokenizing and preparing the feature for modeling
def tokenize_data(X):
    combined_text = X.apply(lambda row: " ".join(row.astype(str)), axis=1)
    
    # Tokenize the combined text
    return tokenizer(
        combined_text.tolist(),
        padding=True,
        truncation=True,
        max_length=16,
        return_tensors="pt"
    )

# Dataset preparation
class MultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        tokenized = tokenize_data(X)

        self.input_ids = tokenized["input_ids"]
        self.attention_mask = tokenized["attention_mask"]
        self.labels = torch.tensor(y)

        token_lengths = [len(ids) for ids in self.input_ids]
        print(f"Token length range: {min(token_lengths)} to {max(token_lengths)}")

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }

In [None]:
train_dataset = MultiLabelDataset(X_train, y_train)
dev_dataset = MultiLabelDataset(X_dev, y_dev)
test_dataset = MultiLabelDataset(X_test, y_test)

In [None]:
# RoBERTa, following similar to the model described in “Punk or Funk: Understanding the Performance of RoBERTa on Music Genre Classification.” 
class MultiLabelRoBERTa(nn.Module):
    def __init__(self, num_labels):
        super(MultiLabelRoBERTa, self).__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.classifier = nn.Sequential(
            nn.Linear(self.roberta.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        
        logits = self.classifier(outputs.pooler_output)
        
        loss = None
        if labels is not None:
            criterion = nn.BCEWithLogitsLoss()
            loss = criterion(logits, labels.float())
        
        return {"loss": loss, "logits": logits}

# Metrics
def compute_metrics(pred, threshold_range=(0.05, 0.55, 0.05), prob=True):
    logits, labels = pred
    output = logits
    if prob:
        output = torch.sigmoid(torch.tensor(logits))

    # Find probability range
    prob_min = output.min().item()
    prob_max = output.max().item()
    prob_mean = output.mean().item()

    def prob_threshold_pred(output, threshold):
        if isinstance(output, np.ndarray):
            return (output > threshold).astype(int)
        return (output > threshold).int()

    def calc_metrics(labels, predictions):
        subset_accuracy = accuracy_score(labels, predictions)
        precision = precision_score(labels, predictions, average="micro")
        recall = recall_score(labels, predictions, average="micro")
        f1 = f1_score(labels, predictions, average="micro")
        jaccard = jaccard_score(labels, predictions, average='samples')
        hamming = hamming_loss(labels, predictions)

        return subset_accuracy, precision, recall, f1, jaccard, hamming
        
    threshold_results = {}
    for threshold in np.arange(*threshold_range):
        predictions = prob_threshold_pred(output, threshold)
        subset_accuracy, precision, recall, f1, jaccard, hamming = calc_metrics(labels, predictions)
        threshold_results[threshold] = {
            "accuracy": subset_accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "hamming": hamming,
            "jaccard": jaccard
        }

    # Find optimal probability threshold
    optimal_threshold = max(threshold_results, key=lambda t: threshold_results[t]["f1"])
    optimal_threshold_metrics = threshold_results[optimal_threshold]

    return {
        "prob_min": prob_min,
        "prob_max": prob_max,
        "prob_mean": prob_mean,
        "optimal_threshold": optimal_threshold,
        "optimal_threshold_accuracy": optimal_threshold_metrics["accuracy"],
        "optimal_threshold_precision": optimal_threshold_metrics["precision"],
        "optimal_threshold_recall": optimal_threshold_metrics["recall"],
        "optimal_threshold_f1": optimal_threshold_metrics["f1"],
        "optimal_threshold_hamming": optimal_threshold_metrics["hamming"],
        "optimal_threshold_jaccard": optimal_threshold_metrics["jaccard"],
    }


In [None]:
# Initialize the model
def initialize_model():
    num_labels = len(y_train[0])
    model = MultiLabelRoBERTa(num_labels=num_labels)
    model.roberta.gradient_checkpointing_enable()
    return model
model = initialize_model()

In [None]:
from sklearn.model_selection import ParameterSampler
import numpy as np
import torch

# Hyperparameters; we are very limited in processing power, so only searching with low # of epochs
param_grid = {
    "learning_rate": [1e-5],
    "num_train_epochs": [5, 10],
    "per_device_train_batch_size": [32, 64, 128],
    "weight_decay": [0.01]
}

# Random Search, like in the Stanford Paper
n_trials = 5
param_samples = list(ParameterSampler(param_grid, n_iter=n_trials, random_state=478))

best_score = -np.inf
best_params = None
best_model = None

for i, params in enumerate(param_samples):
    print(f"Trial {i+1}/{n_trials} with parameters: {params}")

    model = initialize_model()

    training_args = TrainingArguments(
        output_dir=f"./results/trial_{i}",
        eval_strategy="no",
        save_strategy="no",
        learning_rate=params["learning_rate"],
        per_device_train_batch_size=params["per_device_train_batch_size"],
        per_device_eval_batch_size=16, 
        num_train_epochs=params["num_train_epochs"],
        weight_decay=params["weight_decay"],
        fp16=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Evaluate
    metrics = trainer.evaluate(dev_dataset)
    f1 = metrics["eval_optimal_threshold_f1"]

    print(f"Trial {i+1} F1 Score: {f1}")
    print(metrics)

    # Track best model
    if f1 > best_score:
        best_score = f1
        best_params = params
        best_model = trainer
        trainer.save_model("./models")

# Output
print(f"Best F1 Score: {best_score}")
print(f"Best Parameters: {best_params}")

# Save the best model
best_model.save_model("./best-model")


In [61]:
from sklearn.model_selection import train_test_split

##No Energy##

X = tracks[["name", "popularity", "duration_ms", "explicit", "release_date", "danceability", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "time_signature"]]
y = tracks["multi_hot_genres"].tolist()

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=478)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=478)

print(f"Train set size: {len(X_train)}")
print(f"Dev set size: {len(X_dev)}")
print(f"Test set size: {len(X_test)}")

# Count label frequencies in the training set
import numpy as np

label_counts = np.sum(y_train, axis=0)

# Set a threshold for filtering labels
threshold =100
selected_labels = np.where(label_counts >= threshold)[0]

# Filter labels in train, dev, and test sets
def filter_labels(y, selected_labels):
    return np.array([[y_sample[i] for i in selected_labels] for y_sample in y])

y_train = filter_labels(y_train, selected_labels)
y_dev = filter_labels(y_dev, selected_labels)
y_test = filter_labels(y_test, selected_labels)

# Update genre_to_index mapping
filtered_genre_to_index = {genre: idx for idx, genre in enumerate(selected_labels)}
index_to_genre = {v: k for k, v in filtered_genre_to_index.items()}

# Step 5: Print results for verification
print(f"Original number of labels: {len(label_counts)}")
print(f"Filtered number of labels: {len(selected_labels)}")

# Remove rows with no positive labels
def remove_empty_labels(X, y):
    non_empty_indices = [i for i, labels in enumerate(y) if np.sum(labels) > 0]
    X_filtered = X.iloc[non_empty_indices].reset_index(drop=True)
    y_filtered = np.array([y[i] for i in non_empty_indices])
    return X_filtered, y_filtered

# Apply to train, dev, and test sets
X_train, y_train = remove_empty_labels(X_train, y_train)
X_dev, y_dev = remove_empty_labels(X_dev, y_dev)
X_test, y_test = remove_empty_labels(X_test, y_test)

# Verify
print(f"Train set size after filtering: {len(X_train)}")
print(f"Dev set size after filtering: {len(X_dev)}")
print(f"Test set size after filtering: {len(X_test)}")

def tokenize_data(X):
    combined_text = X.apply(lambda row: " ".join(row.astype(str)), axis=1)
    
    # Tokenize the combined text
    return tokenizer(
        combined_text.tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

# Convert splits into datasets
train_dataset = MultiLabelDataset(X_train, y_train)
dev_dataset = MultiLabelDataset(X_dev, y_dev)
test_dataset = MultiLabelDataset(X_test, y_test)

Train set size: 399251
Dev set size: 49906
Test set size: 49907
Original number of labels: 4706
Filtered number of labels: 1529
Train set size after filtering: 386974
Dev set size after filtering: 48297
Test set size after filtering: 48376
Token length range: 128 to 128
Token length range: 128 to 128
Token length range: 128 to 128


In [66]:
# Initialize the model
num_labels = len(y_train[0])
model = MultiLabelRoBERTa(num_labels=num_labels)
model.roberta.gradient_checkpointing_enable()
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    save_strategy="no",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    # gradient_accumulation_steps=2,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=True
)
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
# # Train the model
#NO ENERGY
trainer.train()

Step,Training Loss
500,0.526
1000,0.1812
1500,0.065
2000,0.0384
2500,0.0306
3000,0.0265
3500,0.0241
4000,0.0224
4500,0.0211
5000,0.0202


TrainOutput(global_step=120930, training_loss=0.013277571246472776, metrics={'train_runtime': 22566.3133, 'train_samples_per_second': 171.483, 'train_steps_per_second': 5.359, 'total_flos': 0.0, 'train_loss': 0.013277571246472776, 'epoch': 10.0})

In [68]:
# Evaluate on the test set
metrics = trainer.evaluate(test_dataset)
print(metrics)

{'eval_loss': 0.007655394729226828, 'eval_prob_min': 2.911071340580551e-12, 'eval_prob_max': 0.9978594183921814, 'eval_prob_mean': 0.002056489000096917, 'eval_optimal_threshold': 0.2, 'eval_optimal_threshold_accuracy': 0.08092855961633869, 'eval_optimal_threshold_precision': 0.48751641791044775, 'eval_optimal_threshold_recall': 0.44589510470418, 'eval_optimal_threshold_f1': 0.4657778031286095, 'eval_optimal_threshold_hamming': 0.0025324434290233374, 'eval_optimal_threshold_jaccard': 0.31269528987399087, 'eval_runtime': 153.5002, 'eval_samples_per_second': 315.153, 'eval_steps_per_second': 19.7, 'epoch': 10.0}
