In [1]:
import torch
# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Optionally allocate a fraction of GPU memory
torch.cuda.set_per_process_memory_fraction(0.9, 0)

# Check memory stats
print(f'Allocated GPU memory: {torch.cuda.memory_allocated(device) / (1024 ** 3):.2f} GB')
print(f'Reserved GPU memory: {torch.cuda.memory_reserved(device) / (1024 ** 3):.2f} GB')

Using device: cuda
Allocated GPU memory: 0.00 GB
Reserved GPU memory: 0.00 GB


In [2]:
from data.cleaning import download_and_clean

artists, tracks = download_and_clean()

In [3]:
print(artists.head())

                         id  followers                      genres  \
45   0VLMVnVbJyJ4oyZs2L3Yl2       71.0          ['carnaval cadiz']   
46   0dt23bs4w8zx154C5xdVyl       63.0          ['carnaval cadiz']   
47   0pGhoB99qpEJEsBQxgaskQ       64.0          ['carnaval cadiz']   
48   3HDrX2OtSuXLW5dLR85uN3       53.0          ['carnaval cadiz']   
136  22mLrN5fkppmuUPsHx6i2G       59.0  ['classical harp', 'harp']   

                             name  popularity  
45   Las Viudas De Los Bisabuelos           6  
46              Los De Capuchinos           5  
47             Los “Pofesionales”           7  
48      Los Que No Paran De Rajar           6  
136                   Vera Dulova           3  


In [4]:
print(tracks.head())

                       id                                name  popularity  \
2  07A5yehtSnoedViJAZkNnc  Vivo para Quererte - Remasterizado           0   
3  08FmqUhxtyLTn6pAh6bk45       El Prisionero - Remasterizado           0   
4  08y9GfoqCWfOGsKdwojr5e                 Lady of the Evening           0   
5  0BRXJHRNGQ3W4v9frnSfhu                           Ave Maria           0   
7  0IA0Hju8CAgYfV1hwhidBH                             La Java           0   

   duration_ms  explicit              artists                  id_artists  \
2       181640         0  ['Ignacio Corsini']  ['5LiOoJbxVSAMkBS2fUm3X2']   
3       176907         0  ['Ignacio Corsini']  ['5LiOoJbxVSAMkBS2fUm3X2']   
4       163080         0      ['Dick Haymes']  ['3BiJGZsyX9sJchTqcSA7Su']   
5       178933         0      ['Dick Haymes']  ['3BiJGZsyX9sJchTqcSA7Su']   
7       161427         0      ['Mistinguett']  ['4AxgXfD7ISvJSTObqm4aIE']   

  release_date  danceability  energy  ...  loudness  mode  speechiness  \


In [5]:
from itertools import chain
all_genres = set(chain.from_iterable(tracks["genres"]))
len(all_genres)
# all_genres

4706

In [6]:
genre_to_index = {genre: idx for idx, genre in enumerate(sorted(all_genres))}
# genre_to_index

In [7]:
def multihot_vector(genres, index_dict):
    multihot = [0] * len(index_dict)
    for genre in genres:
        multihot[index_dict[genre]] = 1
    return multihot

In [8]:
tracks["multi_hot_genres"] = tracks["genres"].apply(
    lambda genres: multihot_vector(genres, genre_to_index)
)
tracks["multi_hot_genres"]

2         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                                ...                        
586667    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
586668    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
586669    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
586670    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
586671    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: multi_hot_genres, Length: 499064, dtype: object

In [9]:
from sklearn.model_selection import train_test_split

X = tracks[["name", "popularity", "duration_ms", "explicit", "release_date", "danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "time_signature"]]
y = tracks["multi_hot_genres"].tolist()

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=478)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=478)

In [10]:
print(f"Train set size: {len(X_train)}")
print(f"Dev set size: {len(X_dev)}")
print(f"Test set size: {len(X_test)}")

Train set size: 399251
Dev set size: 49906
Test set size: 49907


In [11]:
# Count label frequencies in the training set
import numpy as np

label_counts = np.sum(y_train, axis=0)

# Set a threshold for filtering labels
threshold =100
selected_labels = np.where(label_counts >= threshold)[0]

# Filter labels in train, dev, and test sets
def filter_labels(y, selected_labels):
    return np.array([[y_sample[i] for i in selected_labels] for y_sample in y])

y_train = filter_labels(y_train, selected_labels)
y_dev = filter_labels(y_dev, selected_labels)
y_test = filter_labels(y_test, selected_labels)

# Update genre_to_index mapping
filtered_genre_to_index = {genre: idx for idx, genre in enumerate(selected_labels)}
index_to_genre = {v: k for k, v in filtered_genre_to_index.items()}

# Step 5: Print results for verification
print(f"Original number of labels: {len(label_counts)}")
print(f"Filtered number of labels: {len(selected_labels)}")


Original number of labels: 4706
Filtered number of labels: 1529


In [12]:
# Remove rows with no positive labels
def remove_empty_labels(X, y):
    non_empty_indices = [i for i, labels in enumerate(y) if np.sum(labels) > 0]
    X_filtered = X.iloc[non_empty_indices].reset_index(drop=True)
    y_filtered = np.array([y[i] for i in non_empty_indices])
    return X_filtered, y_filtered

# Apply to train, dev, and test sets
X_train, y_train = remove_empty_labels(X_train, y_train)
X_dev, y_dev = remove_empty_labels(X_dev, y_dev)
X_test, y_test = remove_empty_labels(X_test, y_test)

# Verify
print(f"Train set size after filtering: {len(X_train)}")
print(f"Dev set size after filtering: {len(X_dev)}")
print(f"Test set size after filtering: {len(X_test)}")

Train set size after filtering: 386974
Dev set size after filtering: 48297
Test set size after filtering: 48376


In [13]:
from transformers import RobertaModel, RobertaTokenizer
import torch
import torch.nn as nn
from transformers import TrainingArguments, Trainer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, jaccard_score, hamming_loss
import numpy as np

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")




In [33]:
# Tokenization function
def tokenize_data(X):
    combined_text = X.apply(lambda row: " ".join(row.astype(str)), axis=1)
    
    # Tokenize the combined text
    return tokenizer(
        combined_text.tolist(),
        padding=True,
        truncation=True,
        max_length=16,
        return_tensors="pt"
    )

# Dataset preparation
class MultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        tokenized = tokenize_data(X)

        self.input_ids = tokenized["input_ids"]
        self.attention_mask = tokenized["attention_mask"]
        self.labels = torch.tensor(y)

        token_lengths = [len(ids) for ids in self.input_ids]
        print(f"Token length range: {min(token_lengths)} to {max(token_lengths)}")

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }

In [34]:
# Convert splits into datasets
train_dataset = MultiLabelDataset(X_train, y_train)
dev_dataset = MultiLabelDataset(X_dev, y_dev)
test_dataset = MultiLabelDataset(X_test, y_test)

Token length range: 16 to 16
Token length range: 16 to 16
Token length range: 16 to 16


In [35]:
# Model definition
class MultiLabelRoBERTa(nn.Module):
    def __init__(self, num_labels):
        super(MultiLabelRoBERTa, self).__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.classifier = nn.Sequential(
            nn.Linear(self.roberta.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        
        logits = self.classifier(outputs.pooler_output)
        
        loss = None
        if labels is not None:
            criterion = nn.BCEWithLogitsLoss()
            loss = criterion(logits, labels.float())
        
        return {"loss": loss, "logits": logits}

# Metrics computation
def compute_metrics(pred, threshold_range=(0.05, 0.55, 0.05), prob=True):
    logits, labels = pred
    output = logits
    if prob:
        output = torch.sigmoid(torch.tensor(logits))

    # Calculate logit statistics
    prob_min = output.min().item()
    prob_max = output.max().item()
    prob_mean = output.mean().item()

    def prob_threshold_pred(output, threshold):
        if isinstance(output, np.ndarray):
            return (output > threshold).astype(int)
        return (output > threshold).int()

    def calc_metrics(labels, predictions):
        subset_accuracy = accuracy_score(labels, predictions)
        precision = precision_score(labels, predictions, average="micro")
        recall = recall_score(labels, predictions, average="micro")
        f1 = f1_score(labels, predictions, average="micro")
        jaccard = jaccard_score(labels, predictions, average='samples')
        hamming = hamming_loss(labels, predictions)

        return subset_accuracy, precision, recall, f1, jaccard, hamming
        
    # Store results for thresholds
    threshold_results = {}
    for threshold in np.arange(*threshold_range):
        predictions = prob_threshold_pred(output, threshold)
        subset_accuracy, precision, recall, f1, jaccard, hamming = calc_metrics(labels, predictions)
        threshold_results[threshold] = {
            "accuracy": subset_accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "hamming": hamming,
            "jaccard": jaccard
        }

    # Find optimal threshold
    optimal_threshold = max(threshold_results, key=lambda t: threshold_results[t]["f1"])
    optimal_threshold_metrics = threshold_results[optimal_threshold]

    # Return detailed metrics
    return {
        "prob_min": prob_min,
        "prob_max": prob_max,
        "prob_mean": prob_mean,
        "optimal_threshold": optimal_threshold,
        "optimal_threshold_accuracy": optimal_threshold_metrics["accuracy"],
        "optimal_threshold_precision": optimal_threshold_metrics["precision"],
        "optimal_threshold_recall": optimal_threshold_metrics["recall"],
        "optimal_threshold_f1": optimal_threshold_metrics["f1"],
        "optimal_threshold_hamming": optimal_threshold_metrics["hamming"],
        "optimal_threshold_jaccard": optimal_threshold_metrics["jaccard"],
    }


In [36]:
# Initialize the model
def initialize_model():
    num_labels = len(y_train[0])
    model = MultiLabelRoBERTa(num_labels=num_labels)
    model.roberta.gradient_checkpointing_enable()
    return model
model = initialize_model()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# # Training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     eval_strategy="steps",
#     eval_steps=2000,
#     save_strategy="steps",
#     save_steps=2000,
#     learning_rate=1e-5,
#     per_device_train_batch_size=16,
#     # gradient_accumulation_steps=2,
#     per_device_eval_batch_size=16,
#     num_train_epochs=10,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     fp16=True
# )


In [38]:
# # Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=dev_dataset,
#     processing_class=tokenizer,
#     compute_metrics=compute_metrics
# )

In [39]:
# # Train the model
# trainer.train()

In [40]:
# Evaluate on the test set
# trainer.evaluate(test_dataset)

In [41]:
# def compute_metrics(pred, top_k_range=(1, 10), threshold_range=(0.05, 0.5, 0.05)):
#     logits, labels = pred

#     # Calculate logit statistics
#     logit_min = logits.min()
#     logit_max = logits.max()
#     logit_mean = logits.mean()

#     def top_k_pred(logits, k):
#         top_k_indices = np.argsort(-logits, axis=1)[:, :k]
#         predictions = np.zeros_like(logits, dtype=int)
#         for i, indices in enumerate(top_k_indices):
#             predictions[i, indices] = 1
#         return predictions

#     def logit_threshold_pred(logits, threshold):
#         return (logits > threshold).astype(int)

#     def calc_metrics(labels, predictions):
#         subset_accuracy = accuracy_score(labels, predictions)
#         precision = precision_score(labels, predictions, average="micro")
#         recall = recall_score(labels, predictions, average="micro")
#         f1 = f1_score(labels, predictions, average="micro")
#         return subset_accuracy, precision, recall, f1

#     # Store results for top-k
#     top_k_results = {}
#     for k in range(*top_k_range):
#         predictions = top_k_pred(logits, k)
#         subset_accuracy, precision, recall, f1 = calc_metrics(labels, predictions)
#         top_k_results[k] = {
#             "accuracy": subset_accuracy,
#             "precision": precision,
#             "recall": recall,
#             "f1": f1
#         }

#     # Find optimal k
#     optimal_k = max(top_k_results, key=lambda k: top_k_results[k]["f1"])
#     optimal_k_metrics = top_k_results[optimal_k]

#     # Store results for thresholds
#     threshold_results = {}
#     for threshold in np.arange(*threshold_range):
#         predictions = logit_threshold_pred(logits, threshold)
#         subset_accuracy, precision, recall, f1 = calc_metrics(labels, predictions)
#         threshold_results[threshold] = {
#             "accuracy": subset_accuracy,
#             "precision": precision,
#             "recall": recall,
#             "f1": f1
#         }

#     # Find optimal threshold
#     optimal_threshold = max(threshold_results, key=lambda t: threshold_results[t]["f1"])
#     optimal_threshold_metrics = threshold_results[optimal_threshold]

#     # Return detailed metrics
#     return {
#         "logit_min": logit_min,
#         "logit_max": logit_max,
#         "logit_mean": logit_mean,
#         "optimal_k": optimal_k,
#         "optimal_k_accuracy": optimal_k_metrics["accuracy"],
#         "optimal_k_precision": optimal_k_metrics["precision"],
#         "optimal_k_recall": optimal_k_metrics["recall"],
#         "optimal_k_f1": optimal_k_metrics["f1"],
#         "optimal_threshold": optimal_threshold,
#         "optimal_threshold_accuracy": optimal_threshold_metrics["accuracy"],
#         "optimal_threshold_precision": optimal_threshold_metrics["precision"],
#         "optimal_threshold_recall": optimal_threshold_metrics["recall"],
#         "optimal_threshold_f1": optimal_threshold_metrics["f1"]
#     }



# # Initialize the model
# num_labels = len(y_train[0])
# model = MultiLabelRoBERTa(num_labels=num_labels)
# model.roberta.gradient_checkpointing_enable()
# # Training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     eval_strategy="steps",
#     eval_steps=4000,
#     save_strategy="steps",
#     save_steps=4000,
#     learning_rate=1e-5,
#     per_device_train_batch_size=32,
#     # gradient_accumulation_steps=2,
#     per_device_eval_batch_size=16,
#     num_train_epochs=20,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     fp16=True
# )
# # Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=dev_dataset,
#     processing_class=tokenizer,
#     compute_metrics=compute_metrics
# )

In [42]:
# Train the model
# trainer.train()

In [43]:
# Evaluate on the test set
# trainer.evaluate(test_dataset)

In [45]:
# # Random chance baseline
# label_frequencies = y_train.mean(axis=0)
# random_predictions = np.random.rand(*y_dev.shape) < label_frequencies
# random_baseline_metrics = compute_metrics((random_predictions, y_dev), prob=False)
# print("Random Baseline Metrics:", random_baseline_metrics)

In [46]:
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.feature_extraction.text import CountVectorizer

# # Naive Bayes baseline
# # Flatten multi-label data for Naive Bayes (treat each label as independent)
# X_train_NB = X_train
# X_dev_NB = X_dev

# vectorizer = CountVectorizer()
# X_train_vectorized = vectorizer.fit_transform(X_train_NB)
# X_dev_vectorized = vectorizer.transform(X_dev_NB)

# # Train separate Naive Bayes for each label
# naive_bayes_predictions = []
# for i in range(y_train.shape[1]):
#     model = MultinomialNB()
#     model.fit(X_dev_vectorized, y_train[:, i])
#     predictions = model.predict(X_dev_vectorized)
#     naive_bayes_predictions.append(predictions)

# naive_bayes_predictions = np.array(naive_bayes_predictions).T  # Convert to (samples x labels)
# naive_bayes_baseline_metrics = compute_metrics((naive_bayes_predictions, y_test))
# print("Naive Bayes Baseline Metrics:", naive_bayes_baseline_metrics)

In [None]:
from sklearn.model_selection import ParameterSampler
import numpy as np
import torch

# Define hyperparameter search space
param_grid = {
    "learning_rate": [1e-5],
    "num_train_epochs": [5, 10],
    "per_device_train_batch_size": [32, 64, 128],
    "weight_decay": [0.01]
}

# Generate random samples of hyperparameters
n_trials = 5
param_samples = list(ParameterSampler(param_grid, n_iter=n_trials, random_state=478))

best_score = -np.inf
best_params = None
best_model = None

# Iterate over sampled hyperparameters
for i, params in enumerate(param_samples):
    print(f"Trial {i+1}/{n_trials} with parameters: {params}")

    model = initialize_model()

    # Update TrainingArguments dynamically
    training_args = TrainingArguments(
        output_dir=f"./results/trial_{i}",
        eval_strategy="steps",
        eval_steps=6000,
        save_strategy="steps",
        save_steps=6000,
        learning_rate=params["learning_rate"],
        per_device_train_batch_size=params["per_device_train_batch_size"],
        per_device_eval_batch_size=16, 
        num_train_epochs=params["num_train_epochs"],
        weight_decay=params["weight_decay"],
        fp16=True  # Mixed precision for speed
    )

    # Define the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Evaluate on the validation set
    metrics = trainer.evaluate(dev_dataset)
    f1 = metrics["eval_optimal_threshold_f1"]

    print(f"Trial {i+1} F1 Score: {f1}")

    # Keep track of the best model
    if f1 > best_score:
        best_score = f1
        best_params = params
        best_model = trainer
        trainer.save_model("./models")

# Output the best results
print(f"Best F1 Score: {best_score}")
print(f"Best Parameters: {best_params}")

# Save the best model
best_model.save_model("./best-model")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 1/5 with parameters: {'weight_decay': 0.01, 'per_device_train_batch_size': 64, 'num_train_epochs': 5, 'learning_rate': 1e-05}


Step,Training Loss,Validation Loss,Prob Min,Prob Max,Prob Mean,Optimal Threshold,Optimal Threshold Accuracy,Optimal Threshold Precision,Optimal Threshold Recall,Optimal Threshold F1,Optimal Threshold Hamming,Optimal Threshold Jaccard
6000,0.0185,0.01494,2e-06,0.209955,0.001209,0.05,0.000973,0.247967,0.076125,0.116488,0.002863,0.038178
12000,0.0142,0.01248,0.0,0.816113,0.001485,0.05,0.023811,0.234469,0.265232,0.248904,0.003969,0.131342
18000,0.0128,0.011364,0.0,0.927757,0.001695,0.1,0.0276,0.312335,0.260852,0.284281,0.003256,0.151378
24000,0.0121,0.010884,0.0,0.955817,0.001748,0.1,0.034992,0.320682,0.297114,0.308449,0.003303,0.176465
30000,0.0119,0.010721,0.0,0.96051,0.001789,0.1,0.036089,0.315552,0.310824,0.31317,0.00338,0.182378


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Trial 1 F1 Score: 0.3131558776007632
Trial 2/5 with parameters: {'weight_decay': 0.01, 'per_device_train_batch_size': 64, 'num_train_epochs': 10, 'learning_rate': 1e-05}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Prob Min,Prob Max,Prob Mean,Optimal Threshold,Optimal Threshold Accuracy,Optimal Threshold Precision,Optimal Threshold Recall,Optimal Threshold F1,Optimal Threshold Hamming,Optimal Threshold Jaccard
6000,0.0183,0.0148,3e-06,0.241311,0.001188,0.05,0.000828,0.272867,0.08895,0.134164,0.002846,0.044313
12000,0.0136,0.01196,0.0,0.876529,0.001488,0.05,0.01321,0.235754,0.316341,0.270166,0.004237,0.150695
18000,0.012,0.010708,0.0,0.948537,0.001707,0.1,0.028946,0.318001,0.309579,0.313734,0.003358,0.18047
24000,0.0111,0.010059,0.0,0.971019,0.001817,0.1,0.038843,0.315822,0.3593,0.336161,0.003518,0.206108
30000,0.0106,0.009663,0.0,0.980655,0.001863,0.15,0.049092,0.392637,0.320104,0.35268,0.002913,0.213795
36000,0.0102,0.009389,0.0,0.987277,0.001943,0.15,0.052198,0.390196,0.345175,0.366307,0.002961,0.227992
42000,0.01,0.009219,0.0,0.989511,0.001973,0.15,0.052778,0.385091,0.364156,0.374331,0.003018,0.236703
48000,0.0098,0.009102,0.0,0.991748,0.001966,0.15,0.05286,0.389809,0.372229,0.380816,0.003001,0.240585
54000,0.0097,0.009027,0.0,0.992823,0.002007,0.15,0.052985,0.386704,0.381831,0.384252,0.003034,0.24538
60000,0.0096,0.009003,0.0,0.993307,0.001985,0.15,0.053502,0.390462,0.382443,0.386411,0.003011,0.246404


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Trial 2 F1 Score: 0.3865365111561866
Trial 3/5 with parameters: {'weight_decay': 0.01, 'per_device_train_batch_size': 128, 'num_train_epochs': 10, 'learning_rate': 1e-05}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Prob Min,Prob Max,Prob Mean,Optimal Threshold,Optimal Threshold Accuracy,Optimal Threshold Precision,Optimal Threshold Recall,Optimal Threshold F1,Optimal Threshold Hamming,Optimal Threshold Jaccard
6000,0.0178,0.014531,2e-06,0.272315,0.001176,0.05,0.000766,0.275984,0.109137,0.156419,0.002918,0.055438
12000,0.0133,0.011669,0.0,0.877161,0.001597,0.05,0.02174,0.234866,0.349485,0.280934,0.004435,0.165884


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.model_selection import ParameterSampler
import numpy as np
import torch

# Define hyperparameter search space
param_grid = {
    "learning_rate": [1e-5],
    "num_train_epochs": [5, 10],
    "per_device_train_batch_size": [32, 128],
    "weight_decay": [0.01]
}

# Generate random samples of hyperparameters
n_trials = 3
param_samples = list(ParameterSampler(param_grid, n_iter=n_trials, random_state=478))

best_score = -np.inf
best_params = None
best_model = None

# Iterate over sampled hyperparameters
for i, params in enumerate(param_samples):
    print(f"Trial {i+1}/{n_trials} with parameters: {params}")

    model = initialize_model()

    # Update TrainingArguments dynamically
    training_args = TrainingArguments(
        output_dir=f"./results/trial_{i}",
        eval_strategy="steps",
        eval_steps=6000,
        save_strategy="steps",
        save_steps=6000,
        learning_rate=params["learning_rate"],
        per_device_train_batch_size=params["per_device_train_batch_size"],
        per_device_eval_batch_size=16, 
        num_train_epochs=params["num_train_epochs"],
        weight_decay=params["weight_decay"],
        fp16=True  # Mixed precision for speed
    )

    # Define the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Evaluate on the validation set
    metrics = trainer.evaluate(dev_dataset)
    f1 = metrics["eval_optimal_threshold_f1"]

    print(f"Trial {i+1} F1 Score: {f1}")
    print(metrics)

    # Keep track of the best model
    if f1 > best_score:
        best_score = f1
        best_params = params
        best_model = trainer
        trainer.save_model("./models")

# Output the best results
print(f"Best F1 Score: {best_score}")
print(f"Best Parameters: {best_params}")

# Save the best model
best_model.save_model("./best-model")


In [None]:
best_model.evaluate(test_dataset)