In [None]:
# Step 0: Attach Google Drive to the Colab filesystem
from google.colab import drive

# Mount point under which the Drive contents become visible
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
# Step 1: Install dependencies and import libraries
!pip install -q transformers
!pip install -q torchinfo
!pip install -q datasets
!pip install -q evaluate
!pip install -q optuna
!pip install -q wandb

import wandb
# Login to wandb. Replace "your_api_key_here" with your actual WANDB API key.
# Alternatively, you can set the WANDB_API_KEY environment variable.
wandb.login(key="your_api_key_here")

from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import optuna
import os


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpeng_zhao[0m ([33mpeng_zhao-university-of-california-berkeley[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Step 2: Load and preprocess the dataset
# Fetch the "raw_review_CDs_and_Vinyl" configuration of the Amazon Reviews 2023 dataset.
# NOTE(review): trust_remote_code=True allows the dataset's own loading script to run.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_CDs_and_Vinyl",
    trust_remote_code=True,
)

# The dataset ships a single split ("full"): shuffle it, then keep 100% of the rows
shuffled_dataset = dataset["full"].shuffle(seed=42)
subset_size = int(1 * len(shuffled_dataset))
subset_dataset = shuffled_dataset.select(range(subset_size))


def _is_not_three_star(x):
    """Keep every review whose rating is different from 3."""
    return x["rating"] != 3


# Drop the samples whose rating equals 3
subset_dataset = subset_dataset.filter(_is_not_three_star)

# Load the pretrained roberta-base tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Batch-level preprocessing: tokenization plus binary label creation
def tokenize_and_format(examples):
    """Tokenize a batch of reviews and attach binary labels.

    Args:
        examples: a batch mapping that provides a "text" field and a
            "rating" field.

    Returns:
        The tokenizer output for the "text" field (truncated and padded to
        256 tokens) with an added "labels" list: 1 when rating > 3,
        0 otherwise.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    # Ratings above 3 are positive (1); everything else in the batch is negative (0)
    labels = []
    for rating in examples["rating"]:
        labels.append(1 if rating > 3 else 0)
    encoded["labels"] = labels

    return encoded

# Where the tokenized dataset checkpoint lives on Google Drive
TOKENIZED_CHECKPOINT_DIR = "/content/drive/MyDrive/FP/Checkpoints/tokenized_CDs_checkpoint"

# Run tokenize_and_format over the filtered dataset, one batch at a time
tokenized_dataset = subset_dataset.map(function=tokenize_and_format, batched=True)

# Write a checkpoint of the tokenized dataset so a new runtime can reload it later
tokenized_dataset.save_to_disk(TOKENIZED_CHECKPOINT_DIR)

# 80/20 train/test split with a fixed seed, served as PyTorch tensors
tokenized_datasets = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
tokenized_datasets.set_format(type="torch")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Amazon-Reviews-2023.py:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

CDs_and_Vinyl.jsonl:   0%|          | 0.00/3.29G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/4827273 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/4543582 [00:00<?, ? examples/s]

Saving the dataset (0/17 shards):   0%|          | 0/4543582 [00:00<?, ? examples/s]

In [None]:
# Step 2': Resume point for a new Colab runtime: reload the tokenized dataset from disk
from datasets import load_from_disk

# Location of the tokenized dataset checkpoint on Google Drive
TOKENIZED_CHECKPOINT_DIR = "/content/drive/MyDrive/FP/Checkpoints/tokenized_CDs_checkpoint"
tokenized_dataset = load_from_disk(TOKENIZED_CHECKPOINT_DIR)

# 80/20 train/test split of the reloaded dataset, using a fixed seed
tokenized_datasets = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Serve the examples as PyTorch tensors
tokenized_datasets.set_format(type="torch")

Loading dataset from disk:   0%|          | 0/17 [00:00<?, ?it/s]

In [None]:
# Step 3A: Hyperparameter search and Trainer setup

# Model factory handed to Trainer via model_init
def model_init():
    """Load roberta-base as a sequence classifier with two output labels."""
    model = RobertaForSequenceClassification.from_pretrained(
        "roberta-base",
        num_labels=2,
    )
    return model

# Define the compute_metrics function for evaluation
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        eval_pred: a pair that unpacks into (logits, labels).

    Returns:
        A dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 keeps the score at 0 for a class that is never predicted
    # (the same value the default produces) without emitting the
    # undefined-metric warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Optuna search space used by the hyperparameter search
def hp_space(trial):
    """Sample one hyperparameter configuration from the given Optuna trial.

    Args:
        trial: object exposing suggest_float and suggest_categorical.

    Returns:
        A dict with "learning_rate" (sampled on a log scale between 1e-6 and
        1e-4) and "per_device_train_batch_size" (either 8 or 16).
    """
    # Sampling order is kept: learning rate first, batch size second
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16])

    sampled = {}
    sampled["learning_rate"] = learning_rate
    sampled["per_device_train_batch_size"] = batch_size
    return sampled

# Baseline TrainingArguments for the hyperparameter search.
# report_to / run_name route the logs to Weights & Biases.
# NOTE(review): recent transformers releases name the first strategy argument
# `eval_strategy`; confirm against the installed version.
_search_arg_values = dict(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Starting values only: hp_space tunes learning_rate and per_device_train_batch_size
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=True,
    dataloader_num_workers=8,
    warmup_steps=500,
    # Weights & Biases logging and the experiment run name shown there
    report_to=["wandb"],
    run_name="roberta_amazon_reviews",
    logging_steps=50,
    logging_first_step=True,
)
training_args = TrainingArguments(**_search_arg_values)

# Sizes of the full train and test splits
total_train = len(tokenized_datasets["train"])
total_eval = len(tokenized_datasets["test"])

print("Total training samples:", total_train)
print("Total evaluation samples:", total_eval)

# Work on 0.2% of each split: shuffle with a fixed seed, then take the leading rows
_n_train_rows = int(0.002 * total_train)
_shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
train_subset = _shuffled_train.select(range(_n_train_rows))

_n_eval_rows = int(0.002 * total_eval)
_shuffled_eval = tokenized_datasets["test"].shuffle(seed=42)
eval_subset = _shuffled_eval.select(range(_n_eval_rows))

# Trainer used for the hyperparameter search (runs on the data subsets for demonstration).
# Early stopping is configured with a patience of 2.
search_early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    args=training_args,
    model_init=model_init,  # a factory rather than a model instance
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    compute_metrics=compute_metrics,
    callbacks=[search_early_stopping],
)

# Run hyperparameter search using the Optuna backend (n_trials=10 for demonstration).
#
# compute_objective: without it the Trainer's default objective is used, which
# for these metrics works out to accuracy + f1 + precision + recall summed
# together. Weighted recall equals accuracy, so that sum counts it twice.
# Optimising the weighted F1 alone gives the search one well-defined target.
#
# sampler: extra keyword arguments are forwarded to the Optuna study; seeding
# the TPE sampler makes the sequence of suggested configurations repeatable.
best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    direction="maximize",
    backend="optuna",
    n_trials=10,
    sampler=optuna.samplers.TPESampler(seed=42),
)




Total training samples: 3634865
Total evaluation samples: 908717


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-03-30 07:08:51,493] A new study created in memory with name: no-name-1b7bb9cb-4300-40dc-8243-2062d6c95356
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
train/epoch,▁█
train/global_step,▁█
train/grad_norm,█▁
train/learning_rate,▁█
train/loss,█▁

0,1
train/epoch,0.02201
train/global_step,50.0
train/grad_norm,3.48189
train/learning_rate,1e-05
train/loss,0.666


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1602,0.09955,0.963676,0.958927,0.961696,0.963676
2,0.126,0.218929,0.940561,0.947857,0.963635,0.940561
3,0.0709,0.110802,0.972482,0.970494,0.971099,0.972482


[I 2025-03-30 07:14:14,604] Trial 0 finished with value: 3.886556657937833 and parameters: {'learning_rate': 4.6905942087626794e-05, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 3.886556657937833.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▆▁█
eval/f1,▄▁█
eval/loss,▁█▂
eval/precision,▁▂█
eval/recall,▆▁█
eval/runtime,█▂▁
eval/samples_per_second,▁▇█
eval/steps_per_second,▁▇█
train/epoch,▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇███
train/global_step,▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇███

0,1
eval/accuracy,0.97248
eval/f1,0.97049
eval/loss,0.1108
eval/precision,0.9711
eval/recall,0.97248
eval/runtime,6.7625
eval/samples_per_second,268.688
eval/steps_per_second,16.858
total_flos,2868831392117760.0
train/epoch,3.0


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1516,0.108042,0.971932,0.971767,0.971619,0.971932
2,0.1359,0.195099,0.929554,0.939328,0.959942,0.929554
3,0.0855,0.097604,0.970831,0.971214,0.971689,0.970831
4,0.0304,0.114021,0.976335,0.976003,0.975777,0.976335


[I 2025-03-30 07:23:05,981] Trial 1 finished with value: 3.9044501839442085 and parameters: {'learning_rate': 5.843903099703166e-05, 'per_device_train_batch_size': 16}. Best is trial 1 with value: 3.9044501839442085.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▇▁▇██
eval/f1,▇▁▇██
eval/loss,▂█▁▃▂
eval/precision,▆▁▆▇█
eval/recall,▇▁▇██
eval/runtime,▅▂█▁▇
eval/samples_per_second,▄▇▁█▂
eval/steps_per_second,▄▇▁█▂
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇███

0,1
eval/accuracy,0.97633
eval/f1,0.976
eval/loss,0.11402
eval/precision,0.97578
eval/recall,0.97633
eval/runtime,6.7576
eval/samples_per_second,268.881
eval/steps_per_second,16.87
total_flos,4763888768348160.0
train/epoch,4.98022


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2682,0.143272,0.949917,0.937744,0.946358,0.949917
2,0.2842,0.252111,0.933957,0.902063,0.872276,0.933957
3,0.2409,0.243289,0.933957,0.902063,0.872276,0.933957


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-03-30 07:29:38,895] Trial 2 finished with value: 3.6422532187216534 and parameters: {'learning_rate': 9.225645728791086e-05, 'per_device_train_batch_size': 8}. Best is trial 1 with value: 3.9044501839442085.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,█▁▁
eval/f1,█▁▁
eval/loss,▁█▇
eval/precision,█▁▁
eval/recall,█▁▁
eval/runtime,▁█▆
eval/samples_per_second,█▁▃
eval/steps_per_second,█▁▃
train/epoch,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████

0,1
eval/accuracy,0.93396
eval/f1,0.90206
eval/loss,0.24329
eval/precision,0.87228
eval/recall,0.93396
eval/runtime,6.8505
eval/samples_per_second,265.235
eval/steps_per_second,16.641
total_flos,2868831392117760.0
train/epoch,3.0


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2004,0.114825,0.964227,0.962789,0.962101,0.964227
2,0.1686,0.1354,0.959824,0.956154,0.956011,0.959824
3,0.1069,0.13259,0.966979,0.966448,0.966044,0.966979


[I 2025-03-30 07:36:15,366] Trial 3 finished with value: 3.866449340623687 and parameters: {'learning_rate': 5.546426700542595e-05, 'per_device_train_batch_size': 8}. Best is trial 1 with value: 3.9044501839442085.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▅▁█
eval/f1,▆▁█
eval/loss,▁█▇
eval/precision,▅▁█
eval/recall,▅▁█
eval/runtime,▁█▄
eval/samples_per_second,█▁▄
eval/steps_per_second,█▁▄
train/epoch,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████

0,1
eval/accuracy,0.96698
eval/f1,0.96645
eval/loss,0.13259
eval/precision,0.96604
eval/recall,0.96698
eval/runtime,6.7338
eval/samples_per_second,269.831
eval/steps_per_second,16.929
total_flos,2868831392117760.0
train/epoch,3.0


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5834,0.241692,0.933957,0.902063,0.872276,0.933957
2,0.1309,0.098991,0.965327,0.965656,0.966029,0.965327
3,0.0998,0.092007,0.971932,0.970803,0.970489,0.971932
4,0.0696,0.097172,0.975234,0.974887,0.974645,0.975234


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-03-30 07:45:34,759] Trial 4 finished with value: 3.9000000883901538 and parameters: {'learning_rate': 2.5006491203785904e-06, 'per_device_train_batch_size': 16}. Best is trial 1 with value: 3.9044501839442085.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▆▇██
eval/f1,▁▇███
eval/loss,█▁▁▁▁
eval/precision,▁▇███
eval/recall,▁▆▇██
eval/runtime,▂▁█▄▂
eval/samples_per_second,▇█▁▅▇
eval/steps_per_second,▇█▁▅▇
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇███

0,1
eval/accuracy,0.97523
eval/f1,0.97489
eval/loss,0.09717
eval/precision,0.97464
eval/recall,0.97523
eval/runtime,6.6292
eval/samples_per_second,274.089
eval/steps_per_second,17.197
total_flos,4763888768348160.0
train/epoch,4.98022


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1883,0.1456,0.933957,0.902063,0.872276,0.933957


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-03-30 07:47:37,790] Trial 5 pruned. 
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▃▃▄▅▆▆▇██
train/global_step,▁▂▃▃▄▅▆▆▇██

0,1
eval/accuracy,0.93396
eval/f1,0.90206
eval/loss,0.1456
eval/precision,0.87228
eval/recall,0.93396
eval/runtime,6.8005
eval/samples_per_second,267.186
eval/steps_per_second,16.763
train/epoch,1.0
train/global_step,455.0


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1571,0.103852,0.957072,0.958871,0.961537,0.957072


[I 2025-03-30 07:49:15,778] Trial 6 pruned. 
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇█
train/global_step,▁▃▄▆▇█

0,1
eval/accuracy,0.95707
eval/f1,0.95887
eval/loss,0.10385
eval/precision,0.96154
eval/recall,0.95707
eval/runtime,6.9225
eval/samples_per_second,262.479
eval/steps_per_second,16.468
train/epoch,1.0
train/global_step,228.0


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1513,0.132784,0.959824,0.952118,0.959861,0.959824
2,0.131,0.13225,0.958723,0.962025,0.968859,0.958723
3,0.0728,0.099057,0.977435,0.978049,0.979084,0.977435
4,0.0158,0.111506,0.981288,0.981064,0.980921,0.981288


[I 2025-03-30 08:00:27,425] Trial 7 finished with value: 3.9245612131789196 and parameters: {'learning_rate': 1.8239686998854285e-05, 'per_device_train_batch_size': 8}. Best is trial 7 with value: 3.9245612131789196.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▁▇██
eval/f1,▁▃▇██
eval/loss,██▁▂▄
eval/precision,▁▄▇██
eval/recall,▁▁▇██
eval/runtime,▁▂▁▂█
eval/samples_per_second,█▇█▇▁
eval/steps_per_second,█▇█▇▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.98129
eval/f1,0.98106
eval/loss,0.11151
eval/precision,0.98092
eval/recall,0.98129
eval/runtime,7.0874
eval/samples_per_second,256.372
eval/steps_per_second,16.085
total_flos,4886630075673600.0
train/epoch,4.9901


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1547,0.092442,0.965327,0.963438,0.962927,0.965327
2,0.1255,0.19085,0.945515,0.951499,0.964157,0.945515
3,0.0666,0.128071,0.975234,0.974784,0.974509,0.975234


[I 2025-03-30 08:06:59,652] Trial 8 finished with value: 3.899761068758609 and parameters: {'learning_rate': 2.473644195936579e-05, 'per_device_train_batch_size': 8}. Best is trial 7 with value: 3.9245612131789196.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▆▁█
eval/f1,▅▁█
eval/loss,▁█▄
eval/precision,▁▂█
eval/recall,▆▁█
eval/runtime,▂█▁
eval/samples_per_second,▇▁█
eval/steps_per_second,▇▁█
train/epoch,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████

0,1
eval/accuracy,0.97523
eval/f1,0.97478
eval/loss,0.12807
eval/precision,0.97451
eval/recall,0.97523
eval/runtime,6.58
eval/samples_per_second,276.139
eval/steps_per_second,17.325
total_flos,2868831392117760.0
train/epoch,3.0


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1567,0.079442,0.970281,0.970725,0.971287,0.970281
2,0.1095,0.111875,0.944964,0.950718,0.962372,0.944964
3,0.0732,0.072269,0.980187,0.979526,0.979474,0.980187
4,0.0311,0.111284,0.980187,0.979526,0.979474,0.980187


[I 2025-03-30 08:15:59,365] Trial 9 finished with value: 3.919374315056191 and parameters: {'learning_rate': 4.338820105446594e-05, 'per_device_train_batch_size': 16}. Best is trial 7 with value: 3.9245612131789196.


In [None]:
# 3B. Print the best hyperparameters found by hyperparameter search
print("Best run hyperparameters:", best_run.hyperparameters)

# Build the TrainingArguments for the final run from the best hyperparameters.
# gradient_accumulation_steps, fp16 and warmup_steps are taken from the search
# configuration (training_args): the learning rate was tuned with those
# settings in place, so dropping them here would apply it at a different
# effective batch size and warmup schedule than the one it was selected under.
# NOTE(review): num_train_epochs stays at 3 here while the search used 5;
# confirm that the shorter schedule is intentional.
updated_training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=best_run.hyperparameters["learning_rate"],
    per_device_train_batch_size=best_run.hyperparameters["per_device_train_batch_size"],
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=training_args.gradient_accumulation_steps,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=training_args.fp16,
    warmup_steps=training_args.warmup_steps,
    report_to=["wandb"],
    run_name="roberta_amazon_reviews",
    logging_steps=50
)

# Sizes of the full train and test splits
total_train = len(tokenized_datasets["train"])
total_eval = len(tokenized_datasets["test"])

# Final training also runs on 0.2% of each split: shuffle with a fixed seed,
# then take the leading rows
_n_final_train_rows = int(0.002 * total_train)
_final_train_shuffled = tokenized_datasets["train"].shuffle(seed=42)
train_subset_final = _final_train_shuffled.select(range(_n_final_train_rows))

_n_final_eval_rows = int(0.002 * total_eval)
_final_eval_shuffled = tokenized_datasets["test"].shuffle(seed=42)
eval_subset_final = _final_eval_shuffled.select(range(_n_final_eval_rows))

# Trainer for the final run, driven by updated_training_args.
# Early stopping is configured with a patience of 2.
final_early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

final_trainer = Trainer(
    args=updated_training_args,
    model_init=model_init,  # a factory rather than a model instance
    train_dataset=train_subset_final,
    eval_dataset=eval_subset_final,
    compute_metrics=compute_metrics,
    callbacks=[final_early_stopping],
)



Best run hyperparameters: {'learning_rate': 1.8239686998854285e-05, 'per_device_train_batch_size': 8}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 4: Start final training with the updated Trainer using the best hyperparameters.
# The value returned by train() is kept in train_result.
train_result = final_trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1752,0.102172,0.972482,0.971312,0.971055,0.972482
2,0.0743,0.131787,0.971381,0.972111,0.973192,0.971381
3,0.0807,0.135196,0.976885,0.976217,0.975993,0.976885


In [None]:
# Write the trained model to Google Drive once training has finished
FINAL_MODEL_DIR = "/content/drive/MyDrive/FP/Checkpoints/final_checkpoint_CDs"
final_trainer.save_model(FINAL_MODEL_DIR)