In [None]:
# Step 0: Mount Google Drive (checkpoints and results are read from / written to it)
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Step 1: Install dependencies and import libraries
!pip install -q transformers
!pip install -q torchinfo
!pip install -q datasets
!pip install -q evaluate
!pip install -q optuna
!pip install -q wandb

import wandb

# Authenticate with Weights & Biases without embedding a secret in the notebook.
# With no explicit key, wandb.login() picks up credentials from the WANDB_API_KEY
# environment variable or an existing netrc entry, and otherwise prompts for the
# key interactively, so the key is never stored alongside the code.
wandb.login()

from transformers import (RobertaTokenizer, RobertaForSequenceClassification, Trainer,
                          TrainingArguments, EarlyStoppingCallback)
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Step 2: Load and preprocess the new dataset (Movies_and_TV)
# Pull the "raw_review_Movies_and_TV" configuration of the Amazon Reviews 2023 dataset.
# NOTE(review): trust_remote_code=True allows the dataset repository's own loading
# script to run locally; keep it only while that repository is trusted.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Movies_and_TV",
    trust_remote_code=True,
)

# Fraction of the shuffled "full" split that is kept (0.2 -> 20%; 1.0 keeps everything).
SUBSET_FRACTION = 0.2

# Shuffle with a fixed seed, then keep the leading SUBSET_FRACTION of the rows
shuffled_dataset = dataset["full"].shuffle(seed=42)
subset_size = int(SUBSET_FRACTION * len(shuffled_dataset))
subset_dataset = shuffled_dataset.select(range(subset_size))


def _is_not_neutral(example):
    """Return True unless the review's rating is exactly 3."""
    return example["rating"] != 3


# Drop the 3-star reviews so only ratings above or below 3 remain
subset_dataset = subset_dataset.filter(_is_not_neutral)

# Tokenizer matching the roberta-base vocabulary
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Batched preprocessing function used with Dataset.map
def tokenize_and_format(examples):
    """Tokenize a batch of reviews and attach binary sentiment labels.

    Args:
        examples: A batch with a "text" entry (review texts) and a "rating"
            entry (one rating per review).

    Returns:
        The tokenizer output for the texts, truncated and padded to exactly
        256 tokens, with an added "labels" list: 1 when the rating is above 3,
        otherwise 0.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    # Positive (1) for ratings above 3; everything else is labelled negative (0)
    encoded["labels"] = [int(rating > 3) for rating in examples["rating"]]
    return encoded

# Tokenize and label the filtered subset in batches
tokenized_dataset = subset_dataset.map(tokenize_and_format, batched=True)

# Persist the tokenized dataset on Drive so a new runtime can reload it
# instead of repeating the download and tokenization.
TOKENIZED_CHECKPOINT_DIR = "/content/drive/MyDrive/FP/Checkpoints/tokenized_movies_checkpoint"
tokenized_dataset.save_to_disk(TOKENIZED_CHECKPOINT_DIR)

# 80/20 train/test split with a fixed seed, exposed as PyTorch tensors
tokenized_datasets = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
tokenized_datasets.set_format("torch")


README.md:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Amazon-Reviews-2023.py:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Movies_and_TV.jsonl:   0%|          | 0.00/8.39G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/3465662 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/3214260 [00:00<?, ? examples/s]

Saving the dataset (0/11 shards):   0%|          | 0/3214260 [00:00<?, ? examples/s]

In [None]:
# Step 2': Resume point for a new Colab runtime — reload the tokenized dataset
# saved on Drive instead of re-running Step 2. Step 1 still has to be run first,
# because the cells below rely on its imports.
from datasets import load_from_disk

TOKENIZED_CHECKPOINT_DIR = "/content/drive/MyDrive/FP/Checkpoints/tokenized_movies_checkpoint"
tokenized_dataset = load_from_disk(TOKENIZED_CHECKPOINT_DIR)

# Same 80/20 train/test split (seed 42) as in Step 2, exposed as PyTorch tensors
tokenized_datasets = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
tokenized_datasets.set_format("torch")

In [None]:
# Step 3: Load the previously fine-tuned model checkpoint
# Start from the weights produced by the earlier raw_review_CDs_and_Vinyl fine-tuning
# run, configured for two output classes.
CDS_CHECKPOINT_DIR = "/content/drive/MyDrive/FP/Checkpoints/final_checkpoint_CDs"
model = RobertaForSequenceClassification.from_pretrained(CDS_CHECKPOINT_DIR, num_labels=2)

# Metric callback passed to the Trainer for each evaluation pass
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            sample is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
        Precision, recall and F1 are support-weighted averages over the
        classes; with that weighting "recall" is always equal to "accuracy".
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        labels, predicted_classes, average='weighted'
    )
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1": weighted_f1,
        "precision": weighted_precision,
        "recall": weighted_recall,
    }


In [None]:
# Step 4: Set up TrainingArguments for continued fine-tuning on the Movies_and_TV dataset
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy`, and newer releases
# no longer accept the old keyword. The install cell does not pin a version, so
# pick whichever name the installed TrainingArguments actually accepts.
_EVAL_STRATEGY_ARG = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/FP/Movies_results_continued",    # Directory to save training outputs
    save_strategy="epoch",                 # Checkpoint once per epoch (same cadence as evaluation)
    learning_rate=2e-5,                    # Adjust learning rate as needed
    per_device_train_batch_size=192,       # Batch size per device during training
    per_device_eval_batch_size=96,         # Batch size per device during evaluation
    gradient_accumulation_steps=1,
    num_train_epochs=5,                    # Upper bound on epochs for continued training
    weight_decay=0.01,
    load_best_model_at_end=True,           # Restore the best checkpoint once training ends
    fp16=True,                             # Mixed-precision training
    dataloader_num_workers=6,
    warmup_steps=500,
    report_to=["wandb"],
    run_name="roberta_movies_and_tv_finetune",  # Set the wandb run name
    logging_steps=50,
    logging_first_step=True,
    **{_EVAL_STRATEGY_ARG: "epoch"},       # Evaluate once per epoch
)

# Size of each split before sub-sampling
total_train = len(tokenized_datasets["train"])
total_eval = len(tokenized_datasets["test"])

print("Total training samples:", total_train)
print("Total evaluation samples:", total_eval)

# Only this fraction of each split is used for training and evaluation (0.05 -> 5%).
TRAIN_EVAL_FRACTION = 0.05


def _take_fraction(split, total):
    """Shuffle ``split`` with seed 42 and keep its first ``int(TRAIN_EVAL_FRACTION * total)`` rows."""
    return split.shuffle(seed=42).select(range(int(TRAIN_EVAL_FRACTION * total)))


train_subset = _take_fraction(tokenized_datasets["train"], total_train)
eval_subset = _take_fraction(tokenized_datasets["test"], total_eval)

# Early stopping with a patience of 2 evaluations, so the run may end before
# the configured number of epochs.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Wire the loaded model, the sub-sampled splits and the metric callback into the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)




Total training samples: 2571408
Total evaluation samples: 642852


In [None]:
# Step 5: Continue fine-tuning on the Movies_and_TV dataset
# The value returned by trainer.train() is kept in train_result. The run can end
# before num_train_epochs=5 because the Trainer was built with an
# EarlyStoppingCallback (early_stopping_patience=2).
train_result = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0892,0.0819,0.970133,0.970155,0.970179,0.970133
2,0.0605,0.077884,0.972279,0.97197,0.971861,0.972279
3,0.0507,0.092339,0.971501,0.971493,0.971485,0.971501
4,0.039,0.101428,0.971439,0.971563,0.971717,0.971439


In [None]:
# Print the best checkpoint and the validation metric it achieved
final_state = trainer.state
print("Best model checkpoint:", final_state.best_model_checkpoint)
print("Best validation metric:", final_state.best_metric)

# Re-evaluate on eval_subset; with load_best_model_at_end=True the trainer is
# holding the best checkpoint's weights at this point.
eval_results = trainer.evaluate()
print("Evaluation results of the best model:", eval_results)


Best model checkpoint: /content/drive/MyDrive/FP/Movies_results_continued/checkpoint-1340
Best validation metric: 0.0778844878077507


Evaluation results of the best model: {'eval_loss': 0.0778844878077507, 'eval_accuracy': 0.9722792607802875, 'eval_f1': 0.9719704797126254, 'eval_precision': 0.9718613153082752, 'eval_recall': 0.9722792607802875, 'eval_runtime': 61.686, 'eval_samples_per_second': 521.059, 'eval_steps_per_second': 5.431, 'epoch': 4.0}


In [None]:
# Save the model held by the trainer after continued fine-tuning as the new final checkpoint
FINAL_MOVIES_CHECKPOINT_DIR = "/content/drive/MyDrive/FP/Checkpoints/final_checkpoint_movies"
trainer.save_model(FINAL_MOVIES_CHECKPOINT_DIR)