In [None]:
# Step 0: Mount Google Drive
# Later cells read and write datasets/model checkpoints under
# /content/drive/MyDrive/..., so the drive has to be mounted at /content/drive first.
# `google.colab` is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Step 1: Install dependencies and import libraries
!pip install -q transformers
!pip install -q torchinfo
!pip install -q datasets
!pip install -q evaluate
!pip install -q optuna
!pip install -q wandb

import wandb
# Log in to wandb. Replace "your_api_key_here" with your actual WANDB API key.
wandb.login(key="your_api_key_here")

from transformers import (AutoTokenizer, AutoModelForSequenceClassification, Trainer,
                          TrainingArguments, EarlyStoppingCallback)
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpeng_zhao[0m ([33mpeng_zhao-university-of-california-berkeley[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Step 2: Load and preprocess the new dataset (Movies_and_TV)
# NOTE: trust_remote_code=True lets the dataset repository run its own loading script.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Movies_and_TV",
    trust_remote_code=True,
)

# Shuffle with a fixed seed, then keep the first 20% of the shuffled rows
shuffled_dataset = dataset["full"].shuffle(seed=42)
subset_size = int(0.2 * len(shuffled_dataset))
subset_dataset = shuffled_dataset.select(range(subset_size))


def _is_not_three_star(review):
    """Return True for reviews whose rating is anything other than 3."""
    return review["rating"] != 3


# Drop 3-star reviews; the remaining ratings are later mapped to binary labels
subset_dataset = subset_dataset.filter(_is_not_three_star)

# DeBERTa-v3-small tokenizer used by the tokenization step below
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

# Batch tokenizer + label builder applied to the dataset via `map`
def tokenize_and_format(examples):
    """Tokenize a batch of reviews and attach binary sentiment labels.

    Args:
        examples: Batch with a "text" column (review bodies) and a "rating" column.

    Returns:
        The tokenizer output (truncated/padded to 256 tokens) with an added
        "labels" list: 1 where rating > 3, otherwise 0.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    encoded["labels"] = [int(stars > 3) for stars in examples["rating"]]
    return encoded

# Tokenize the new dataset
# batched=True hands tokenize_and_format whole batches (column name -> list of values).
tokenized_dataset = subset_dataset.map(tokenize_and_format, batched=True)

# Save a checkpoint of the tokenized dataset
# Stored on Google Drive so the "Step 2'" cell can reload it instead of re-tokenizing.
tokenized_dataset.save_to_disk("/content/drive/MyDrive/FP/Checkpoints/tokenized_movies_checkpoint")

# Split into train/test and set to PyTorch tensors
# 80/20 split with a fixed seed; "Step 2'" repeats this exact call after reloading.
tokenized_datasets = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
tokenized_datasets.set_format("torch")


In [None]:
# Step 2': Load from checkpoint if needed
# Alternative to Step 2: reloads the tokenized dataset that Step 2 saved to Drive and
# rebuilds the same train/test split (same test_size and seed as in Step 2).
# NOTE(review): this cell does not define `tokenizer`; only Step 2 does.
from datasets import load_from_disk
tokenized_dataset = load_from_disk("/content/drive/MyDrive/FP/Checkpoints/tokenized_movies_checkpoint")
tokenized_datasets = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
tokenized_datasets.set_format("torch")


In [None]:
# Step 3: Load the previously fine-tuned DeBERTa model from the CDs training
# The weights come from a checkpoint directory on Google Drive (Drive must be mounted).
# num_labels=2 matches the binary 0/1 labels produced during tokenization.
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/FP/Checkpoints/final_checkpoint_CDs_deberta", num_labels=2
)

# Metrics callback passed to the Trainer for evaluation
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 from logits and labels.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        Dict with keys "accuracy", "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    # Index layout of the sklearn result: (precision, recall, f1, support)
    scores = precision_recall_fscore_support(labels, predicted, average='weighted')
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": scores[2],
        "precision": scores[0],
        "recall": scores[1],
    }


In [None]:
# Step 4: Set up TrainingArguments for continued fine-tuning
import inspect

# `transformers` is installed unpinned, and newer releases renamed the
# `evaluation_strategy` argument to `eval_strategy`. Pick whichever keyword the
# installed TrainingArguments accepts so this cell works on both old and new versions.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/FP/Movies_results_continued",
    save_strategy="epoch",          # must match the evaluation strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=1,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,    # no metric_for_best_model given, so the library default applies
    fp16=True,
    dataloader_num_workers=6,
    warmup_steps=500,
    report_to=["wandb"],
    run_name="deberta_movies_and_tv_finetune",
    logging_steps=50,
    logging_first_step=True,
    **{_eval_strategy_kwarg: "epoch"},  # evaluate once per epoch
)

# Print number of training and evaluation samples
total_train = len(tokenized_datasets["train"])
total_eval = len(tokenized_datasets["test"])
print("Total training samples:", total_train)
print("Total evaluation samples:", total_eval)

# Use partial data for actual training and eval
# Fraction of each split that is actually used; raise it for a fuller (slower) run.
SUBSET_FRACTION = 0.05
train_subset = tokenized_datasets["train"].shuffle(seed=42).select(range(int(SUBSET_FRACTION * total_train)))
eval_subset = tokenized_datasets["test"].shuffle(seed=42).select(range(int(SUBSET_FRACTION * total_eval)))

# Initialize the Trainer
# Early stopping halts training after 2 consecutive evaluations without improvement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)




Total training samples: 2571408
Total evaluation samples: 642852


In [None]:
# Step 5: Continue fine-tuning
train_result = trainer.train()

# Print checkpoint and metrics
print("Best model checkpoint:", trainer.state.best_model_checkpoint)
print("Best validation metric:", trainer.state.best_metric)

eval_results = trainer.evaluate()
print("Evaluation results of the best model:", eval_results)

# Sanity check: with load_best_model_at_end=True the model evaluated above should be
# the best checkpoint, so its metric should reproduce trainer.state.best_metric.
# If the Trainer could not reload that checkpoint (it only logs a warning in that case),
# the numbers above silently describe the LAST epoch instead of the best one.
_best_metric = trainer.state.best_metric
_metric_name = trainer.args.metric_for_best_model or "loss"
_metric_key = _metric_name if _metric_name.startswith("eval_") else f"eval_{_metric_name}"
_reevaluated = eval_results.get(_metric_key)
if _best_metric is not None and _reevaluated is not None:
    # Small relative tolerance to absorb floating-point noise between evaluation runs.
    _tolerance = 1e-3 * max(1.0, abs(_best_metric))
    if abs(_reevaluated - _best_metric) > _tolerance:
        print(
            f"WARNING: re-evaluated {_metric_key}={_reevaluated} does not match the best "
            f"recorded value {_best_metric}; the best checkpoint was probably not reloaded. "
            f"Load it explicitly from {trainer.state.best_model_checkpoint} before saving."
        )


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0951,0.084693,0.968981,0.968496,0.968397,0.968981
2,0.0656,0.078376,0.972279,0.971973,0.971863,0.972279
3,0.0441,0.089033,0.972435,0.972432,0.97243,0.972435
4,0.0323,0.101287,0.971657,0.971783,0.971943,0.971657


Could not locate the best model at /content/drive/MyDrive/FP/Movies_results_continued/checkpoint-2010/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


Best model checkpoint: /content/drive/MyDrive/FP/Movies_results_continued/checkpoint-2010
Best validation metric: 0.07837598025798798


Evaluation results of the best model: {'eval_loss': 0.10128685086965561, 'eval_accuracy': 0.9716570219650302, 'eval_f1': 0.9717833744396753, 'eval_precision': 0.9719426493322775, 'eval_recall': 0.9716570219650302, 'eval_runtime': 76.6332, 'eval_samples_per_second': 419.426, 'eval_steps_per_second': 6.564, 'epoch': 4.0}


In [None]:
# Save the final model
# Writes whatever weights the Trainer currently holds to the Drive checkpoint folder.
# NOTE(review): in the recorded run the Trainer reported it could not locate the best
# checkpoint's weights, and the final evaluation loss equals the last epoch's loss,
# so this may be saving the last-epoch model rather than the best one — verify.
# NOTE(review): the Trainer was built without a tokenizer, so presumably only the
# model is written here — confirm the tokenizer is saved/available separately.
trainer.save_model("/content/drive/MyDrive/FP/Checkpoints/final_checkpoint_movies_deberta")