In [None]:
# Step 0: Mount Google Drive
# Later cells read and write the tokenized dataset, model checkpoints and
# training output under /content/drive/MyDrive/FP, so this must run first.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Step 1: Install dependencies and import libraries
!pip install -q transformers
!pip install -q torchinfo
!pip install -q datasets
!pip install -q evaluate
!pip install -q optuna
!pip install -q wandb

import os

import wandb

# Log in to Weights & Biases without storing the API key in the notebook.
# Set the WANDB_API_KEY environment variable (e.g. from a Colab secret) before
# running this cell. When it is unset, key is None and wandb uses its default
# login flow (existing credentials or an interactive prompt).
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Import DistilBERT tokenizer and model instead of Roberta
from transformers import (DistilBertTokenizer, DistilBertForSequenceClassification, Trainer,
                          TrainingArguments, EarlyStoppingCallback)
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.2 kB[0m [31m22.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpeng_zhao[0m ([33mpeng_zhao-university-of-california-berkeley[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Step 2: Load and preprocess the new dataset (Movies_and_TV)
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Movies_and_TV",
    trust_remote_code=True,
)

# Take a reproducible 20% sample: shuffle the "full" split with a fixed seed,
# then keep the leading fifth of the shuffled rows.
shuffled_dataset = dataset["full"].shuffle(seed=42)
subset_size = int(len(shuffled_dataset) * 0.2)

# Reviews rated exactly 3 are removed from the sample before labelling.
subset_dataset = shuffled_dataset.select(range(subset_size)).filter(
    lambda review: review["rating"] != 3
)

# DistilBERT tokenizer used by tokenize_and_format below.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Define tokenization function
def tokenize_and_format(examples):
    """Tokenize a batch of reviews and attach binary labels.

    Args:
        examples: A batch mapping that provides a "text" column and a
            "rating" column (this function is applied with batched=True).

    Returns:
        The tokenizer output with an extra "labels" list holding 1 for every
        rating greater than 3 and 0 otherwise.
    """
    # Every review is truncated or padded to a fixed length of 256 tokens.
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    encoded["labels"] = [int(rating > 3) for rating in examples["rating"]]
    return encoded

# Tokenize and label the whole sample in batches
tokenized_dataset = subset_dataset.map(function=tokenize_and_format, batched=True)

# Persist the tokenized data on Drive so it can be reloaded without
# repeating the tokenization
tokenized_dataset.save_to_disk(
    dataset_path="/content/drive/MyDrive/FP/Checkpoints/tokenized_movies_checkpoint"
)

# Seeded 80/20 train/test split, returned as torch tensors
tokenized_datasets = tokenized_dataset.train_test_split(seed=42, test_size=0.2)
tokenized_datasets.set_format(type="torch")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Amazon-Reviews-2023.py:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Movies_and_TV.jsonl:   0%|          | 0.00/8.39G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/3465662 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/3214260 [00:00<?, ? examples/s]

Saving the dataset (0/11 shards):   0%|          | 0/3214260 [00:00<?, ? examples/s]

In [None]:
# Step 2': Reload dataset from checkpoint if needed
from datasets import load_from_disk

# Reads the tokenized data saved in Step 2 and rebuilds the same seeded
# 80/20 split, so Step 2 does not have to be re-run.
tokenized_dataset = load_from_disk(
    "/content/drive/MyDrive/FP/Checkpoints/tokenized_movies_checkpoint"
)
tokenized_datasets = tokenized_dataset.train_test_split(seed=42, test_size=0.2)
tokenized_datasets.set_format(type="torch")


In [None]:
# Step 3: Load the previously fine-tuned DistilBERT model checkpoint
# Training continues from the weights stored in this Drive folder rather than
# from the base pretrained model. The folder name suggests an earlier run on a
# CDs dataset (inferred from the path only; confirm). num_labels=2 matches the
# binary labels produced by tokenize_and_format.
model = DistilBertForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/FP/Checkpoints/final_checkpoint_CDs_distilbert", num_labels=2
)

# Compute evaluation metrics
def compute_metrics(eval_pred):
    """Turn raw evaluation output into classification metrics.

    Args:
        eval_pred: A (logits, labels) pair.

    Returns:
        A dict with "accuracy", "f1", "precision" and "recall". Precision,
        recall and F1 use support-weighted averaging over the classes; note
        that support-weighted recall is numerically equal to accuracy.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(logits, axis=-1)
    weighted_scores = precision_recall_fscore_support(labels, predicted, average='weighted')
    accuracy = accuracy_score(labels, predicted)
    return {
        "accuracy": accuracy,
        "f1": weighted_scores[2],
        "precision": weighted_scores[0],
        "recall": weighted_scores[1],
    }


In [None]:
# Step 4: Setup TrainingArguments for continued fine-tuning
import inspect

# transformers deprecated `evaluation_strategy` in favour of `eval_strategy`
# and scheduled the old name for removal. The install in Step 1 is unpinned,
# so pick whichever keyword the installed release accepts instead of failing
# with a TypeError on a fresh runtime.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/FP/Movies_results_continued",
    # Evaluate and save once per epoch; both are set to the same schedule.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=192,
    per_device_eval_batch_size=96,
    gradient_accumulation_steps=1,
    num_train_epochs=5,
    weight_decay=0.01,
    # Reload the best checkpoint once training finishes.
    load_best_model_at_end=True,
    fp16=True,
    dataloader_num_workers=6,
    warmup_steps=500,
    report_to=["wandb"],
    run_name="distilbert_movies_and_tv_finetune",  # Update the run name
    logging_steps=50,
    logging_first_step=True,
    **{_eval_strategy_key: "epoch"},
)

# Report how many rows each split holds
total_train, total_eval = (len(tokenized_datasets[split]) for split in ("train", "test"))
print("Total training samples:", total_train)
print("Total evaluation samples:", total_eval)

# Keep a seeded random 5% of each split for training/evaluation
train_subset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(int(total_train * 0.05)))
)
eval_subset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)
    .select(range(int(total_eval * 0.05)))
)

# Initialize Trainer for fine-tuning
# Training and evaluation both run on the 5% subsets selected above, not on
# the full train/test splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    compute_metrics=compute_metrics,
    # Stop early once the monitored metric has failed to improve for 2
    # consecutive evaluations (evaluation runs once per epoch here).
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)




Total training samples: 2571408
Total evaluation samples: 642852


In [None]:
# Step 5: Continue fine-tuning on the Movies_and_TV dataset
train_result = trainer.train()

# Print best checkpoint and evaluation metrics
print("Best model checkpoint:", trainer.state.best_model_checkpoint)
print("Best validation metric:", trainer.state.best_metric)
# load_best_model_at_end=True is set in the training arguments, so this
# evaluation runs on eval_subset with the best checkpoint, not the last epoch.
eval_results = trainer.evaluate()
print("Evaluation results of the best model:", eval_results)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1096,0.102446,0.961328,0.961109,0.960941,0.961328
2,0.0755,0.0959,0.964221,0.963633,0.963463,0.964221
3,0.0578,0.105915,0.964812,0.964817,0.964822,0.964812
4,0.0408,0.119757,0.964283,0.964431,0.964606,0.964283


Best model checkpoint: /content/drive/MyDrive/FP/Movies_results_continued/checkpoint-1340
Best validation metric: 0.09590006619691849


Evaluation results of the best model: {'eval_loss': 0.09590006619691849, 'eval_accuracy': 0.9642212681227055, 'eval_f1': 0.9636334025746888, 'eval_precision': 0.9634629014081372, 'eval_recall': 0.9642212681227055, 'eval_runtime': 30.6546, 'eval_samples_per_second': 1048.522, 'eval_steps_per_second': 10.928, 'epoch': 4.0}


In [None]:
# Save the final fine-tuned model
# Written to the Checkpoints folder on Drive, next to the tokenized dataset
# checkpoint and the model checkpoint this run started from.
trainer.save_model("/content/drive/MyDrive/FP/Checkpoints/final_checkpoint_movies_distilbert")