# MobileBERT Training/Fine-tuning

### 1. Colab Setup for **Training**

In [None]:
!git clone https://github.com/joms-hub/tagalog-fake-news-detection.git
!git config --global user.name "joms-hub"
!git config --global user.email "21101932@usc.edu.ph"

%cd /content/tagalog-fake-news-detection
!git remote set-url origin https://"TOKEN"@github.com/joms-hub/tagalog-fake-news-detection.git


In [None]:
# Make Google Drive available inside the Colab VM.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import os

# Set the WANDB_DISABLED environment variable for this process before training starts.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
!pip install transformers datasets evaluate

### 2. Model Training

#### 2.1 Load the preprocessed dataset from Day 2

In [None]:
from datasets import load_from_disk

# Directory in the cloned repository that holds the saved, tokenized splits.
TOKENIZED_DIR = "/content/tagalog-fake-news-detection/tokenized"

# Load the three splits in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    load_from_disk(f"{TOKENIZED_DIR}/MobileBERT_{split}")
    for split in ("train", "val", "test")
)

# Show a summary of each split as a quick sanity check.
print(train_dataset, val_dataset, test_dataset)


#### 2.2 Set up model

In [None]:
from transformers import MobileBertForSequenceClassification

# Pretrained checkpoint to fine-tune, and the size of the classification head.
model_name = "google/mobilebert-uncased"
NUM_LABELS = 2

model = MobileBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
)

#### 2.3 Finetuning model with early stopping

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from evaluate import load
import numpy as np

# Metrics are loaded lazily and cached, so each one is loaded once rather than
# on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = load(name)
    return _METRIC_CACHE[name]


# Define a function to compute metrics
def compute_metrics(eval_pred):
    """Compute weighted F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores (or a tuple whose first element does); ``labels``
            holds the reference class ids.

    Returns:
        A dict with the keys ``"f1"`` (weighted average) and ``"accuracy"``.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the logits; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # The class dimension is the last axis.
    predicted_ids = np.argmax(predictions, axis=-1)

    f1_result = _get_metric("f1").compute(
        predictions=predicted_ids, references=labels, average="weighted"
    )
    acc_result = _get_metric("accuracy").compute(
        predictions=predicted_ids, references=labels
    )
    # Combine results into one dictionary for logging
    return {
        "f1": f1_result["f1"],
        "accuracy": acc_result["accuracy"],
    }

# Define training arguments with early stopping settings
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # Evaluate and checkpoint on the same schedule so the best checkpoint can
    # be restored at the end of training.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Checkpoints are ranked by the "f1" value returned from compute_metrics.
    metric_for_best_model="f1",
    greater_is_better=True,
    # The string "none" disables every logging integration. Python `None` is
    # treated as "use the default", which in transformers 4.x means all
    # installed integrations.
    report_to="none",
)

# Early stopping with a patience of 2 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Wire the model, data splits, metric function and callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    callbacks=[early_stopping],
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

### 3. Logging and Saving Outputs

In [None]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# Optional: Define label names dynamically
# NOTE(review): assumes class id 0 is 'Real' and class id 1 is 'Fake' — confirm
# against the preprocessing step that produced the labels.
label_names = ['Real', 'Fake']  # Or load from your dataset/processor
# Explicit class ids, so the confusion matrix and report always have one
# row/column per class even if a class is absent from the test split.
label_ids = list(range(len(label_names)))

# Get predictions on the test set
predictions = trainer.predict(test_dataset)

# Ensure output directory exists
output_dir = "./outputs"
os.makedirs(output_dir, exist_ok=True)

# Extract and save metrics
metrics = predictions.metrics
metrics_file = os.path.join(output_dir, 'mobilebert_metrics.json')
with open(metrics_file, 'w') as f:
    json.dump(metrics, f, indent=2)
print(f"Saved metrics to {metrics_file}")

# Turn the raw model outputs into predicted class ids.
logits = predictions.predictions
if isinstance(logits, tuple):
    # Some models return extra outputs next to the logits; the logits come first.
    logits = logits[0]
preds = np.argmax(logits, axis=-1)

# Generate and print the confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(predictions.label_ids, preds, labels=label_ids)
cm_df = pd.DataFrame(cm, index=label_names, columns=[f'Predicted {n}' for n in label_names])
print("\nConfusion Matrix:")
print(cm_df)

# Save the confusion matrix to a CSV file
cm_file = os.path.join(output_dir, 'mobilebert_cm.csv')
cm_df.to_csv(cm_file)
print(f"Saved confusion matrix to {cm_file}")

# Save raw predictions and true labels for error analysis
results_df = pd.DataFrame({
    'true_label': predictions.label_ids,
    'predicted_label': preds
})
results_file = os.path.join(output_dir, 'mobilebert_predictions.csv')
results_df.to_csv(results_file, index=False)
print(f"Saved raw predictions to {results_file}")

# Print and save a full classification report
report = classification_report(
    predictions.label_ids,
    preds,
    labels=label_ids,
    target_names=label_names,
    output_dict=True,
)
report_file = os.path.join(output_dir, 'mobilebert_classification_report.json')
with open(report_file, 'w') as f:
    json.dump(report, f, indent=2)
print("\nClassification Report:")
print(pd.DataFrame(report).transpose())
print(f"Saved classification report to {report_file}")

### 4. Push trained model to HuggingFace

In [None]:
!pip install huggingface_hub --upgrade

In [None]:
from huggingface_hub import login

# Called without a token, login() prompts for it interactively, so the token is
# never written into the cell source. This notebook is later copied into the
# git repository, so a pasted token would be committed with it.
login()

In [None]:
# The first positional parameter of Trainer.push_to_hub is the commit message,
# not the repository name. The target repository comes from args.hub_model_id,
# falling back to the name of output_dir, so set it before pushing.
HUB_MODEL_ID = "mobilebert_fake_news_filipino"
trainer.args.hub_model_id = HUB_MODEL_ID
trainer.push_to_hub(commit_message="Fine-tuned MobileBERT for Filipino fake news detection")

In [None]:
import shutil

# Copy this notebook from Google Drive into the repository's notebooks location.
notebook_src = '/content/drive/MyDrive/Colab Notebooks/mobilebert_training.ipynb'
repo_notebooks_dir = '/content/tagalog-fake-news-detection/notebooks'

shutil.copy(notebook_src, repo_notebooks_dir)

In [None]:
!pip install nbstripout
!nbstripout '/content/tagalog-fake-news-detection/notebooks/mobilebert_training.ipynb'