# DistilBERT Training/Fine-tuning

### 1. Colab Setup for **Training**

In [3]:
!pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


### 2. Model Training

#### 2.1 Load Preprocessed dataset from Day 2

In [4]:
from datasets import load_from_disk

train_dataset = load_from_disk("/content/tagalog-fake-news-detection/tokenized/DistilBERT_train")
val_dataset   = load_from_disk("/content/tagalog-fake-news-detection/tokenized/DistilBERT_val")
test_dataset  = load_from_disk("/content/tagalog-fake-news-detection/tokenized/DistilBERT_test")

print(train_dataset, val_dataset, test_dataset)


Dataset({
    features: ['label', 'article', 'input_ids', 'attention_mask'],
    num_rows: 2244
}) Dataset({
    features: ['label', 'article', 'input_ids', 'attention_mask'],
    num_rows: 481
}) Dataset({
    features: ['label', 'article', 'input_ids', 'attention_mask'],
    num_rows: 481
})


#### 2.2 Set up model

In [5]:
from transformers import DistilBertForSequenceClassification

model_name = "distilbert-base-multilingual-cased"
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 2.3 Finetuning model with early stopping

In [8]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from evaluate import load
import numpy as np

# Define a function to compute metrics
def compute_metrics(eval_pred):
    f1_metric = load("f1")
    acc_metric = load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1_result = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    acc_result = acc_metric.compute(predictions=predictions, references=labels)
    # Combine results into one dictionary for logging
    return {
        "f1": f1_result["f1"],
        "accuracy": acc_result["accuracy"]
    }

# Define training arguments with early stopping settings
training_args = TrainingArguments(
    output_dir='./tagalog-fake-news-detection/results',
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./tagalog-fake-news-detection/logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to=None
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Start the training process
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,0.183794,0.93136,0.931393
2,0.287200,0.172438,0.95214,0.952183
3,0.133700,0.129322,0.958412,0.95842
4,0.133700,0.166347,0.964646,0.964657
5,0.049600,0.133718,0.970885,0.970894
6,0.024700,0.224556,0.949995,0.950104
7,0.024700,0.161757,0.966726,0.966736


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

TrainOutput(global_step=497, training_loss=0.10229719908424546, metrics={'train_runtime': 1181.8823, 'train_samples_per_second': 18.987, 'train_steps_per_second': 0.601, 'total_flos': 2080797898088448.0, 'train_loss': 0.10229719908424546, 'epoch': 7.0})

In [11]:
trainer.save_model("/content/tagalog-fake-news-detection/trained_models")  # After training, this saves the best checkpoint/model

### 3. Logging and Saving Outputs

In [10]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# Optional: Define label names dynamically
label_names = ['Real', 'Fake']  # Or load from your dataset/processor

# Get predictions on the test set
predictions = trainer.predict(test_dataset)

# Ensure output directory exists
output_dir = "./tagalog-fake-news-detection/outputs"
os.makedirs(output_dir, exist_ok=True)

# Extract and save metrics
metrics = predictions.metrics
metrics_file = os.path.join(output_dir, 'distilbert_metrics.json')
with open(metrics_file, 'w') as f:
    json.dump(metrics, f, indent=2)
print(f"Saved metrics to {metrics_file}")

# Generate and print the confusion matrix
preds = np.argmax(predictions.predictions, axis=1)
cm = confusion_matrix(predictions.label_ids, preds)
cm_df = pd.DataFrame(cm, index=label_names, columns=[f'Predicted {n}' for n in label_names])
print("\nConfusion Matrix:")
print(cm_df)

# Save the confusion matrix to a CSV file
cm_file = os.path.join(output_dir, 'confusion_matrix.csv')
cm_df.to_csv(cm_file)
print(f"Saved confusion matrix to {cm_file}")

# Save raw predictions and true labels for error analysis
results_df = pd.DataFrame({
    'true_label': predictions.label_ids,
    'predicted_label': preds
})
results_file = os.path.join(output_dir, 'predictions.csv')
results_df.to_csv(results_file, index=False)
print(f"Saved raw predictions to {results_file}")

# Print and save a full classification report
report = classification_report(predictions.label_ids, preds, target_names=label_names, output_dict=True)
report_file = os.path.join(output_dir, 'classification_report.json')
with open(report_file, 'w') as f:
    json.dump(report, f, indent=2)
print("\nClassification Report:")
print(pd.DataFrame(report).transpose())
print(f"Saved classification report to {report_file}")

Saved metrics to ./tagalog-fake-news-detection/outputs/distilbert_metrics.json

Confusion Matrix:
      Predicted Real  Predicted Fake
Real             230              11
Fake               4             236
Saved confusion matrix to ./tagalog-fake-news-detection/outputs/confusion_matrix.csv
Saved raw predictions to ./tagalog-fake-news-detection/outputs/predictions.csv

Classification Report:
              precision    recall  f1-score     support
Real           0.982906  0.954357  0.968421  241.000000
Fake           0.955466  0.983333  0.969199  240.000000
accuracy       0.968815  0.968815  0.968815    0.968815
macro avg      0.969186  0.968845  0.968810  481.000000
weighted avg   0.969214  0.968815  0.968809  481.000000
Saved classification report to ./tagalog-fake-news-detection/outputs/classification_report.json


### 6. Push trained model to HuggingFace

In [12]:
!pip install huggingface_hub --upgrade



In [31]:
from huggingface_hub import login

login("YOUR_HF_TOKEN")  # Paste your token here

HTTPError: Invalid user token.

In [25]:
!git add "/content/tagalog-fake-news-detection/notebooks/distilbert_training.ipynb"

In [44]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mnew file:   logs/events.out.tfevents.1756657116.ec4bf2bc78f7.784.2[m
	[32mnew file:   notebooks/distilbert_training.ipynb[m
	[32mnew file:   outputs/classification_report.json[m
	[32mnew file:   outputs/confusion_matrix.csv[m
	[32mnew file:   outputs/distilbert_metrics.json[m
	[32mnew file:   outputs/predictions.csv[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mresults/[m
	[31mtrained_models/[m



In [48]:
import shutil
shutil.copy('/content/drive/MyDrive/Colab Notebooks/{your_notebook}.ipynb', '/content/{your-repo}/')

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/{your_notebook}.ipynb'

In [47]:
!git reset --soft HEAD~1

In [46]:
!git commit -m "saved distilbert_training.ipynb and outputs"
!git push


[main f43ebcd] saved distilbert_training.ipynb and outputs
 6 files changed, 521 insertions(+)
 create mode 100644 logs/events.out.tfevents.1756657116.ec4bf2bc78f7.784.2
 create mode 100644 notebooks/distilbert_training.ipynb
 create mode 100644 outputs/classification_report.json
 create mode 100644 outputs/confusion_matrix.csv
 create mode 100644 outputs/distilbert_metrics.json
 create mode 100644 outputs/predictions.csv
Enumerating objects: 13, done.
Counting objects: 100% (13/13), done.
Delta compression using up to 2 threads
Compressing objects: 100% (11/11), done.
Writing objects: 100% (11/11), 16.88 KiB | 5.63 MiB/s, done.
Total 11 (delta 0), reused 0 (delta 0), pack-reused 0
remote: [1;31merror[m: GH013: Repository rule violations found for refs/heads/main.[K
remote: 
remote: - GITHUB PUSH PROTECTION[K
remote:   —————————————————————————————————————————[K
remote:     Resolve the following violations before pushing again[K
remote: 
remote:     - Push cannot contain secrets