In [17]:
# 1. Load code/data
!git clone https://github.com/joms-hub/tagalog-fake-news-detection.git
import os
os.chdir('/kaggle/working/tagalog-fake-news-detection')

# 2. Install packages (if needed)
!pip install transformers datasets evaluate huggingface_hub accelerate torch

Cloning into 'tagalog-fake-news-detection'...
remote: Enumerating objects: 225, done.[K
remote: Counting objects: 100% (225/225), done.[K
remote: Compressing objects: 100% (163/163), done.[K
remote: Total 225 (delta 119), reused 143 (delta 58), pack-reused 0 (from 0)[K
Receiving objects: 100% (225/225), 4.87 MiB | 16.10 MiB/s, done.
Resolving deltas: 100% (119/119), done.


In [None]:
import json

def patch_feature_type(path):
    with open(path, 'r') as f:
        data = json.load(f)
    # Recursively replace "List" with "Sequence"
    def replace_list(obj):
        if isinstance(obj, dict):
            return {k: replace_list(v) for k, v in obj.items()}
        elif obj == "List":
            return "Sequence"
        elif isinstance(obj, list):
            return [replace_list(x) for x in obj]
        else:
            return obj
    data['features'] = replace_list(data['features'])
    with open(path, 'w') as f:
        json.dump(data, f, indent=2)

# Patch all three dataset info files
patch_feature_type("/kaggle/working/tagalog-fake-news-detection/tokenized/MobileBERT_train/dataset_info.json")
patch_feature_type("/kaggle/working/tagalog-fake-news-detection/tokenized/MobileBERT_val/dataset_info.json")
patch_feature_type("/kaggle/working/tagalog-fake-news-detection/tokenized/MobileBERT_test/dataset_info.json")

In [2]:
!pip install --upgrade datasets

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.6.0
    Uninstalling datasets-3.6.0:
      Successfully uninstalled datasets-3.6.0
Successfully installed datasets-4.0.0


In [3]:
# 3. Load preprocessed datasets
from datasets import load_from_disk
train_dataset = load_from_disk("/kaggle/working/tagalog-fake-news-detection/tokenized/MobileBERT_train")
val_dataset = load_from_disk("/kaggle/working/tagalog-fake-news-detection/tokenized/MobileBERT_val")
test_dataset  = load_from_disk("/kaggle/working/tagalog-fake-news-detection/tokenized/MobileBERT_test")


In [4]:
print(train_dataset, val_dataset, test_dataset)

Dataset({
    features: ['label', 'article', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2244
}) Dataset({
    features: ['label', 'article', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 481
}) Dataset({
    features: ['label', 'article', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 481
})


In [5]:

# 4. Set up model
from transformers import MobileBertForSequenceClassification
model_name = "google/mobilebert-uncased"
model = MobileBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

2025-09-11 15:39:07.200674: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757605147.416838      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757605147.468817      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/147M [00:00<?, ?B/s]

In [6]:

# 5. Training with early stopping
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from evaluate import load
import numpy as np

os.environ["WANDB_DISABLED"] = "true"  # Disable wandb logging

def compute_metrics(eval_pred):
    f1_metric = load("f1")
    acc_metric = load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1_result = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    acc_result = acc_metric.compute(predictions=predictions, references=labels)
    return {
        "f1": f1_result["f1"],
        "accuracy": acc_result["accuracy"]
    }

# Adjust batch sizes if you hit RAM limits (Kaggle RAM is 16GB)
training_args = TrainingArguments(
    output_dir='/kaggle/working/tagalog-fake-news-detection/results',
    num_train_epochs=100,
    per_device_train_batch_size=32,   # Reduce from 32 if you get OOM errors
    per_device_eval_batch_size=32,    # Same here
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='/kaggle/working/tagalog-fake-news-detection/logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to=None
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,0.39809,0.780742,0.785863
2,543354.960000,0.27931,0.872135,0.873181
3,0.819100,0.284677,0.890916,0.891892
4,0.819100,0.196547,0.935383,0.935551
5,0.163800,0.12543,0.954244,0.954262
6,0.110100,0.17078,0.939675,0.939709
7,0.110100,0.148582,0.964651,0.964657
8,0.078000,0.211822,0.956341,0.956341
9,0.034000,0.288589,0.935422,0.935551


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

TrainOutput(global_step=639, training_loss=85032.26628324216, metrics={'train_runtime': 594.1722, 'train_samples_per_second': 377.668, 'train_steps_per_second': 11.949, 'total_flos': 1266461289308160.0, 'train_loss': 85032.26628324216, 'epoch': 9.0})

In [27]:
# 6. Logging and Saving Outputs
import json
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

label_names = ['Real', 'Fake']  # Update if your labels are different

predictions = trainer.predict(test_dataset)

output_dir = "/kaggle/working/tagalog-fake-news-detection/outputs"
os.makedirs(output_dir, exist_ok=True)

metrics = predictions.metrics
metrics_file = os.path.join(output_dir, 'mobilebert_metrics.json')
with open(metrics_file, 'w') as f:
    json.dump(metrics, f, indent=2)
print(f"Saved metrics to {metrics_file}")

preds = np.argmax(predictions.predictions, axis=1)
cm = confusion_matrix(predictions.label_ids, preds)
cm_df = pd.DataFrame(cm, index=label_names, columns=[f'Predicted {n}' for n in label_names])
print("\nConfusion Matrix:")
print(cm_df)

cm_file = os.path.join(output_dir, 'mobilebert_cm.csv')
cm_df.to_csv(cm_file)
print(f"Saved confusion matrix to {cm_file}")

results_df = pd.DataFrame({
    'true_label': predictions.label_ids,
    'predicted_label': preds
})
results_file = os.path.join(output_dir, 'mobilebert_predictions.csv')
results_df.to_csv(results_file, index=False)
print(f"Saved raw predictions to {results_file}")

report = classification_report(predictions.label_ids, preds, target_names=label_names, output_dict=True)
report_file = os.path.join(output_dir, 'mobilebert_classification_report.json')
with open(report_file, 'w') as f:
    json.dump(report, f, indent=2)
print("\nClassification Report:")
print(pd.DataFrame(report).transpose())
print(f"Saved classification report to {report_file}")


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Saved metrics to /kaggle/working/tagalog-fake-news-detection/outputs/mobilebert_metrics.json

Confusion Matrix:
      Predicted Real  Predicted Fake
Real             237               4
Fake              18             222
Saved confusion matrix to /kaggle/working/tagalog-fake-news-detection/outputs/mobilebert_cm.csv
Saved raw predictions to /kaggle/working/tagalog-fake-news-detection/outputs/mobilebert_predictions.csv

Classification Report:
              precision    recall  f1-score     support
Real           0.929412  0.983402  0.955645  241.000000
Fake           0.982301  0.925000  0.952790  240.000000
accuracy       0.954262  0.954262  0.954262    0.954262
macro avg      0.955856  0.954201  0.954217  481.000000
weighted avg   0.955801  0.954262  0.954220  481.000000
Saved classification report to /kaggle/working/tagalog-fake-news-detection/outputs/mobilebert_classification_report.json


In [8]:

# 7. Push trained model to HuggingFace
from huggingface_hub import login
login("")  # Replace with your token
trainer.push_to_hub("mobilebert-fnf")

Uploading...:   0%|          | 0.00/98.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jcunado/results/commit/a8ab19a6f9ba2fba37d01f6d2aaca35271c4b4f2', commit_message='mobilebert-fnf', commit_description='', oid='a8ab19a6f9ba2fba37d01f6d2aaca35271c4b4f2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jcunado/results', endpoint='https://huggingface.co', repo_type='model', repo_id='jcunado/results'), pr_revision=None, pr_num=None)

In [28]:
%cd /kaggle/working/tagalog-fake-news-detection/
!git config --global user.name "joms-hub"
!git config --global user.email "21101932@usc.edu.ph"
!git remote set-url origin https://@github.com/joms-hub/tagalog-fake-news-detection.git

/kaggle/working


In [29]:
!git pull
!git add logs outputs
!git commit -m "new tinybert logs and outputs"
!git push

Already up to date.
[main cf48df5] new tinybert logs and outputs
 4 files changed, 520 insertions(+)
 create mode 100644 outputs/mobilebert_classification_report.json
 create mode 100644 outputs/mobilebert_cm.csv
 create mode 100644 outputs/mobilebert_metrics.json
 create mode 100644 outputs/mobilebert_predictions.csv
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 4 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (7/7), 1.08 KiB | 1.08 MiB/s, done.
Total 7 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
remote: Bypassed rule violations for refs/heads/main:[K
remote: 
remote: - Changes must be made through a pull request.[K
remote: 
To https://github.com/joms-hub/tagalog-fake-news-detection.git
   803aa3f..cf48df5  main -> main
