In [1]:
# 1. Load code/data
!git clone https://github.com/joms-hub/tagalog-fake-news-detection.git
import os
os.chdir('/kaggle/working/tagalog-fake-news-detection')

# 2. Install packages (if needed)
!pip install transformers datasets evaluate huggingface_hub accelerate torch

Cloning into 'tagalog-fake-news-detection'...
remote: Enumerating objects: 240, done.[K
remote: Counting objects: 100% (240/240), done.[K
remote: Compressing objects: 100% (173/173), done.[K
remote: Total 240 (delta 133), reused 150 (delta 63), pack-reused 0 (from 0)[K
Receiving objects: 100% (240/240), 4.88 MiB | 13.79 MiB/s, done.
Resolving deltas: 100% (133/133), done.
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloadin

In [2]:
!pip install --upgrade datasets

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.6.0
    Uninstalling datasets-3.6.0:
      Successfully uninstalled datasets-3.6.0
Successfully installed datasets-4.0.0


In [3]:
# 3. Load preprocessed datasets
from datasets import load_from_disk
train_dataset = load_from_disk("/kaggle/working/tagalog-fake-news-detection/tokenized/ELECTRA-small_train")
val_dataset = load_from_disk("/kaggle/working/tagalog-fake-news-detection/tokenized/ELECTRA-small_val")
test_dataset  = load_from_disk("/kaggle/working/tagalog-fake-news-detection/tokenized/ELECTRA-small_test")


In [4]:
print(train_dataset, val_dataset, test_dataset)

Dataset({
    features: ['label', 'article', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2244
}) Dataset({
    features: ['label', 'article', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 481
}) Dataset({
    features: ['label', 'article', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 481
})


In [5]:

# 4. Set up model
from transformers import ElectraForSequenceClassification
model_name = "google/electra-small-discriminator"
model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=2)

2025-09-11 16:21:09.019970: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757607669.187303      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757607669.230533      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

In [6]:

# 5. Training with early stopping
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from evaluate import load
import numpy as np

os.environ["WANDB_DISABLED"] = "true"  # Disable wandb logging

def compute_metrics(eval_pred):
    f1_metric = load("f1")
    acc_metric = load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1_result = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    acc_result = acc_metric.compute(predictions=predictions, references=labels)
    return {
        "f1": f1_result["f1"],
        "accuracy": acc_result["accuracy"]
    }

# Adjust batch sizes if you hit RAM limits (Kaggle RAM is 16GB)
training_args = TrainingArguments(
    output_dir='/kaggle/working/tagalog-fake-news-detection/results',
    num_train_epochs=50,
    per_device_train_batch_size=32,   # Reduce from 32 if you get OOM errors
    per_device_eval_batch_size=32,    # Same here
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='/kaggle/working/tagalog-fake-news-detection/logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to=None
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,0.47526,0.830679,0.831601
2,0.562000,0.34202,0.887509,0.887734
3,0.356600,0.317127,0.893454,0.893971
4,0.356600,0.241907,0.918918,0.918919
5,0.283300,0.213231,0.935514,0.935551
6,0.213900,0.19628,0.93136,0.931393
7,0.213900,0.227377,0.922843,0.923077


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

TrainOutput(global_step=497, training_loss=0.31330383999246947, metrics={'train_runtime': 203.0636, 'train_samples_per_second': 552.536, 'train_steps_per_second': 17.482, 'total_flos': 462123718189056.0, 'train_loss': 0.31330383999246947, 'epoch': 7.0})

In [7]:
# 6. Logging and Saving Outputs
import json
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

label_names = ['Real', 'Fake']  # Update if your labels are different

predictions = trainer.predict(test_dataset)

output_dir = "/kaggle/working/tagalog-fake-news-detection/outputs"
os.makedirs(output_dir, exist_ok=True)

metrics = predictions.metrics
metrics_file = os.path.join(output_dir, 'electra-small_metrics.json')
with open(metrics_file, 'w') as f:
    json.dump(metrics, f, indent=2)
print(f"Saved metrics to {metrics_file}")

preds = np.argmax(predictions.predictions, axis=1)
cm = confusion_matrix(predictions.label_ids, preds)
cm_df = pd.DataFrame(cm, index=label_names, columns=[f'Predicted {n}' for n in label_names])
print("\nConfusion Matrix:")
print(cm_df)

cm_file = os.path.join(output_dir, 'electra-small_cm.csv')
cm_df.to_csv(cm_file)
print(f"Saved confusion matrix to {cm_file}")

results_df = pd.DataFrame({
    'true_label': predictions.label_ids,
    'predicted_label': preds
})
results_file = os.path.join(output_dir, 'electra-small_predictions.csv')
results_df.to_csv(results_file, index=False)
print(f"Saved raw predictions to {results_file}")

report = classification_report(predictions.label_ids, preds, target_names=label_names, output_dict=True)
report_file = os.path.join(output_dir, 'electra-small_classification_report.json')
with open(report_file, 'w') as f:
    json.dump(report, f, indent=2)
print("\nClassification Report:")
print(pd.DataFrame(report).transpose())
print(f"Saved classification report to {report_file}")


Saved metrics to /kaggle/working/tagalog-fake-news-detection/outputs/electra-small_metrics.json

Confusion Matrix:
      Predicted Real  Predicted Fake
Real             220              21
Fake              13             227
Saved confusion matrix to /kaggle/working/tagalog-fake-news-detection/outputs/electra-small_cm.csv
Saved raw predictions to /kaggle/working/tagalog-fake-news-detection/outputs/electra-small_predictions.csv

Classification Report:
              precision    recall  f1-score     support
Real           0.944206  0.912863  0.928270  241.000000
Fake           0.915323  0.945833  0.930328  240.000000
accuracy       0.929314  0.929314  0.929314    0.929314
macro avg      0.929764  0.929348  0.929299  481.000000
weighted avg   0.929794  0.929314  0.929297  481.000000
Saved classification report to /kaggle/working/tagalog-fake-news-detection/outputs/electra-small_classification_report.json


In [9]:

# 7. Push trained model to HuggingFace
from huggingface_hub import login
login("")  # Replace with your token
trainer.push_to_hub("electra-small-fnf")

Uploading...:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jcunado/results/commit/8149fba26c1fc23b771a0c8471a648f6baaa4896', commit_message='electra-small-fnf', commit_description='', oid='8149fba26c1fc23b771a0c8471a648f6baaa4896', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jcunado/results', endpoint='https://huggingface.co', repo_type='model', repo_id='jcunado/results'), pr_revision=None, pr_num=None)

In [10]:
%cd /kaggle/working/tagalog-fake-news-detection/
!git config --global user.name "joms-hub"
!git config --global user.email "21101932@usc.edu.ph"
!git remote set-url origin https://@github.com/joms-hub/tagalog-fake-news-detection.git

/kaggle/working


In [11]:
!git pull
!git add logs outputs
!git commit -m "new electra-small logs and outputs"
!git push

Already up to date.
[main 7832764] new electra-small logs and outputs
 5 files changed, 520 insertions(+)
 create mode 100644 logs/events.out.tfevents.1757607697.bb1aeb4cf9d8.36.0
 create mode 100644 outputs/electra-small_classification_report.json
 create mode 100644 outputs/electra-small_cm.csv
 create mode 100644 outputs/electra-small_metrics.json
 create mode 100644 outputs/electra-small_predictions.csv
Enumerating objects: 12, done.
Counting objects: 100% (12/12), done.
Delta compression using up to 4 threads
Compressing objects: 100% (9/9), done.
Writing objects: 100% (9/9), 4.62 KiB | 2.31 MiB/s, done.
Total 9 (delta 3), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
remote: Bypassed rule violations for refs/heads/main:[K
remote: 
remote: - Changes must be made through a pull request.[K
remote: 
To https://github.com/joms-hub/tagalog-fake-news-detection.git
   5bc787b..7832764  main -> main
