In [1]:
# 1. Load code/data
!git clone https://github.com/joms-hub/tagalog-fake-news-detection.git
import os
os.chdir('/kaggle/working/tagalog-fake-news-detection')

# 2. Install packages (if needed)
!pip install transformers datasets evaluate huggingface_hub accelerate torch

fatal: destination path 'tagalog-fake-news-detection' already exists and is not an empty directory.


In [None]:
import json

def patch_feature_type(path):
    with open(path, 'r') as f:
        data = json.load(f)
    # Recursively replace "List" with "Sequence"
    def replace_list(obj):
        if isinstance(obj, dict):
            return {k: replace_list(v) for k, v in obj.items()}
        elif obj == "List":
            return "Sequence"
        elif isinstance(obj, list):
            return [replace_list(x) for x in obj]
        else:
            return obj
    data['features'] = replace_list(data['features'])
    with open(path, 'w') as f:
        json.dump(data, f, indent=2)

# Patch all three dataset info files
patch_feature_type("/kaggle/working/tagalog-fake-news-detection/tokenized/MiniLMv2_train/dataset_info.json")
patch_feature_type("/kaggle/working/tagalog-fake-news-detection/tokenized/MiniLMv2_val/dataset_info.json")
patch_feature_type("/kaggle/working/tagalog-fake-news-detection/tokenized/MiniLMv2_test/dataset_info.json")

In [2]:
#!pip install --upgrade datasets

# 3. Load preprocessed datasets
from datasets import load_from_disk
train_dataset = load_from_disk("/kaggle/working/tagalog-fake-news-detection/tokenized/MiniLMv2_train")
val_dataset = load_from_disk("/kaggle/working/tagalog-fake-news-detection/tokenized/MiniLMv2_val")
test_dataset  = load_from_disk("/kaggle/working/tagalog-fake-news-detection/tokenized/MiniLMv2_test")


In [3]:
print(train_dataset, val_dataset, test_dataset)

Dataset({
    features: ['label', 'article', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2244
}) Dataset({
    features: ['label', 'article', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 481
}) Dataset({
    features: ['label', 'article', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 481
})


In [5]:

# 4. Set up model
from transformers import AutoModelForSequenceClassification

model_name = "nreimers/MiniLMv2-L6-H384-distilled-from-BERT-Base"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/46.4M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nreimers/MiniLMv2-L6-H384-distilled-from-BERT-Base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:

# 5. Training with early stopping
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from evaluate import load
import numpy as np

os.environ["WANDB_DISABLED"] = "true"  # Disable wandb logging

def compute_metrics(eval_pred):
    f1_metric = load("f1")
    acc_metric = load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1_result = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    acc_result = acc_metric.compute(predictions=predictions, references=labels)
    return {
        "f1": f1_result["f1"],
        "accuracy": acc_result["accuracy"]
    }

# Adjust batch sizes if you hit RAM limits (Kaggle RAM is 16GB)
training_args = TrainingArguments(
    output_dir='/kaggle/working/tagalog-fake-news-detection/results',
    num_train_epochs=10,
    per_device_train_batch_size=32,   # Reduce from 32 if you get OOM errors
    per_device_eval_batch_size=32,    # Same here
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='/kaggle/working/tagalog-fake-news-detection/logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to=None
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,0.374828,0.872669,0.873181
2,0.491800,0.289409,0.874497,0.87526
3,0.294400,0.275687,0.900203,0.900208
4,0.294400,0.239117,0.902284,0.902287
5,0.221400,0.241023,0.906275,0.906445
6,0.183400,0.220114,0.916829,0.91684
7,0.183400,0.217517,0.925155,0.925156
8,0.142300,0.244953,0.927159,0.927235
9,0.119800,0.230172,0.931384,0.931393
10,0.119800,0.230456,0.935535,0.935551


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

TrainOutput(global_step=710, training_loss=0.22332385190775697, metrics={'train_runtime': 342.0902, 'train_samples_per_second': 65.597, 'train_steps_per_second': 2.075, 'total_flos': 744240768860160.0, 'train_loss': 0.22332385190775697, 'epoch': 10.0})

In [7]:
# 6. Logging and Saving Outputs
import json
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

label_names = ['Real', 'Fake']  # Update if your labels are different

predictions = trainer.predict(test_dataset)

output_dir = "/kaggle/working/tagalog-fake-news-detection/outputs"
os.makedirs(output_dir, exist_ok=True)

metrics = predictions.metrics
metrics_file = os.path.join(output_dir, 'minilmv2_metrics.json')
with open(metrics_file, 'w') as f:
    json.dump(metrics, f, indent=2)
print(f"Saved metrics to {metrics_file}")

preds = np.argmax(predictions.predictions, axis=1)
cm = confusion_matrix(predictions.label_ids, preds)
cm_df = pd.DataFrame(cm, index=label_names, columns=[f'Predicted {n}' for n in label_names])
print("\nConfusion Matrix:")
print(cm_df)

cm_file = os.path.join(output_dir, 'minilmv2_cm.csv')
cm_df.to_csv(cm_file)
print(f"Saved confusion matrix to {cm_file}")

results_df = pd.DataFrame({
    'true_label': predictions.label_ids,
    'predicted_label': preds
})
results_file = os.path.join(output_dir, 'minilmv2_predictions.csv')
results_df.to_csv(results_file, index=False)
print(f"Saved raw predictions to {results_file}")

report = classification_report(predictions.label_ids, preds, target_names=label_names, output_dict=True)
report_file = os.path.join(output_dir, 'minilmv2_classification_report.json')
with open(report_file, 'w') as f:
    json.dump(report, f, indent=2)
print("\nClassification Report:")
print(pd.DataFrame(report).transpose())
print(f"Saved classification report to {report_file}")


Saved metrics to /kaggle/working/tagalog-fake-news-detection/outputs/minilmv2_metrics.json

Confusion Matrix:
      Predicted Real  Predicted Fake
Real             223              18
Fake              21             219
Saved confusion matrix to /kaggle/working/tagalog-fake-news-detection/outputs/minilmv2_cm.csv
Saved raw predictions to /kaggle/working/tagalog-fake-news-detection/outputs/minilmv2_predictions.csv

Classification Report:
              precision    recall  f1-score     support
Real           0.913934  0.925311  0.919588  241.000000
Fake           0.924051  0.912500  0.918239  240.000000
accuracy       0.918919  0.918919  0.918919    0.918919
macro avg      0.918993  0.918906  0.918913  481.000000
weighted avg   0.918982  0.918919  0.918915  481.000000
Saved classification report to /kaggle/working/tagalog-fake-news-detection/outputs/minilmv2_classification_report.json


In [8]:

# 7. Push trained model to HuggingFace
from huggingface_hub import login
login("")  # Replace with your token
trainer.push_to_hub("minilmv2-fake-news-filipino")

Uploading...:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jcunado/results/commit/ed7455caa7cd45b38e9597550e3708633ed42524', commit_message='minilmv2-fake-news-filipino', commit_description='', oid='ed7455caa7cd45b38e9597550e3708633ed42524', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jcunado/results', endpoint='https://huggingface.co', repo_type='model', repo_id='jcunado/results'), pr_revision=None, pr_num=None)

In [10]:
%cd /kaggle/working/tagalog-fake-news-detection/
!git config --global user.name "joms-hub"
!git config --global user.email "21101932@usc.edu.ph"
!git remote set-url origin https://@github.com/joms-hub/tagalog-fake-news-detection.git

/kaggle/working


In [11]:
!git pull
!git add logs outputs
!git commit -m "minilmv2 logs and outputs"
!git push

Already up to date.
[main e90d908] minilmv2 logs and outputs
 5 files changed, 520 insertions(+)
 create mode 100644 logs/events.out.tfevents.1756916648.acba49942f64.173.0
 create mode 100644 outputs/minilmv2_classification_report.json
 create mode 100644 outputs/minilmv2_cm.csv
 create mode 100644 outputs/minilmv2_metrics.json
 create mode 100644 outputs/minilmv2_predictions.csv
Enumerating objects: 12, done.
Counting objects: 100% (12/12), done.
Delta compression using up to 4 threads
Compressing objects: 100% (9/9), done.
Writing objects: 100% (9/9), 5.14 KiB | 2.57 MiB/s, done.
Total 9 (delta 3), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
remote: Bypassed rule violations for refs/heads/main:[K
remote: 
remote: - Changes must be made through a pull request.[K
remote: 
To https://github.com/joms-hub/tagalog-fake-news-detection.git
   9566d82..e90d908  main -> main
