<a href="https://colab.research.google.com/github/joms-hub/tagalog-fake-news-detection/blob/main/notebooks/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing

### 1. Install libraries and set up the repo

In [None]:
!git clone https://github.com/joms-hub/tagalog-fake-news-detection.git
!pip install pandas transformers scikit-learn torch torchvision torchaudio

### 2. Basic Data Inspection

In [None]:
import pandas as pd

# Read the complete labelled corpus from the cloned repository checkout.
df = pd.read_csv("/content/tagalog-fake-news-detection/data/full.csv")

# Peek at the first rows as a quick sanity check of the loaded frame.
preview = df.head()
print(preview)

# Count rows per label value so the class balance is visible.
label_counts = df["label"].value_counts()
print(label_counts)


### 3. Train/Validation/Test Split (70/15/15)

In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 70% of the rows for training and hold out the other 30%,
# stratified on the label column.
train, temp = train_test_split(
    df,
    test_size=0.30,
    stratify=df["label"],
    random_state=42,
)

# Stage 2: halve the held-out rows into validation and test
# (15% of the full data each), again stratified on the label column.
val, test = train_test_split(
    temp,
    test_size=0.50,
    stratify=temp["label"],
    random_state=42,
)

# Report how many rows landed in each split.
for caption, split_frame in (
    ("Train size:", train),
    ("Validation size:", val),
    ("Test size:", test),
):
    print(caption, len(split_frame))

### 4. Tokenizer Setup

In [None]:
# Wrap each pandas split in a HuggingFace Dataset

from datasets import Dataset

# The index of every split is reset (and discarded) before conversion,
# in the order train -> val -> test.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_frame in (train, val, test)
)

# Models and their tokenizers

from transformers import AutoTokenizer

# Short display name -> identifier passed to AutoTokenizer.from_pretrained.
model_names = {
    "TinyBERT": "huawei-noah/TinyBERT_General_4L_312D",
    "DistilBERT": "distilbert-base-multilingual-cased",
    "MobileBERT": "google/mobilebert-uncased",
    "MiniLMv2": "nreimers/MiniLMv2-L6-H384-distilled-from-BERT-base",
    "ELECTRA-small": "google/electra-small-discriminator",
}

# One tokenizer per model, keyed by the same display name.
tokenizers = {
    display_name: AutoTokenizer.from_pretrained(checkpoint)
    for display_name, checkpoint in model_names.items()
}


In [None]:
# Encoding function

def encode(batch, tokenizer, max_length=512, text_column='article'):
    """Tokenize one batch of texts to a fixed length.

    Args:
        batch: Mapping from column name to that column's values for the
            batch; ``batch[text_column]`` is what gets tokenized.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=..., padding=..., max_length=...)``.
        max_length: Length every sequence is truncated/padded to.
            Defaults to 512, the value previously hard-coded here.
        text_column: Name of the column holding the text.
            Defaults to ``'article'``.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        batch[text_column],
        truncation=True,
        # Fixed-length padding, so every encoded row has the same length.
        padding='max_length',
        max_length=max_length,
    )


In [None]:
# Tokenize every split once per model and persist the results

import os

# Folder inside the cloned repo that receives all tokenized splits.
out_dir = "/content/tagalog-fake-news-detection/tokenized"
os.makedirs(out_dir, exist_ok=True)

for name in tokenizers:
    tok = tokenizers[name]
    print(f"Tokenizing for {name}...")

    def tokenize_batch(batch):
        """Encode one batch with the tokenizer of the current model."""
        return encode(batch, tok)

    # Encode all three splits first, in train -> val -> test order.
    train_enc = train_ds.map(tokenize_batch, batched=True)
    val_enc = val_ds.map(tokenize_batch, batched=True)
    test_enc = test_ds.map(tokenize_batch, batched=True)

    # Then write each encoded split to "<out_dir>/<model name>_<split>".
    for split_suffix, encoded in (
        ("train", train_enc),
        ("val", val_enc),
        ("test", test_enc),
    ):
        encoded.save_to_disk(f"{out_dir}/{name}_{split_suffix}")


### 5. Creating a small sample for documentation

In [None]:
# Keep the first 20 training rows as a small example file.
sample = train.iloc[:20]

# Written without the index column.
sample.to_csv(
    "/content/tagalog-fake-news-detection/data/fake_news_sample.csv",
    index=False,
)

In [None]:
# `google.colab` is presumably only importable inside a Google Colab runtime,
# so this cell is expected to fail elsewhere — TODO confirm.
from google.colab import runtime

# NOTE(review): bare expression whose value is only displayed, never stored.
# Looks like leftover exploration; confirm that `runtime` actually exposes
# `notebook_path`, otherwise this raises AttributeError on Run All.
runtime.notebook_path
