### Step 1: Load Cleaned Data

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


df = pd.read_csv("../dataset/cleaned/clean_news.csv")

# --- FIX: Ensure all 'clean_text' entries are strings (by filling NaNs with empty strings) ---
df['clean_text'] = df['clean_text'].fillna('')
print(f"Total rows after loading: {len(df)}")
print(f"Missing values in 'clean_text' after fillna: {df['clean_text'].isnull().sum()}")

Total rows after loading: 44919
Missing values in 'clean_text' after fillna: 0


  df = pd.read_csv("../dataset/cleaned/clean_news.csv")


### Step 2: Train-Test Split

In [2]:
train_texts, test_texts, train_labels, test_labels = \
    train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)

print(f"Train samples: {len(train_texts)}, Test samples: {len(test_texts)}")

Train samples: 35935, Test samples: 8984


### Step 3: Use BERT Tokenizer

In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

### Step 4: Tokenize

In [4]:
# The data is passed as a list of strings, now guaranteed to contain no NaNs.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

print("Tokenization successful!")
print(f"Input IDs shape (e.g., first element): {train_encodings['input_ids'][0][:10]}")
print("-" * 30)

Tokenization successful!
Input IDs shape (e.g., first element): [101, 2358, 2301, 7318, 2758, 23966, 2686, 6246, 2110, 15532]
------------------------------


### Step 5: Create Torch Dataset

In [5]:
# Step 5: Create a PyTorch Dataset class
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Convert labels to a list and then to a LongTensor
        self.labels = torch.tensor(list(labels), dtype=torch.long)

    def __getitem__(self, idx):
        # Create a dictionary for the batch
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, train_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

print("Dataset objects created.")

Dataset objects created.


### Step 6: Train BERT Model

In [6]:
# We have two classes: 0 (Fake) and 1 (True)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
print("BERT Model initialized.")

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,  # <-- Reduced
    per_device_eval_batch_size=8,   # <-- Reduced
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Continue with Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

print("Trainer initialized. Ready to train!")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model initialized.
Trainer initialized. Ready to train!


### Step 7: Model training

In [8]:
trainer.train()

print("Model trained!")

RuntimeError: MPS backend out of memory (MPS allocated: 9.00 GiB, other allocations: 24.66 MiB, max allowed: 9.07 GiB). Tried to allocate 192.00 MiB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).