<a href="https://colab.research.google.com/github/jcvancity2022/NLP-Week1-Text-Classification/blob/main/true_fake_news_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Fake News Detection Using DistilBERT
This project fine-tunes a DistilBERT model to classify news articles
as Fake (0) or Real (1). We combined multiple public misinformation
datasets and trained a binary classifier using HuggingFace Transformers.

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Setup Python Libraries (pip)

In [19]:
#install some Python packages with pip

!pip install numpy torch datasets transformers evaluate --quiet



In [20]:
# let's check the version we are using

!pip freeze | grep -E '^numpy|^torch|^datasets|^transformers|^evaluate'

datasets==4.0.0
evaluate==0.4.6
numpy==2.0.2
torch==2.8.0+cu126
torchao==0.10.0
torchaudio==2.8.0+cu126
torchdata==0.11.0
torchsummary==1.5.1
torchtune==0.6.1
torchvision==0.23.0+cu126
transformers==4.57.1


# Create IMDB Dataset for Fine-tuning BERT

In [21]:
class_labels = {
    0: "Fake",
    1: "Real"
}


## Let's load the IMDB Dataset

fake-and-real-news-dataset

In [22]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Load both CSV files
df_fake = pd.read_csv("/content/drive/MyDrive/fake-and-real-news-dataset/Fake.csv")
df_true = pd.read_csv("/content/drive/MyDrive/fake-and-real-news-dataset/True.csv")

# Assign numeric labels
df_fake["label"] = 0
df_true["label"] = 1

# Class labels
class_labels = {
    0: "Fake",
    1: "Real"
}

# Keep the proper columns
df_fake = df_fake[["title", "text", "subject", "date", "label"]]
df_true = df_true[["title", "text", "subject", "date", "label"]]

# Combine + shuffle
df_all = pd.concat([df_fake, df_true], axis=0).sample(frac=1).reset_index(drop=True)

df_all.head()


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",1
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",1
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",0
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",1


Fake News Detection

In [23]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Load both CSV files
df_fake = pd.read_csv("/content/drive/MyDrive/Fake News Detection/fake.csv")
df_true = pd.read_csv("/content/drive/MyDrive/Fake News Detection/true.csv")

# Assign numeric labels
df_fake["label"] = 0
df_true["label"] = 1

# Class labels
class_labels = {
    0: "Fake",
    1: "Real"
}

# Keep the proper columns
df_fake = df_fake[["title", "text", "subject", "date", "label"]]
df_true = df_true[["title", "text", "subject", "date", "label"]]

# Combine + shuffle
df_all = pd.concat([df_fake, df_true], axis=0).sample(frac=1).reset_index(drop=True)

df_all.head()

Unnamed: 0,title,text,subject,date,label
0,"White House targets leakers, may restructure c...",WASHINGTON (Reuters) - President Donald Trump’...,politicsNews,"July 26, 2017",1
1,Atlanta Mayor And Newspaper DESTROY Trump For...,Donald Trump called Atlanta a crime infested ...,News,"January 15, 2017",0
2,Canada frets over possible huge surge in asylu...,OTTAWA (Reuters) - Canada fears a huge surge i...,worldnews,"August 23, 2017",1
3,PROOF THEY KNOW SHE’S LOSING…Hillary’s Campaig...,Hillary s foreign policy spokesperson showed h...,politics,"Oct 9, 2016",0
4,What Beyoncé Just Announced She Is Doing For ...,While so many keep trashing Beyonc Knowles fo...,News,"February 22, 2016",0


## Let's create the train, validation, test sets

In [24]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Split into train (70%) and temp (30%)
train_df, temp_df = train_test_split(
    df_all,
    test_size=0.30,
    random_state=42,
    stratify=df_all["label"]
)

# Split temp into val (15%) and test (15%)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42,
    stratify=temp_df["label"]
)

# Convert to HuggingFace Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Bundle into DatasetDict
dataset = DatasetDict({
    "train": train_dataset,
    "val": val_dataset,
    "test": test_dataset
})

# Remove unwanted auto-generated index column
for split in ["train", "val", "test"]:
    dataset[split] = dataset[split].remove_columns(["__index_level_0__"])

dataset



DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'subject', 'date', 'label'],
        num_rows: 31428
    })
    val: Dataset({
        features: ['title', 'text', 'subject', 'date', 'label'],
        num_rows: 6735
    })
    test: Dataset({
        features: ['title', 'text', 'subject', 'date', 'label'],
        num_rows: 6735
    })
})

## We start by tokenizing our dataset with the BERT's Fast Tokenizer

In [25]:
from transformers import AutoTokenizer

checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

def tokenize_function(batch):
    return tokenizer(batch["text"], truncation=True, padding=True)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["title", "text", "subject", "date"]
)

tokenized_dataset


Map:   0%|          | 0/31428 [00:00<?, ? examples/s]

Map:   0%|          | 0/6735 [00:00<?, ? examples/s]

Map:   0%|          | 0/6735 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 31428
    })
    val: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 6735
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 6735
    })
})

# Setup Training Metrics (Accuracy, F1)

In [26]:
import evaluate
import numpy as np

# Load accuracy and F1 evaluation modules
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

# Define function that HuggingFace Trainer will call
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Convert model logits -> predicted class (0 or 1)
    predictions = np.argmax(logits, axis=-1)

    # Compute accuracy and F1 using HuggingFace's evaluate package
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)

    # Return BOTH metrics
    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"]
    }


# Setup Training Configurations

In [27]:
import os
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "Fake", 1: "Real"},
    label2id={"Fake": 0, "Real": 1}
)

training_args = TrainingArguments(
    seed=42,
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,       # REQUIRED hyperparameter
    weight_decay=0.01,        # REQUIRED hyperparameter
    warmup_ratio=0.1,         # REQUIRED hyperparameter
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    compute_metrics=compute_metrics
)

os.environ["WANDB_DISABLED"] = "true"



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# Evaluate UnFine-Tuned BERT on Test Set for a Baseline Metric


In [28]:
# let's first evaluate unfine-tuned model with test set

trainer.evaluate(tokenized_dataset["test"])

{'eval_loss': 0.7058759927749634,
 'eval_model_preparation_time': 0.0053,
 'eval_accuracy': 0.5174461766889383,
 'eval_f1': 0.0,
 'eval_runtime': 106.6968,
 'eval_samples_per_second': 63.123,
 'eval_steps_per_second': 3.946}

Without fine-tuning BERT, our model currently has around **52% Accuracy (eval_accuracy)** and **19% F1 (eval_f1)**, which is pretty bad due to the test dataset having around 50% positive and 50% negative reviews. 😕


Let's make it better with transfer learning! 🦾

# Fine-Tune BERT with IMDb Dataset

In [None]:
# let's fine-tune BERT with the IMDb dataset

trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,F1
1,0.0001,8.5e-05,0.0053,1.0,1.0


In [None]:
# let's see how well it did in the test set

trainer.evaluate(tokenized_dataset['test'])

**WOAH!** We got a **92% Accuracy (eval_accuracy)** and **92% F1 (eval_f1)** with just **1 epoch**! 🤯

# Try out some examples!

In [None]:
from transformers import pipeline
import torch

device = torch.cuda.current_device()

news_classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device
)


In [None]:
fake_news_example = """
BREAKING: Scientists warn that the sun will explode next Friday unless
all citizens donate $50 immediately. Anonymous experts confirm the threat
but provide no scientific evidence. Social media influencers are urging
everyone to act fast before it's too late.
"""
news_classifier(fake_news_example)


That is **99% POSITIVE**! *justinvitelli* loves the movie!

In [None]:
real_news_example = """
The U.S. Department of Labor released updated employment numbers today,
showing moderate job growth in manufacturing and technology sectors.
Economists say the trend aligns with long-term projections for the year.
"""
news_classifier(real_news_example)


That is **99% NEGATIVE**! *industriousbug16* must hate the movie very badly.

In [None]:
model.save_pretrained("/content/drive/MyDrive/fake-news-final-model/")
tokenizer.save_pretrained("/content/drive/MyDrive/fake-news-final-model/")


# Resources

### If you would like to use this model without running the entire notebook, try the model at my [HuggingFace](https://huggingface.co/wesleyacheng/movie-review-sentiment-classifier-with-bert).

### If you woud like to get this in GitHub, here's my [repo](https://github.com/wesleyacheng/movie-review-sentiment-classifier-with-bert).