<a href="https://colab.research.google.com/github/junyizhou0304/LLM-NLP-CV-Projects/blob/main/Fake_and_Real_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset: fake-and-real-news-dataset
https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset/data?select=True.csv

Models: ULMFiT (fastai AWD_LSTM pre-trained language model), compared against a TF-IDF + Logistic Regression baseline

1. Setup and Data Preparation

In [None]:
# 1. Imports
import pandas as pd
from sklearn.model_selection import train_test_split

# For traditional model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# For ULMFiT
from fastai.text.all import *

In [None]:
# 2. Read the two source CSVs (expected in the working directory):
#    one file of fake articles and one of real articles.
fake_df, true_df = (pd.read_csv(csv_name) for csv_name in ("Fake.csv", "True.csv"))

# Preview the first rows of each frame, fake first and then real.
print(fake_df.head(), true_df.head(), sep="\n")

                                                                                        title  \
0              Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing   
1                        Drunk Bragging Trump Staffer Started Russian Collusion Investigation   
2   Sheriff David Clarke Becomes An Internet Joke For Threatening To Poke People ‘In The Eye’   
3               Trump Is So Obsessed He Even Has Obama’s Name Coded Into His Website (IMAGES)   
4                       Pope Francis Just Called Out Donald Trump During His Christmas Speech   

                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [None]:
# 3-4. Tag each source with its class (0 = FAKE, 1 = TRUE) and merge the
#      headline and body into a single "full_text" field. Both frames are
#      expected to expose 'title' and 'text' columns; only the two columns
#      used downstream are kept.
keep_cols = ["full_text", "label"]

fake_df = fake_df.assign(
    label=0,
    full_text=fake_df["title"].astype(str) + ". " + fake_df["text"].astype(str),
)[keep_cols]

true_df = true_df.assign(
    label=1,
    full_text=true_df["title"].astype(str) + ". " + true_df["text"].astype(str),
)[keep_cols]

# 5. Stack fake rows on top of true rows and renumber the index from 0
df = pd.concat([fake_df, true_df], ignore_index=True)
print(df["label"].value_counts())

label
0    23481
1    21417
Name: count, dtype: int64


In [None]:
# 6. Basic cleaning: discard rows whose combined text is shorter than min_chars
min_chars = 30
long_enough = df["full_text"].str.len() >= min_chars
df = df.loc[long_enough].reset_index(drop=True)
print("Remaining samples:", len(df))

# 7. Hold out 10% of the rows as the test set, stratified on the label;
#    the remaining 90% is later divided into train and validation.
#    Both parts get a fresh 0..n-1 index.
train_valid_df, test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df, test_size=0.1, random_state=42, stratify=df["label"]
    )
)

print("Train+Valid size:", len(train_valid_df))
print("Test size:", len(test_df))

Remaining samples: 44898
Train+Valid size: 40408
Test size: 4490


2. ULMFiT Model (fastai AWD_LSTM)

2.1. Fine-tune the language model on the news corpus

In [None]:
# 8. DataLoaders for the LANGUAGE MODEL stage (is_lm=True, so no label column
#    is needed). 10% of train_valid_df is held out for validation with a
#    fixed seed; lower bs if the GPU runs out of memory.
lm_dl_kwargs = dict(text_col="full_text", is_lm=True, valid_pct=0.1, seed=42, bs=64)
dls_lm = TextDataLoaders.from_df(train_valid_df, **lm_dl_kwargs)

# 9. Language-model learner on the AWD_LSTM architecture, tracking next-token
#    accuracy and perplexity, then switched to mixed precision.
learn_lm = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.5, metrics=[accuracy, Perplexity()]
)
learn_lm = learn_lm.to_fp16()

In [None]:
# 10. Fine-tune language model for exactly ONE epoch.
# 1 epoch is often enough to adapt style to this corpus for the assignment.
# NOTE: `fine_tune(1)` is not a single epoch: it runs `freeze_epochs=1` frozen
# epoch and then 1 more epoch after unfreezing, i.e. two full passes (~16 min
# each in the recorded run). A single one-cycle epoch matches the stated
# budget; the pretrained learner is assumed to still be frozen here, as
# language_model_learner leaves it by default.
learn_lm.fit_one_cycle(1, 2e-2)

# 11. Save encoder for classifier (written under the learner's model directory)
learn_lm.save_encoder("fake_news_lm_encoder")

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.815439,3.643878,0.344883,38.239834,15:42


epoch,train_loss,valid_loss,accuracy,perplexity,time


KeyboardInterrupt: 

2.2. Text Classifier using the fine-tuned encoder

In [None]:
# 12. Create TextDataLoaders for CLASSIFICATION (is_lm=False).
# The classifier must numericalize text with the SAME vocabulary the language
# model was fine-tuned on: the encoder's embedding rows are indexed by token
# id, so an independently built vocab would misalign (or mis-size) the
# weights loaded below. `text_vocab=dls_lm.vocab` enforces that.
dls_clas = TextDataLoaders.from_df(
    train_valid_df,
    text_col="full_text",
    label_col="label",
    text_vocab=dls_lm.vocab,
    valid_pct=0.1,
    seed=42,
    bs=64
)

# 13. Create classifier learner (AWD_LSTM body + new classification head),
# in mixed precision
learn_clas = text_classifier_learner(
    dls_clas,
    AWD_LSTM,
    drop_mult=0.5,
    metrics=accuracy
).to_fp16()

# 14. Load the encoder saved by the language-model stage
learn_clas = learn_clas.load_encoder("fake_news_lm_encoder")

In [None]:
# 15. ULMFiT-style gradual unfreezing: train progressively deeper parts of
#     the network, shrinking the learning rates at every stage.

# Stage 1: train with the learner in its initial (frozen) state, so only the
#          classifier head is updated
learn_clas.fit_one_cycle(1, 2e-2)

# Stages 2-3: unfreeze the last two, then the last three, parameter groups,
#             one epoch each with discriminative learning rates
for freeze_upto, lr_range in (
    (-2, slice(1e-3, 1e-2)),
    (-3, slice(1e-4, 1e-3)),
):
    learn_clas.freeze_to(freeze_upto)
    learn_clas.fit_one_cycle(1, lr_range)

# Stage 4: unfreeze everything and train for two epochs at the lowest rates
learn_clas.unfreeze()
learn_clas.fit_one_cycle(2, slice(1e-5, 1e-4))

2.3. Evaluate ULMFiT on Test Set

In [None]:
# 16. Wrap the held-out test texts in a DataLoader built from the classifier's
#     own DataLoaders, so they go through the same preprocessing
test_dl = learn_clas.dls.test_dl(test_df["full_text"])

# 17. Model outputs -> predicted class index per row.
#     NOTE(review): comparing indices to labels assumes class index i maps to
#     label value i (0 = FAKE, 1 = TRUE) - confirm against learn_clas.dls.vocab.
preds = learn_clas.get_preds(dl=test_dl)[0]
pred_labels = preds.argmax(dim=1).numpy()

# 18. Ground-truth labels, in the same row order as the test texts
y_test = test_df["label"].values

# 19. Accuracy, per-class report and confusion matrix
ulmfit_acc = (pred_labels == y_test).mean()
print("ULMFiT Test Accuracy:", ulmfit_acc)

print(
    "ULMFiT Classification Report:",
    classification_report(y_test, pred_labels, digits=4),
    sep="\n",
)

print("ULMFiT Confusion Matrix:", confusion_matrix(y_test, pred_labels), sep="\n")

3. Traditional Model: TF-IDF + Logistic Regression

In [None]:
# 20. Pull the raw texts and labels out as NumPy arrays for scikit-learn
X_train_valid, y_train_valid = (
    train_valid_df[col].values for col in ("full_text", "label")
)
X_test, y_test = (test_df[col].values for col in ("full_text", "label"))

# Divide the train+valid pool 90/10 into train and validation,
# stratified on the label with a fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, y_train_valid,
    test_size=0.1, random_state=42, stratify=y_train_valid,
)

In [None]:
# 21. TF-IDF features: unigrams + bigrams, English stop words removed,
#     vocabulary capped at 20,000 terms
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)

# Fit the vocabulary/IDF weights on the training texts only, then apply the
# fitted vectorizer unchanged to the validation and test texts
X_train_tfidf = tfidf.fit_transform(X_train)
X_valid_tfidf, X_test_tfidf = (tfidf.transform(texts) for texts in (X_valid, X_test))

print("TF-IDF shape:", X_train_tfidf.shape)

TF-IDF shape: (36367, 20000)


In [None]:
# 22. Baseline classifier: logistic regression on the TF-IDF features
#     (fit() returns the fitted estimator, so construction and training chain)
log_reg = LogisticRegression(C=1.0, max_iter=200, n_jobs=-1).fit(
    X_train_tfidf, y_train
)

# 23. Accuracy on the validation split
y_valid_pred = log_reg.predict(X_valid_tfidf)
valid_acc = accuracy_score(y_valid, y_valid_pred)
print("Validation Accuracy (LogReg + TF-IDF):", valid_acc)

Validation Accuracy (LogReg + TF-IDF): 0.9888641425389755


In [None]:
# 24. Score the baseline on the held-out test set:
#     accuracy, per-class report and confusion matrix
y_test_pred = log_reg.predict(X_test_tfidf)
baseline_acc = accuracy_score(y_test, y_test_pred)
print("Test Accuracy (LogReg + TF-IDF):", baseline_acc)

print(
    "Baseline Classification Report:",
    classification_report(y_test, y_test_pred, digits=4),
    sep="\n",
)

print("Baseline Confusion Matrix:", confusion_matrix(y_test, y_test_pred), sep="\n")

Test Accuracy (LogReg + TF-IDF): 0.9888641425389755
Baseline Classification Report:
              precision    recall  f1-score   support

           0     0.9919    0.9868    0.9893      2348
           1     0.9856    0.9911    0.9884      2142

    accuracy                         0.9889      4490
   macro avg     0.9887    0.9890    0.9888      4490
weighted avg     0.9889    0.9889    0.9889      4490

Baseline Confusion Matrix:
[[2317   31]
 [  19 2123]]


4. Quick Comparison for Report

In [None]:
# Side-by-side test accuracy of the two models, rounded to 4 decimals
print(
    f"ULMFiT Test Accuracy:       {ulmfit_acc:.4f}",
    f"LogReg + TF-IDF Accuracy:   {baseline_acc:.4f}",
    sep="\n",
)

ULMFiT Test Accuracy:       0.9998
LogReg + TF-IDF Accuracy:   0.9889


In [None]:
import joblib

# Persist the fitted vectorizer and classifier together as one
# (vectorizer, model) tuple so they are always reloaded as a matching pair
baseline_bundle = (tfidf, log_reg)
joblib.dump(baseline_bundle, filename="tfidf_logreg.pkl")
print("Saved tfidf_logreg.pkl")


Saved tfidf_logreg.pkl


In [None]:
import joblib

# Round-trip check: reload the saved artifact and show its container type.
# joblib files are pickle-based, so only load files from a trusted source.
obj = joblib.load(filename="tfidf_logreg.pkl")
print(type(obj))


<class 'tuple'>
