In [None]:
# 15/06/25
# CSC354 - Assignment4 - ML
# Khadija Sheikh, Hamna Asghar
# FA22-BCS-095, 120
# Using ML algorithms to classify real vs. fake news

import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder

# Load data
def _load_documents(csv_path):
    """Return one text document per CSV record stored in ``csv_path``.

    The file is parsed as CSV instead of being split on raw newlines, so
    that (a) the header line is not turned into a training sample and
    (b) a quoted field containing line breaks stays inside a single record.
    All fields of a record are joined with spaces, which keeps every column
    in the document while preventing the last word of one field from being
    fused with the first word of the next once punctuation is stripped.

    NOTE(review): assumes the first line of each file is a column header
    (the pandas default) -- confirm against the actual data files.
    """
    records = pd.read_csv(csv_path, encoding='utf-8', dtype=str, keep_default_na=False)
    return [' '.join(fields) for fields in records.itertuples(index=False, name=None)]


real_news = _load_documents('True.csv')
fake_news = _load_documents('Fake.csv')

# Create labels
texts = real_news + fake_news
labels = ['real'] * len(real_news) + ['fake'] * len(fake_news)

df = pd.DataFrame({'text': texts, 'label': labels})
# Normalise: lowercase, keep only a-z and whitespace, trim the ends.
df['text'] = df['text'].str.lower().str.replace(r'[^a-z\s]', '', regex=True).str.strip()

# Records that are empty after cleaning carry no tokens for the vectorizer,
# so they are removed rather than kept as featureless samples.
df = df[df['text'] != ''].reset_index(drop=True)

# LabelEncoder assigns codes in sorted class order: 'fake' -> 0, 'real' -> 1.
le = LabelEncoder()
df['label'] = le.fit_transform(df['label']) # real: 1, fake: 0

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# TF-IDF Vectorization
# The vocabulary is capped at 1000 terms and the sparse matrix is densified,
# because PCA below is given a dense array.
# NOTE(review): the vectorizer (and the PCA below) are fitted on every row
# before any train/test split is made, so held-out rows influence the
# features the models are later scored on. Fitting them inside a Pipeline
# per split would remove that leakage -- left as-is here because later cells
# consume X / X_pca directly.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['text']).toarray()
y = df['label']

# Apply PCA to 2 components
# random_state is pinned (42, the seed used for every split in this
# notebook) so that X_pca is reproducible across kernel restarts even when
# scikit-learn's automatic solver selection picks a randomized SVD.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

In [None]:
# Evaluation function
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

def _report_holdout(title, model, X, y, stratify=None, lead=""):
    """Fit ``model`` on a seeded 70/30 hold-out split and print accuracy and F1.

    ``stratify`` is forwarded to ``train_test_split`` (``None`` gives a plain
    random split). ``lead`` is prepended to the first printed line only and is
    used to emit the blank separator line between report sections.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=stratify, random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{lead}{title} Accuracy:", accuracy_score(y_test, y_pred))
    print(f"{title} F1 Score:", f1_score(y_test, y_pred))


def _report_cv(title, model, X, y, cv):
    """Cross-validate ``model`` once with splitter ``cv`` and print mean accuracy and F1."""
    # Local import: cross_validate is not part of this notebook's import cell.
    from sklearn.model_selection import cross_validate

    # One pass computes both metrics on the same fitted folds; running
    # cross_val_score once per metric would refit every fold twice.
    scores = cross_validate(model, X, y, cv=cv, scoring=('accuracy', 'f1'))
    print(f"\n{title} Accuracy:", scores['test_accuracy'].mean())
    print(f"{title} F1 Score:", scores['test_f1'].mean())


def evaluate_model(name, model, X, y):
    """Print accuracy and F1 for ``model`` under four evaluation protocols.

    Protocols, in order: 70/30 random split, 70/30 stratified split,
    10-fold shuffled K-fold CV, and 10-fold shuffled stratified K-fold CV.
    All splits use ``random_state=42``.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    model : estimator
        Object exposing ``fit`` / ``predict``. It is fitted in place by the
        two hold-out protocols, so after the call it is left trained on the
        stratified training split.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels. ``f1_score`` and the ``'f1'`` scorer are used with
        their defaults.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    print(f"\n{name} Evaluation:")

    # 70/30 Random Split
    _report_holdout("70/30 Random Split", model, X, y)

    # 70/30 Stratified Split
    _report_holdout("70/30 Stratified Split", model, X, y, stratify=y, lead="\n")

    # 10-fold Random CV
    _report_cv("10-fold Random CV", model, X, y,
               KFold(n_splits=10, shuffle=True, random_state=42))

    # 10-fold Stratified CV
    _report_cv("10-fold Stratified CV", model, X, y,
               StratifiedKFold(n_splits=10, shuffle=True, random_state=42))

In [None]:
# Evaluate Naïve Bayes Models
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# Each entry: (report label, estimator, feature matrix it is scored on).
# Gaussian NB is run on the 2-component PCA features; the Multinomial and
# Bernoulli variants are run on the TF-IDF matrix X.
nb_runs = [
    ("Simple (Gaussian) NB", GaussianNB(), X_pca),
    ("\nMultinomial NB", MultinomialNB(), X),
    ("\nBernoulli NB", BernoulliNB(), X),
]
for nb_label, nb_model, nb_features in nb_runs:
    evaluate_model(nb_label, nb_model, nb_features, y)


Simple (Gaussian) NB Evaluation:
70/30 Random Split Accuracy: 0.8056647141691999
70/30 Random Split F1 Score: 0.8060099178447191

70/30 Stratified Split Accuracy: 0.801215985764069
70/30 Stratified Split F1 Score: 0.8012454592631033

10-fold Random CV Accuracy: 0.8038748471017619
10-fold Random CV F1 Score: 0.8032933795347349

10-fold Stratified CV Accuracy: 0.8039194795324184
10-fold Stratified CV F1 Score: 0.803461185195079


Multinomial NB Evaluation:
70/30 Random Split Accuracy: 0.9330466375027805
70/30 Random Split F1 Score: 0.930490339465784

70/30 Stratified Split Accuracy: 0.9309705642470527
70/30 Stratified Split F1 Score: 0.9279355987305519

10-fold Random CV Accuracy: 0.9321335044078236
10-fold Random CV F1 Score: 0.9290709043807818

10-fold Stratified CV Accuracy: 0.9320001909990934
10-fold Stratified CV F1 Score: 0.9289421120803436


Bernoulli NB Evaluation:
70/30 Random Split Accuracy: 0.9508415511233039
70/30 Random Split F1 Score: 0.9496926929205555

70/30 Stratified S

In [None]:
# Neural Network using Backpropagation
from sklearn.neural_network import MLPClassifier

# Hyperparameter grid explored for the MLP: 3 layer layouts x 2 activations
# x 2 initial learning rates, all trained with the Adam solver.
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'learning_rate_init': [0.001, 0.01]
}
mlp = MLPClassifier(max_iter=500, random_state=42)
# 5-fold grid search scored on F1, using all available cores.
grid_nn = GridSearchCV(mlp, param_grid, cv=5, scoring='f1', n_jobs=-1)
# Tune on the PCA-reduced features: the selected network is evaluated on
# X_pca in the evaluation cell, so the search must see the same
# 2-dimensional representation rather than the 1000-dimensional TF-IDF
# matrix. Otherwise the chosen layer sizes / learning rate are optimised
# for a different input space than the one being reported.
grid_nn.fit(X_pca, y)
best_nn = grid_nn.best_estimator_
print("Best NN Parameters:", grid_nn.best_params_)

Best NN Parameters: {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.01, 'solver': 'adam'}


In [None]:
# Score the grid-search winner (best_nn) on the 2-component PCA features
# using the shared four-protocol evaluation routine.
evaluate_model(name="Neural Network", model=best_nn, X=X_pca, y=y)


Neural Network Evaluation:
70/30 Random Split Accuracy: 0.8122636613034774
70/30 Random Split F1 Score: 0.7970177970177971

70/30 Stratified Split Accuracy: 0.8151553347668125
70/30 Stratified Split F1 Score: 0.8013387520918002

10-fold Random CV Accuracy: 0.8162203258662254
10-fold Random CV F1 Score: 0.8014992644510281

10-fold Stratified CV Accuracy: 0.81690996619415
10-fold Stratified CV F1 Score: 0.8043560735597624


In [None]:
# Evaluate SVM Models
from sklearn.svm import SVC

# Run the same evaluation for each SVM kernel on the PCA-reduced features.
for svm_label, svm_kernel in (("SVM Linear", 'linear'), ("\nSVM RBF", 'rbf')):
    evaluate_model(svm_label, SVC(kernel=svm_kernel), X_pca, y)


SVM Linear Evaluation:
70/30 Random Split Accuracy: 0.8063320234299696
70/30 Random Split F1 Score: 0.7996932515337424

70/30 Stratified Split Accuracy: 0.804700823014755
70/30 Stratified Split F1 Score: 0.7974780870367523

10-fold Random CV Accuracy: 0.8074116109635459
10-fold Random CV F1 Score: 0.7994530437607297

10-fold Stratified CV Accuracy: 0.8073449443628548
10-fold Stratified CV F1 Score: 0.7994278520211573


SVM RBF Evaluation:
70/30 Random Split Accuracy: 0.8165640987617706
70/30 Random Split F1 Score: 0.804797222660565

70/30 Stratified Split Accuracy: 0.8155260621339068
70/30 Stratified Split F1 Score: 0.8031956968834045

10-fold Random CV Accuracy: 0.817754998634307
10-fold Random CV F1 Score: 0.805051174103502

10-fold Stratified CV Accuracy: 0.8175772457732791
10-fold Stratified CV F1 Score: 0.8048993757991235
