# Natural Language Processing Final Assignment

## Real - Fake News Classification: A Comparison of Natural Language Models for Classification

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
from sklearn.pipeline import make_pipeline
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier




In [2]:
# Load Data
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV was written with its index included - consider passing
# index_col=0 here; confirm against the file before changing.

df_raw = pd.read_csv("data/news_dataset.csv")

In [3]:
# Preview the first five rows of the raw data.
df_raw.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [4]:
# Combine the title and the body text into a single "feature" column so that
# each article is handled as one document downstream.

df = df_raw.copy()

# Fill missing values per column *before* concatenating: str + NaN is NaN, so
# filling afterwards would blank the whole feature whenever only one of the
# two fields is missing and silently discard the field that was present.
df["feature"] = df["title"].fillna('') + ' ' + df["text"].fillna('')

In [5]:
# Preview the frame again to check the new "feature" column.
df.head()

Unnamed: 0,title,text,subject,date,label,feature
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0,Pope Francis Just Called Out Donald Trump Dur...


### Preprocessing Function

In [6]:
# Fetch the NLTK resources used for tokenisation, stop-word removal and
# lemmatisation.
nltk.download('punkt')
# NOTE(review): recent NLTK releases load the tokenizer from 'punkt_tab'
# instead of 'punkt'; fetching both keeps the notebook runnable on a fresh
# environment regardless of the installed NLTK version - confirm against the
# pinned version.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/jakobhren/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jakobhren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jakobhren/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
def preprocess_text(text):
    """
    Clean a pandas Series of raw documents so they can be vectorised.

    Steps, in order: lowercase; remove URLs, e-mail addresses and @handles;
    strip ASCII punctuation and digits; collapse whitespace; tokenise with
    NLTK; drop English stop words; lemmatise; join the tokens back with spaces.

    Parameters:
    text : pd.Series
        Series of strings, one document per row. Missing values are treated
        as empty documents.
    Returns:
    pd.Series of cleaned, space-joined token strings with the same index.
    """
    # NaN would otherwise survive the .str calls and crash the tokeniser.
    text = text.fillna('').str.lower()

    # Remove links, e-mail addresses and Twitter handles. E-mails are removed
    # before handles so that "user@host" disappears as a whole instead of
    # leaving the part in front of the "@" behind.
    # ("http\S+" already covers "https...", so no separate alternative is needed.)
    text = text.str.replace(r'http\S+|www\S+', '', regex=True)
    text = text.str.replace(r'\S+@\S+', '', regex=True)
    text = text.str.replace(r'@\w+', '', regex=True)

    # string.punctuation contains regex metacharacters (\ ] ^ -). Interpolated
    # unescaped, its "\]" pair is read as an escaped bracket, so backslashes
    # were never stripped; re.escape makes every character a literal member
    # of the character class.
    text = text.str.replace(f'[{re.escape(string.punctuation)}]', '', regex=True)

    text = text.str.replace(r'\d+', '', regex=True)

    text = text.str.replace(r'\s+', ' ', regex=True).str.strip()

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean_tokens(document):
        # Tokenise, drop stop words, lemmatise what is left, and re-join.
        return ' '.join(
            lemmatizer.lemmatize(token)
            for token in nltk.word_tokenize(document)
            if token not in stop_words
        )

    return text.apply(_clean_tokens)


In [8]:
# Replace the raw combined text with its cleaned version.
# NOTE(review): this overwrites the column in place, so the raw combined text
# is no longer available in `df` after this cell runs.
df['feature'] = preprocess_text(df['feature'])

### Train - Test Split

In [9]:
# Model input is the "feature" text column; the target is the "label" column.
X, y = df["feature"], df["label"]

In [10]:
# Stratified 70 / 15 / 15 split: hold out 30% first, then cut that hold-out
# in half to obtain the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=7
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=7
)

# Discard the original row labels so every split is indexed 0..n-1.
X_train, X_val, X_test, y_train, y_val, y_test = (
    part.reset_index(drop=True)
    for part in (X_train, X_val, X_test, y_train, y_val, y_test)
)

In [11]:
# Sanity-check the sizes of the train, validation and test splits.
print(X_train.shape, X_val.shape, X_test.shape, sep="\n")

(27110,)
(5809,)
(5810,)


## Model Fitting

### Define functions for efficient model evaluation and fitting

In [12]:
def evaluate_models(y_true, y_pred_dict, label=None):
    """
    Print a classification report per model and tabulate summary metrics.

    Parameters:
    y_true : array-like
        True labels.
    y_pred_dict : dict
        Dictionary where keys are model names and values are predicted labels.
    label : str, optional
        Description of the evaluation set (e.g., "Validation Set"); only used
        in the printed header.
    Returns:
    pd.DataFrame with one row per model and the columns
    Model, Accuracy, Precision, Recall, F1-score.

    When y_true holds exactly two distinct labels, precision / recall / F1 are
    computed for the second label in sorted order (np.unique sorts ascending);
    otherwise the class-weighted average is reported.
    """
    # The averaging strategy depends only on y_true, so decide it once instead
    # of recomputing np.unique for every model in the loop.
    unique_labels = np.unique(y_true)
    if len(unique_labels) == 2:
        score_kwargs = {'average': 'binary', 'pos_label': unique_labels[1]}
    else:
        score_kwargs = {'average': 'weighted'}

    rows = []
    for model_name, y_pred in y_pred_dict.items():
        print(f"\n===== {model_name}: {label if label else ''} =====")
        print("Classification Report:")
        print(classification_report(y_true, y_pred))

        rows.append({
            'Model': model_name,
            'Accuracy': accuracy_score(y_true, y_pred),
            'Precision': precision_score(y_true, y_pred, **score_kwargs),
            'Recall': recall_score(y_true, y_pred, **score_kwargs),
            'F1-score': f1_score(y_true, y_pred, **score_kwargs),
        })

    # Explicit column list keeps the layout stable even when no models are given.
    return pd.DataFrame(
        rows,
        columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
    )


In [None]:
def fit_pipeline_predict_evaluate(
    model_class,
    model_name,
    X_train,
    y_train,
    X_val,
    y_val,
    vectorizer_type='tfidf',
    ngram_range=(1,1)
):
    """
    Fit a vectorizer + classifier pipeline and report train / validation metrics.

    Parameters:
    model_class : estimator instance
        Unfitted scikit-learn classifier. Despite the parameter name this is
        an instance, not a class. A clone of it is fitted, so the object
        passed by the caller is left unmodified.
    model_name : str
        Display name used in the printed output and in the "Model" column.
    X_train, X_val : iterable of str
        Text documents for training and validation.
    y_train, y_val : array-like
        Labels matching X_train and X_val.
    vectorizer_type : {'tfidf', 'bow'}
        'tfidf' uses TfidfVectorizer, 'bow' uses CountVectorizer.
    ngram_range : tuple (min_n, max_n)
        Passed through to the vectorizer.
    Returns:
    pd.DataFrame with the evaluate_models row for the training set followed by
    the row for the validation set, plus a "Dataset" column
    ("Train" / "Validation").
    Raises:
    ValueError if vectorizer_type is neither 'tfidf' nor 'bow'.
    """
    # Both vectorizers apply scikit-learn's built-in English stop-word list.
    if vectorizer_type == 'tfidf':
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=ngram_range)
    elif vectorizer_type == 'bow':
        vectorizer = CountVectorizer(stop_words='english', ngram_range=ngram_range)
    else:
        raise ValueError("vectorizer_type must be either 'tfidf' or 'bow'")

    # Pipeline.fit trains its steps in place; cloning keeps the caller's
    # estimator unfitted so it can safely be reused across experiments.
    pipeline = make_pipeline(vectorizer, clone(model_class))

    print(f"Fitting {model_name}...")
    pipeline.fit(X_train, y_train)

    print("Predicting on training set...")
    y_pred_train = pipeline.predict(X_train)

    print("Predicting on validation set...")
    y_pred_val = pipeline.predict(X_val)

    print(f"Evaluating {model_name} on training set...")
    train_metrics = evaluate_models(y_train, {model_name: y_pred_train}, label="Train Set")

    print(f"Evaluating {model_name} on validation set...")
    val_metrics = evaluate_models(y_val, {model_name: y_pred_val}, label="Validation Set")

    train_metrics["Dataset"] = "Train"
    val_metrics["Dataset"] = "Validation"

    return pd.concat([train_metrics, val_metrics], ignore_index=True)

### Bag of Words + Unigrams Vectorizer

In [None]:
# Logistic regression on bag-of-words counts, unigrams only.
logistic_bow_uni_metrics = fit_pipeline_predict_evaluate(
    LogisticRegression(max_iter=1000),
    "Logistic Regression (BoW)",
    X_train, y_train,
    X_val, y_val,
    vectorizer_type="bow",
    ngram_range=(1, 1),
)
print(logistic_bow_uni_metrics)

Fitting Logistic Regression (BoW)...
Predicting on training set...
Predicting on validation set...
Evaluating Logistic Regression (BoW) on training set...

===== Logistic Regression (BoW): Train Set =====
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12532
           1       1.00      1.00      1.00     14578

    accuracy                           1.00     27110
   macro avg       1.00      1.00      1.00     27110
weighted avg       1.00      1.00      1.00     27110

Evaluating Logistic Regression (BoW) on validation set...

===== Logistic Regression (BoW): Validation Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2685
           1       0.99      0.99      0.99      3124

    accuracy                           0.99      5809
   macro avg       0.99      0.99      0.99      5809
weighted avg       0.99      0.99    

In [24]:
# Multinomial Naive Bayes on bag-of-words counts, unigrams only.
bow_uni_nb_metrics = fit_pipeline_predict_evaluate(
    MultinomialNB(),
    "Multinomial Naive Bayes (BoW Unigrams)",
    X_train, y_train,
    X_val, y_val,
    vectorizer_type="bow",
    ngram_range=(1, 1),
)
print(bow_uni_nb_metrics)

Fitting Multinomial Naive Bayes (BoW Unigrams)...
Predicting on training set...
Predicting on validation set...
Evaluating Multinomial Naive Bayes (BoW Unigrams) on training set...

===== Multinomial Naive Bayes (BoW Unigrams): Train Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     12532
           1       0.97      0.97      0.97     14578

    accuracy                           0.97     27110
   macro avg       0.97      0.97      0.97     27110
weighted avg       0.97      0.97      0.97     27110

Evaluating Multinomial Naive Bayes (BoW Unigrams) on validation set...

===== Multinomial Naive Bayes (BoW Unigrams): Validation Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      2685
           1       0.96      0.95      0.95      3124

    accuracy                           0.95      5809
   macro avg       0.95   

In [25]:
# Random forest (100 trees, fixed seed) on bag-of-words counts, unigrams only.
bow_uni_rf_metrics = fit_pipeline_predict_evaluate(
    RandomForestClassifier(n_estimators=100, random_state=7),
    "Random Forest (BoW Unigrams)",
    X_train, y_train,
    X_val, y_val,
    vectorizer_type="bow",
    ngram_range=(1, 1),
)
print(bow_uni_rf_metrics)

Fitting Random Forest (BoW Unigrams)...
Predicting on training set...
Predicting on validation set...
Evaluating Random Forest (BoW Unigrams) on training set...

===== Random Forest (BoW Unigrams): Train Set =====
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12532
           1       1.00      1.00      1.00     14578

    accuracy                           1.00     27110
   macro avg       1.00      1.00      1.00     27110
weighted avg       1.00      1.00      1.00     27110

Evaluating Random Forest (BoW Unigrams) on validation set...

===== Random Forest (BoW Unigrams): Validation Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      2685
           1       0.97      0.99      0.98      3124

    accuracy                           0.98      5809
   macro avg       0.98      0.98      0.98      5809
weighted avg       0.9

### Bag of Words + Bigrams Vectorizer:

In [22]:
# Logistic regression on bag-of-words counts; ngram_range=(1, 2) keeps the
# unigrams and adds bigrams.
logistic_bow_bigrams_metrics = fit_pipeline_predict_evaluate(
    LogisticRegression(max_iter=1000),
    "Logistic Regression (BoW + Bigrams)",
    X_train, y_train,
    X_val, y_val,
    vectorizer_type="bow",
    ngram_range=(1, 2),
)
print(logistic_bow_bigrams_metrics)

Fitting Logistic Regression (BoW + Bigrams)...
Predicting on training set...
Predicting on validation set...
Evaluating Logistic Regression (BoW + Bigrams) on training set...

===== Logistic Regression (BoW + Bigrams): Train Set =====
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12532
           1       1.00      1.00      1.00     14578

    accuracy                           1.00     27110
   macro avg       1.00      1.00      1.00     27110
weighted avg       1.00      1.00      1.00     27110

Evaluating Logistic Regression (BoW + Bigrams) on validation set...

===== Logistic Regression (BoW + Bigrams): Validation Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2685
           1       1.00      0.99      1.00      3124

    accuracy                           0.99      5809
   macro avg       0.99      0.99      0.

In [26]:
# Multinomial Naive Bayes on bag-of-words counts; ngram_range=(1, 2) keeps the
# unigrams and adds bigrams.
bow_bi_nb_metrics = fit_pipeline_predict_evaluate(
    MultinomialNB(),
    "Multinomial Naive Bayes (BoW Bigrams)",
    X_train, y_train,
    X_val, y_val,
    vectorizer_type="bow",
    ngram_range=(1, 2),
)
print(bow_bi_nb_metrics)

Fitting Multinomial Naive Bayes (BoW Bigrams)...
Predicting on training set...
Predicting on validation set...
Evaluating Multinomial Naive Bayes (BoW Bigrams) on training set...

===== Multinomial Naive Bayes (BoW Bigrams): Train Set =====
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     12532
           1       0.99      1.00      1.00     14578

    accuracy                           1.00     27110
   macro avg       1.00      0.99      1.00     27110
weighted avg       1.00      1.00      1.00     27110

Evaluating Multinomial Naive Bayes (BoW Bigrams) on validation set...

===== Multinomial Naive Bayes (BoW Bigrams): Validation Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96      2685
           1       0.95      0.98      0.96      3124

    accuracy                           0.96      5809
   macro avg       0.96      0.

In [27]:
# Random forest (100 trees, fixed seed) on bag-of-words counts;
# ngram_range=(1, 2) keeps the unigrams and adds bigrams.
bow_bi_rf_metrics = fit_pipeline_predict_evaluate(
    RandomForestClassifier(n_estimators=100, random_state=7),
    "Random Forest (BoW Bigrams)",
    X_train, y_train,
    X_val, y_val,
    vectorizer_type="bow",
    ngram_range=(1, 2),
)
print(bow_bi_rf_metrics)

Fitting Random Forest (BoW Bigrams)...
Predicting on training set...
Predicting on validation set...
Evaluating Random Forest (BoW Bigrams) on training set...

===== Random Forest (BoW Bigrams): Train Set =====
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12532
           1       1.00      1.00      1.00     14578

    accuracy                           1.00     27110
   macro avg       1.00      1.00      1.00     27110
weighted avg       1.00      1.00      1.00     27110

Evaluating Random Forest (BoW Bigrams) on validation set...

===== Random Forest (BoW Bigrams): Validation Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      2685
           1       0.96      0.99      0.97      3124

    accuracy                           0.97      5809
   macro avg       0.97      0.97      0.97      5809
weighted avg       0.97    

### TF-IDF Vectorizer:

In [None]:
# Logistic regression on TF-IDF weighted unigrams.
logistic_tfidf_metrics = fit_pipeline_predict_evaluate(
    LogisticRegression(max_iter=1000),
    "Logistic Regression",
    X_train, y_train,
    X_val, y_val,
    vectorizer_type="tfidf",
    ngram_range=(1, 1),
)
print(logistic_tfidf_metrics)

Fitting Logistic Regression...
Predicting on training set...
Predicting on validation set...
Evaluating Logistic Regression on training set...

===== Logistic Regression: Train Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     12532
           1       0.99      0.99      0.99     14578

    accuracy                           0.99     27110
   macro avg       0.99      0.99      0.99     27110
weighted avg       0.99      0.99      0.99     27110

Evaluating Logistic Regression on validation set...

===== Logistic Regression: Validation Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2685
           1       0.98      0.99      0.98      3124

    accuracy                           0.98      5809
   macro avg       0.98      0.98      0.98      5809
weighted avg       0.98      0.98      0.98      5809

            

In [28]:
# Multinomial Naive Bayes on TF-IDF weighted unigrams.
tfidf_uni_nb_metrics = fit_pipeline_predict_evaluate(
    MultinomialNB(),
    "Multinomial Naive Bayes (TF-IDF Unigrams)",
    X_train, y_train,
    X_val, y_val,
    vectorizer_type="tfidf",
    ngram_range=(1, 1),
)
print(tfidf_uni_nb_metrics)

Fitting Multinomial Naive Bayes (TF-IDF Unigrams)...
Predicting on training set...
Predicting on validation set...
Evaluating Multinomial Naive Bayes (TF-IDF Unigrams) on training set...

===== Multinomial Naive Bayes (TF-IDF Unigrams): Train Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.90      0.93     12532
           1       0.92      0.98      0.95     14578

    accuracy                           0.94     27110
   macro avg       0.95      0.94      0.94     27110
weighted avg       0.94      0.94      0.94     27110

Evaluating Multinomial Naive Bayes (TF-IDF Unigrams) on validation set...

===== Multinomial Naive Bayes (TF-IDF Unigrams): Validation Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.88      0.92      2685
           1       0.90      0.97      0.93      3124

    accuracy                           0.93      5809
   macro av

In [29]:
# Random forest (100 trees, fixed seed) on TF-IDF weighted unigrams.
tfidf_uni_rf_metrics = fit_pipeline_predict_evaluate(
    RandomForestClassifier(n_estimators=100, random_state=7),
    "Random Forest (TF-IDF Unigrams)",
    X_train, y_train,
    X_val, y_val,
    vectorizer_type="tfidf",
    ngram_range=(1, 1),
)
print(tfidf_uni_rf_metrics)

Fitting Random Forest (TF-IDF Unigrams)...
Predicting on training set...
Predicting on validation set...
Evaluating Random Forest (TF-IDF Unigrams) on training set...

===== Random Forest (TF-IDF Unigrams): Train Set =====
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12532
           1       1.00      1.00      1.00     14578

    accuracy                           1.00     27110
   macro avg       1.00      1.00      1.00     27110
weighted avg       1.00      1.00      1.00     27110

Evaluating Random Forest (TF-IDF Unigrams) on validation set...

===== Random Forest (TF-IDF Unigrams): Validation Set =====
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      2685
           1       0.98      0.99      0.98      3124

    accuracy                           0.98      5809
   macro avg       0.98      0.98      0.98      5809
weighte