In [1]:
# Import libraries
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import nltk
from nltk.stem import PorterStemmer
import json
import joblib as joblib

import re
import string

In [2]:
# nltk.download('punkt')
# pip install unidecode
from unidecode import unidecode

In [3]:
# Read the training and validation CSV splits, in that order.
data_train, data_val = (
    pd.read_csv(split_path)
    for split_path in (
        '../../Data Splits/train_data_70_30.csv',
        '../../Data Splits/val_data_70_30.csv',
    )
)

In [4]:
# Split each frame into model inputs (the 'article' column) and targets (the 'label' column).
X_train, y_train = data_train['article'], data_train['label']
X_val, y_val = data_val['article'], data_val['label']

In [5]:
# Shared stemmer instance; TextPreprocessor.preprocess reads this module-level name.
ps = PorterStemmer()

# Stop-word collection used by TextPreprocessor.preprocess to drop stems.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's default locale encoding.
with open('../../Datasets/stopwords-tl.json', 'r', encoding='utf-8') as f:
    stopwords = json.load(f)

# Custom transformer for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that cleans, tokenizes, stems and filters raw text.

    ``transform`` turns every document into one string of space-joined
    tokens produced by ``preprocess``. The module-level ``ps`` (stemmer)
    and ``stopwords`` objects are read at call time.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return a list holding one cleaned, space-joined string per item of ``X``."""
        cleaned_docs = []
        for document in X:
            cleaned_docs.append(' '.join(self.preprocess(document)))
        return cleaned_docs

    def preprocess(self, text):
        """Return the list of stems of ``text`` that are not in ``stopwords``.

        The text is lowercased, passed through the regex substitutions below
        in order, tokenized with ``nltk.word_tokenize`` and stemmed with
        ``ps``. The stop-word check is applied to the stems, not to the
        original tokens.
        """
        # (pattern, replacement) pairs, applied strictly in this order.
        cleanup_rules = (
            # URLs starting with http(s):// or www.
            (r'https?://\S+|www\.\S+', ''),
            # Shortest square-bracketed spans.
            (r'\[.*?\]', ''),
            # NOTE(review): in a raw string this pattern matches a literal
            # backslash followed by "W", not the non-word class \W. Left as-is
            # so the output stays unchanged; confirm the intent before editing.
            (r"\\W", " "),
            # Shortest angle-bracketed spans (plus any extra '>' that follow).
            (r'<.*?>+', ''),
            # Every character listed in string.punctuation.
            (r'[%s]' % re.escape(string.punctuation), ''),
            # Newlines are deleted outright (not turned into spaces).
            (r'\n', ''),
            # Any run of word characters that contains a digit.
            (r'\w*\d\w*', ''),
        )

        cleaned = text.lower()
        for pattern, replacement in cleanup_rules:
            cleaned = re.sub(pattern, replacement, cleaned)

        kept_stems = []
        for token in nltk.word_tokenize(cleaned):
            stem = ps.stem(token)
            if stem not in stopwords:
                kept_stems.append(stem)
        return kept_stems

In [6]:
# Unfitted base learners built with library-default settings; only the SVM
# overrides a default, enabling probability estimates (probability=True).
#
# Alternative explicit hyper-parameter settings, currently disabled:
# mnb = MultinomialNB(alpha= 0.1, fit_prior=False)
# lr = LogisticRegression(C= 100, penalty= 'l2', solver= 'liblinear')
# rf = RandomForestClassifier(n_estimators= 300, random_state=42)
# knn = KNeighborsClassifier(metric= 'euclidean', n_neighbors= 5, weights= 'uniform')
# svm = SVC(C=10, degree=2, kernel = 'linear', probability = True)

# Each (label, estimator) row is split column-wise into the two parallel lists.
base_names, base_models = (
    list(column)
    for column in zip(
        ('MNB', MultinomialNB()),
        ('LR', LogisticRegression()),
        ('RF', RandomForestClassifier()),
        ('KNN', KNeighborsClassifier()),
        ('SVM', SVC(probability=True)),
    )
)

# Individual handles to the same estimator objects held in base_models.
mnb, lr, rf, knn, svm = base_models

In [7]:
# Load the four persisted stacking pipelines (RF, LR, SVM, MLP — in that order).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artefacts that come from a trusted source.
stack_rf, stack_lr, stack_svm, stack_mlp = (
    joblib.load(artifact_path)
    for artifact_path in (
        '../../ModelsV2/pipeline_rf_964.joblib',
        '../../ModelsV2/pipeline_lr.joblib',
        '../../ModelsV2/pipeline_svm_lendon.joblib',
        '../../ModelsV2/pipeline_mlp.joblib',
    )
)

# (pipeline, display name) pairs, in the order they appear in the result tables.
models = list(zip(
    (stack_lr, stack_rf, stack_svm, stack_mlp),
    (
        'Logistic Regression',
        'Random Forest',
        'Support Vector Machine',
        'Multi-layer Perceptron',
    ),
))



In [8]:
# Score the fitted base learners and every stacked pipeline on the TRAINING split.
base_names = ['MNB', 'LR', 'RF', 'KNN', 'SVM']
metrics_df = pd.DataFrame(index=base_names, columns=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC'])


def get_performance_stats(model, name, X=X_train, y=y_train):
    """Evaluate ``model`` on ``(X, y)`` and store the scores in row ``name`` of ``metrics_df``.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict`` and ``predict_proba``.
    name : row label that is created or overwritten in the global ``metrics_df``.
    X : inputs passed to ``model`` unchanged. Defaults to ``X_train``.
    y : true labels aligned with ``X``. Defaults to ``y_train``.

    The stored row is [accuracy, precision, recall, F1, ROC AUC], matching the
    column order of ``metrics_df``. Nothing is returned.
    """
    y_pred = model.predict(X)
    # Column 1 of predict_proba is used as the positive-class score for the ROC AUC.
    y_prob = model.predict_proba(X)[:, 1]
    metrics_df.loc[name] = [
        accuracy_score(y, y_pred),
        precision_score(y, y_pred),
        recall_score(y, y_pred),
        f1_score(y, y_pred),
        roc_auc_score(y, y_prob),
    ]


# The base learners stored inside the stack are scored on the output of the
# pipeline's 'vectorizer' step, so the raw articles are transformed once here.
vectorized_text = stack_rf.named_steps['vectorizer'].transform(X_train)
fitted_base_models = stack_rf.named_steps['stacking'].estimators_

# Labels are attached purely by position. zip() stops at the shorter input, so a
# length mismatch would silently skip models or leave empty rows; fail loudly instead.
# NOTE(review): assumes estimators_ is ordered MNB, LR, RF, KNN, SVM — confirm
# against the code that built the stack.
if len(fitted_base_models) != len(base_names):
    raise ValueError(
        f"Expected {len(base_names)} base estimators for labels {base_names}, "
        f"found {len(fitted_base_models)}."
    )

# Evaluate base models
for model, name in zip(fitted_base_models, base_names):
    get_performance_stats(model, name, X=vectorized_text)

# Blank row that acts as a visual separator/heading in the printed table.
metrics_df.loc['Stacking Model'] = ["", "", "", "", ""]

# Evaluate stacking models (full pipelines, fed the raw articles)
for model, name in models:
    get_performance_stats(model, name)

# metrics_df is rebuilt by the later evaluation cells; keep a split-specific copy.
train_metrics_df = metrics_df.copy()

# Display the comparison table
print("Comparison of Models: (TRAINING)")
print(metrics_df)

Comparison of Models: (TRAINING)
                        Accuracy Precision    Recall  F1 Score       AUC
MNB                      0.92019   0.86685  0.993684  0.925944  0.995083
LR                      0.952431  0.986425  0.917895  0.950927  0.996231
RF                      0.985201       1.0  0.970526  0.985043   0.99965
KNN                     0.901163  0.855545  0.966316  0.907563  0.977569
SVM                     0.996829       1.0  0.993684  0.996832       1.0
Stacking Model                                                          
Logistic Regression          1.0       1.0       1.0       1.0       1.0
Random Forest                1.0       1.0       1.0       1.0       1.0
Support Vector Machine       1.0       1.0       1.0       1.0       1.0
Multi-layer Perceptron       1.0       1.0       1.0       1.0       1.0


In [9]:

# Held-out TEST split (read from test_data.csv).
test_data = pd.read_csv('../../Data Splits/test_data.csv')
X_test = test_data['article']
y_test = test_data['label']

# Backward-compatible aliases: the original names say "training" although the
# data is the test split. They are kept so cells that still reference them run.
training = test_data
training_x = X_test
training_y = y_test


In [10]:
# Score the fitted base learners and every stacked pipeline on the TEST split.
# Despite their names, training_x / training_y hold the rows of test_data.csv.
base_names = ['MNB', 'LR', 'RF', 'KNN', 'SVM']
metrics_df = pd.DataFrame(index=base_names, columns=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC'])


def get_performance_stats(model, name, X=training_x, y=training_y):
    """Evaluate ``model`` on ``(X, y)`` and store the scores in row ``name`` of ``metrics_df``.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict`` and ``predict_proba``.
    name : row label that is created or overwritten in the global ``metrics_df``.
    X : inputs passed to ``model`` unchanged. Defaults to ``training_x``
        (the test-split articles).
    y : true labels aligned with ``X``. Defaults to ``training_y``
        (the test-split labels).

    The stored row is [accuracy, precision, recall, F1, ROC AUC], matching the
    column order of ``metrics_df``. Nothing is returned.
    """
    y_pred = model.predict(X)
    # Column 1 of predict_proba is used as the positive-class score for the ROC AUC.
    y_prob = model.predict_proba(X)[:, 1]
    metrics_df.loc[name] = [
        accuracy_score(y, y_pred),
        precision_score(y, y_pred),
        recall_score(y, y_pred),
        f1_score(y, y_pred),
        roc_auc_score(y, y_prob),
    ]


# The base learners stored inside the stack are scored on the output of the
# pipeline's 'vectorizer' step, so the raw articles are transformed once here.
vectorized_text = stack_rf.named_steps['vectorizer'].transform(training_x)
fitted_base_models = stack_rf.named_steps['stacking'].estimators_

# Labels are attached purely by position. zip() stops at the shorter input, so a
# length mismatch would silently skip models or leave empty rows; fail loudly instead.
# NOTE(review): assumes estimators_ is ordered MNB, LR, RF, KNN, SVM — confirm
# against the code that built the stack.
if len(fitted_base_models) != len(base_names):
    raise ValueError(
        f"Expected {len(base_names)} base estimators for labels {base_names}, "
        f"found {len(fitted_base_models)}."
    )

# Evaluate base models
for model, name in zip(fitted_base_models, base_names):
    get_performance_stats(model, name, X=vectorized_text)

# Blank row that acts as a visual separator/heading in the printed table.
metrics_df.loc['Stacking Model'] = ["", "", "", "", ""]

# Evaluate stacking models (full pipelines, fed the raw articles)
for model, name in models:
    get_performance_stats(model, name)

# metrics_df is rebuilt by the other evaluation cells; keep a split-specific copy.
test_metrics_df = metrics_df.copy()

# Display the comparison table
print("Comparison of Models: (TESTING)")
print(metrics_df)

Comparison of Models: (TESTING)
                        Accuracy Precision    Recall  F1 Score       AUC
MNB                     0.813953   0.73399  0.986755  0.841808  0.959647
LR                      0.877076  0.945312  0.801325  0.867384  0.968433
RF                      0.893688  0.947368  0.834437  0.887324  0.970044
KNN                     0.827243  0.779661  0.913907  0.841463  0.922097
SVM                     0.893688     0.976  0.807947  0.884058   0.97351
Stacking Model                                                          
Logistic Regression     0.940199  0.946309  0.933775      0.94   0.98287
Random Forest           0.943522  0.935065  0.953642  0.944262  0.980795
Support Vector Machine  0.946844   0.94702   0.94702   0.94702  0.983841
Multi-layer Perceptron  0.940199  0.946309  0.933775      0.94  0.982296


In [11]:
# Score the fitted base learners and every stacked pipeline on the VALIDATION split.
base_names = ['MNB', 'LR', 'RF', 'KNN', 'SVM']
metrics_df = pd.DataFrame(index=base_names, columns=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC'])


def get_performance_stats(model, name, X=X_val, y=y_val):
    """Evaluate ``model`` on ``(X, y)`` and store the scores in row ``name`` of ``metrics_df``.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict`` and ``predict_proba``.
    name : row label that is created or overwritten in the global ``metrics_df``.
    X : inputs passed to ``model`` unchanged. Defaults to ``X_val``.
    y : true labels aligned with ``X``. Defaults to ``y_val``.

    The stored row is [accuracy, precision, recall, F1, ROC AUC], matching the
    column order of ``metrics_df``. Nothing is returned.
    """
    y_pred = model.predict(X)
    # Column 1 of predict_proba is used as the positive-class score for the ROC AUC.
    y_prob = model.predict_proba(X)[:, 1]
    metrics_df.loc[name] = [
        accuracy_score(y, y_pred),
        precision_score(y, y_pred),
        recall_score(y, y_pred),
        f1_score(y, y_pred),
        roc_auc_score(y, y_prob),
    ]


# The base learners stored inside the stack are scored on the output of the
# pipeline's 'vectorizer' step, so the raw articles are transformed once here.
vectorized_text = stack_rf.named_steps['vectorizer'].transform(X_val)
fitted_base_models = stack_rf.named_steps['stacking'].estimators_

# Labels are attached purely by position. zip() stops at the shorter input, so a
# length mismatch would silently skip models or leave empty rows; fail loudly instead.
# NOTE(review): assumes estimators_ is ordered MNB, LR, RF, KNN, SVM — confirm
# against the code that built the stack.
if len(fitted_base_models) != len(base_names):
    raise ValueError(
        f"Expected {len(base_names)} base estimators for labels {base_names}, "
        f"found {len(fitted_base_models)}."
    )

# Evaluate base models
for model, name in zip(fitted_base_models, base_names):
    get_performance_stats(model, name, X=vectorized_text)

# Blank row that acts as a visual separator/heading in the printed table.
metrics_df.loc['Stacking Model'] = ["", "", "", "", ""]

# Evaluate stacking models (full pipelines, fed the raw articles)
for model, name in models:
    get_performance_stats(model, name)

# metrics_df is rebuilt by the other evaluation cells; keep a split-specific copy.
val_metrics_df = metrics_df.copy()

# Display the comparison table
print("Comparison of Models: (VALIDATION)")
print(metrics_df)

Comparison of Models: (VALIDATION)
                        Accuracy Precision    Recall  F1 Score       AUC
MNB                     0.820197  0.741697  0.985294  0.846316  0.975102
LR                      0.917488   0.96206  0.870098  0.913771  0.979312
RF                      0.919951  0.957333  0.879902  0.916986  0.975396
KNN                     0.862069  0.816239  0.936275  0.872146  0.934922
SVM                     0.919951  0.967302  0.870098  0.916129  0.983686
Stacking Model                                                          
Logistic Regression     0.955665  0.958128  0.953431  0.955774  0.993272
Random Forest           0.964286  0.961071  0.968137  0.964591  0.994437
Support Vector Machine  0.950739  0.953202  0.948529   0.95086  0.991294
Multi-layer Perceptron  0.949507   0.95086  0.948529  0.949693  0.992501
