In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
import json
import nltk
import unicodedata
import re
import string
import joblib as joblib
import pickle as pickle
from sklearn.model_selection import cross_val_score
# nltk.download('punkt')
# pip install unidecode
from unidecode import unidecode

In [2]:
# Load data
data_train = pd.read_csv('../Dataset/train_data.csv')
data_test = pd.read_csv('../Dataset/test_data.csv')

In [3]:
X_train = data_train['article']
y_train = data_train['label']

X_test = data_test['article']
y_test = data_test['label']

In [4]:
# Text preprocessing
def wordopt(text):
    text = text.lower()
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

ps = PorterStemmer()
with open('../Resources/stopwords-tl.json', 'r') as f:
    stopwords = json.load(f)

def preprocess(text):
    text = wordopt(text)
    tokens = nltk.word_tokenize(text)
    stems = [ps.stem(token) for token in tokens]
    filtered = [stem for stem in stems if stem not in stopwords]
    return filtered

In [5]:
# Apply preprocess function to X_train
X_train = X_train.apply(preprocess)

# Apply preprocess function to X_test
X_test = X_test.apply(preprocess)

In [6]:
# Convert the preprocessed text into a list of strings
X_train_processed = [' '.join(tokens) for tokens in X_train]
X_test_processed = [' '.join(tokens) for tokens in X_test]

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_processed)

# Transform the validation data using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test_processed)

# Print the shape of the TF-IDF matrices (No of rows, No of columns)
print("Shape of X_train_tfidf:", X_train_tfidf.shape)
print("Shape of X_test_tfidf:", X_test_tfidf.shape)

Shape of X_train_tfidf: (2103, 27707)
Shape of X_test_tfidf: (902, 27707)


In [7]:
mnb = MultinomialNB()
lr = LogisticRegression()
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
svm = SVC(probability=True)

base_models = [mnb, lr, rf, knn, svm]
base_names = ['MNB', 'LR', 'RF', 'KNN', 'SVM']

In [8]:
stack_rf = joblib.load('../Models/Stack/stack_rf_5_final_953.joblib')
stack_lr = joblib.load('../Models/Stack/stack_lr.joblib')
stack_svm = joblib.load('../Models/Stack/stack_svm.joblib')
stack_mlp = joblib.load('../Models/Stack/stack_mlp_2_958.joblib')

models = [
    (stack_lr, 'Logistic Regression'),
    (stack_rf, 'Random Forest'),
    (stack_svm, 'Support Vector Machine'),
    (stack_mlp, 'Multi-layer Perceptron'),
]

In [9]:
metrics_df = pd.DataFrame(columns=['Mean Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'])

# Initialize lists to store ROC curve data
plt.figure()

print("TRAINING BASE MODEL ACCURACY")
for model, name in zip(base_models, base_names):
    model.fit(X_train_tfidf, y_train)
    scores = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
    scores_precision = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='precision')
    scores_recall = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='recall')
    scores_f1 = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='f1')
    scores_roc_auc = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='roc_auc') 
    mean_accuracy = scores.mean()
    mean_scores_precision = scores_precision.mean()
    mean_scores_recall = scores_recall.mean()
    mean_scores_f1 = scores_f1.mean()
    mean_scores_roc_auc = scores_roc_auc.mean()

    metrics_df.loc[name] = [
        mean_accuracy, 
        mean_scores_precision,
        mean_scores_recall,
        mean_scores_f1,
        mean_scores_roc_auc,
    ]
print(metrics_df)


TRAINING BASE MODEL ACCURACY
     Mean Accuracy  Precision    Recall  F1 Score   ROC AUC
MNB       0.831201   0.754853  0.983913  0.854194  0.974528
LR        0.926298   0.964065  0.886377  0.923537  0.983689
RF        0.927250   0.937265  0.910994  0.935669  0.979342
KNN       0.857829   0.806495  0.943199  0.869470  0.942442
SVM       0.935808   0.981206  0.889207  0.932936  0.988276


<Figure size 640x480 with 0 Axes>

In [21]:
for model in base_models:
    model.fit(X_train_tfidf, y_train)

In [22]:
basemodel_mnb = joblib.load('../Models/Base/basemodel_MNB.joblib')
basemodel_lr = joblib.load('../Models/Base/basemodel_LR.joblib')
basemodel_rf = joblib.load('../Models/Base/basemodel_RF.joblib')
basemodel_knn = joblib.load('../Models/Base/basemodel_KNN.joblib')
basemodel_svm = joblib.load('../Models/Base/basemodel_SVM.joblib')

base_models_nogrid = [
    basemodel_mnb, 
    basemodel_lr,
    basemodel_rf,
    basemodel_knn,
    basemodel_svm
]

base_names_nogrid = ['MNB', 'LR', 'RF', 'KNN', 'SVM']

print("TRAINING STACK MODEL ACCURACY")

stack = StackingClassifier(estimators=list(zip(base_names, base_models)), final_estimator=stack_lr)
stack.fit(X_train_tfidf, y_train)

scores = cross_val_score(stack_lr, X_train_tfidf, y_train, cv=5, scoring='accuracy')
scores_precision = cross_val_score(stack_lr, X_train_tfidf, y_train, cv=5, scoring='precision')
scores_recall = cross_val_score(stack_lr, X_train_tfidf, y_train, cv=5, scoring='recall')
scores_f1 = cross_val_score(stack_lr, X_train_tfidf, y_train, cv=5, scoring='f1')
scores_roc_auc = cross_val_score(stack_lr, X_train_tfidf, y_train, cv=5, scoring='roc_auc') 
mean_accuracy = scores.mean()
mean_scores_precision = scores_precision.mean()
mean_scores_recall = scores_recall.mean()
mean_scores_f1 = scores_f1.mean()
mean_scores_roc_auc = scores_roc_auc.mean()

metrics_df.loc[name] = [
    mean_accuracy, 
    mean_scores_precision,
    mean_scores_recall,
    mean_scores_f1,
    mean_scores_roc_auc,
]
print(metrics_df)

TRAINING STACK MODEL ACCURACY


NotFittedError: This MultinomialNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
print("TRAINING STACK MODEL ACCURACY")
for model, name in models:
    stack_rf = StackingClassifier(estimators=list(zip(base_names, base_models)), final_estimator=stack_rf, cv='prefit')
    stack_rf.fit(X_train_tfidf, y_train)

    scores = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
    scores_precision = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='precision')
    scores_recall = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='recall')
    scores_f1 = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='f1')
    scores_roc_auc = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='roc_auc') 
    mean_accuracy = scores.mean()
    mean_scores_precision = scores_precision.mean()
    mean_scores_recall = scores_recall.mean()
    mean_scores_f1 = scores_f1.mean()
    mean_scores_roc_auc = scores_roc_auc.mean()

    metrics_df.loc[name] = [
        mean_accuracy, 
        mean_scores_precision,
        mean_scores_recall,
        mean_scores_f1,
        mean_scores_roc_auc,
    ]
print(metrics_df)