In [14]:
# Import libraries
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier


from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import nltk
from nltk.stem import PorterStemmer
import json
import joblib as joblib

import re
import string

In [15]:
# Load data
data_train = pd.read_csv('../../Data Splits/train_data_70_30.csv')
data_test = pd.read_csv('../../Data Splits/test_data_70_30.csv')

In [16]:
X_train = data_train['article']
y_train = data_train['label']

X_test = data_test['article']
y_test = data_test['label']

In [17]:
# Stopword Removal
ps = PorterStemmer()
with open('../../Datasets/stopwords-tl.json', 'r') as f:
    stopwords = json.load(f)
    
def preprocess(text):
    if isinstance(text, str):  # If it's a single string
        text = [text]  # Convert to a list of one string for consistency

    processed_texts = []
    for item in text:
        # Lowercase Conversion
        lowered = item.lower()

        # URL Removal
        urled = re.sub(r'https?://\S+|www\.\S+', '', lowered)
        
        # Text Simplification
        simplified = re.sub(r'\[.*?\]', '', urled)
        simplified = re.sub(r"\\W", " ", simplified)
        simplified = re.sub(r'<.*?>+', '', simplified)
        simplified = re.sub(r'[%s]' % re.escape(string.punctuation), '', simplified)
        simplified = re.sub(r'\n', '', simplified)
        simplified = re.sub(r'\w*\d\w*', '', simplified)

        # Tokenization
        tokens = nltk.word_tokenize(simplified)
        stems = [ps.stem(token) for token in tokens]
        filtered = [stem for stem in stems if stem not in stopwords]
        
        processed_texts.append(filtered)

    if len(processed_texts) == 1:
        return processed_texts[0]  # Return the single processed text
    else:
        return processed_texts  # Return the list of processed texts


In [18]:
# Apply preprocess function to X_train
X_train_preprocessed = X_train.apply(preprocess)

# Apply preprocess function to X_val
X_test_preprocessed = X_test.apply(preprocess)

In [19]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_preprocessed.apply(lambda x: ' '.join(x)))
X_test_tfidf = vectorizer.transform(X_test_preprocessed.apply(lambda x: ' '.join(x)))

In [20]:
# Initializing base models

mnb = MultinomialNB()
lr = LogisticRegression()
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
svm = SVC(probability=True)
mlp = MLPClassifier()

base_models = [mnb, lr, rf, knn, svm]
base_names = ['MNB', 'LR', 'RF', 'KNN', 'SVM']

In [21]:
# Define the parameter grid
# knn
param_grid_knn = {
    
    'n_neighbors': [3, 5, 7, 9, 11],  # Number of neighbors
    'weights': ['uniform', 'distance'],  # Weighting scheme
    'metric': ['euclidean', 'cosine', 'manhattan', 'minkowski']  # Distance metric
}

# rf
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1, 2, 3, 4, 'sqrt', 'log2']
}

# lr
param_grid_lr = {'penalty': ['l2'], 'C': [1, 10, 100, 1000], 'solver': ['lbfgs', 'liblinear']}

# mlp
param_grid_mlp = {
            'hidden_layer_sizes': [(10,), (20,), (50,), (100,)], 
            'activation': ['relu', 'tanh', 'logistic'], 
            'solver': ['sgd', 'adam'], 
            'learning_rate': ['constant', 'adaptive']
            }

# svm

param_grid_svm = {
    'C': [0.1, 1, 10],               
    'kernel': ['linear', 'rbf'],     
    'gamma': ['scale', 'auto'],     
    'degree': [2, 3, 4],             
    'class_weight': [None, 'balanced'],  
    'probability': [True]           
}

In [22]:
lr_grid = GridSearchCV(estimator=rf, param_grid=param_grid_lr, scoring='accuracy', cv=5, n_jobs=-1)
rf_grid = GridSearchCV(estimator=rf, param_grid=param_grid_rf, scoring='accuracy', cv=5, n_jobs=-1)
svm_grid = GridSearchCV(estimator=svm, param_grid=param_grid_svm, scoring='accuracy', cv=5, n_jobs=-1)
mlp_grid = GridSearchCV(estimator=mlp, param_grid=param_grid_mlp, scoring='accuracy', cv=5, n_jobs=-1)


knn_grid = GridSearchCV(estimator=knn, param_grid=param_grid_knn, scoring='accuracy', cv=5, n_jobs=-1)

In [23]:
knn_grid.fit(X_train_tfidf, y_train)

In [24]:
print("Best parameters: ", knn_grid.best_params_)
print("Accuracy: {:.2f}%".format(knn_grid.best_score_ * 100))

Best parameters:  {'metric': 'cosine', 'n_neighbors': 3, 'weights': 'distance'}
Accuracy: 84.73%


In [26]:
joblib.dump(knn_grid , '../../ModelsV3/gridsearch_knn.joblib')


['../../ModelsV3/gridsearch_knn.joblib']