In [1]:
# !pip install sklearn-pandas

In [2]:
# Packages for data 
import pandas as pd
import numpy as np
import pickle
from collections import Counter

# Packages for machine learning modelling
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, accuracy_score
# precision_score, recall_score, f1_score

from sklearn_pandas import DataFrameMapper
# from sklearn.pipeline import Pipeline
# from sklearn.pipeline import FeatureUnion

# Packages for sentiment analysis
from textblob import TextBlob


# Packages for visualisation 
import matplotlib.pyplot as plt

# Packages for NLP
import nltk

# Reading the data

In [4]:
# Load the pre-split train / validation / test sets in one pass.
# index_col=1 makes the second CSV column the DataFrame index of every split.
train_data, val_data, test_data = (
    pd.read_csv(csv_path, index_col=1)
    for csv_path in (
        "../Data/train_data.csv",
        "../Data/validation_data.csv",
        "../Data/test_data.csv",
    )
)

In [13]:
def _texts_and_labels(frame):
    """Return the `text_preprocessed` and `class_label` columns of `frame` as `.values` arrays."""
    return frame["text_preprocessed"].values, frame["class_label"].values


# Raw (pre-vectorization) documents and their class labels for each split.
X_train_text, y_train = _texts_and_labels(train_data)
X_val_text, y_val = _texts_and_labels(val_data)
X_test_text, y_test = _texts_and_labels(test_data)

# Creating the Base Model using Naive Bayes

## Using CountVectorizer

In [14]:
# Bag-of-words baseline: unigram counts, with scikit-learn's built-in English
# stop-word list removed.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,1))

# Learn the vocabulary from the training articles only and vectorize them in the
# same pass. fit_transform() is equivalent to fit() followed by transform(), but
# it does not tokenize the training corpus a second time.
X_train = vectorizer.fit_transform(X_train_text)

# Validation and test articles are projected onto the training vocabulary;
# terms unseen during fitting are ignored.
X_val = vectorizer.transform(X_val_text)
X_test = vectorizer.transform(X_test_text)

In [27]:
# vocabulary_ maps every learned term to its column index, so its size is the
# number of features. get_feature_names() was removed in scikit-learn 1.2;
# vocabulary_ exists in every version and avoids building a list of all terms.
print("number of features used:", len(vectorizer.vocabulary_))

number of features used: 238266


In [29]:
# The vectorizer output is a sparse document-term matrix: one row per article,
# one column per vocabulary term, holding that term's frequency in the article.
document_term_matrix_type = type(X_train)
print(document_term_matrix_type)

<class 'scipy.sparse.csr.csr_matrix'>


In [15]:
# Baseline classifier: Bernoulli Naive Bayes with default hyperparameters.
# With the default binarize=0.0 the term counts are thresholded, so the model
# only sees whether a term is present in an article, not how often.
naive_bayes_clf = BernoulliNB()
# Kept as the cell's last expression so the fitted estimator is displayed.
naive_bayes_clf.fit(X=X_train, y=y_train)

BernoulliNB()

In [28]:
# Compare bag-of-words representations built from different n-gram ranges.
# Each entry maps a human-readable label to the ngram_range passed to CountVectorizer.
count_vectorizer_params = {'unigram':(1,1), 'unigram and bigram': (1,2), 'bigram':(2,2)}
separator = "------------------------------------------"

for ngram, values in count_vectorizer_params.items():
    # The vocabulary is learned from the training articles only; every split is
    # then encoded against that same vocabulary (train, validation, test order).
    vectorizer = CountVectorizer(stop_words='english', ngram_range=values)
    vectorizer.fit(X_train_text)
    X_train, X_val, X_test = (
        vectorizer.transform(corpus)
        for corpus in (X_train_text, X_val_text, X_test_text)
    )

    print(f'CountVectorizer Model with {ngram}')
    # Refitting discards whatever the classifier learned in the previous iteration.
    naive_bayes_clf.fit(X_train, y_train)

    # Per-class metrics on the validation split.
    print('Testing with validation data:')
    val_pred = naive_bayes_clf.predict(X_val)
    print(classification_report(y_val, val_pred))
    print(separator)

    # Per-class metrics on the held-out test split.
    print('Testing using test data:')
    test_pred = naive_bayes_clf.predict(X_test)
    print(classification_report(y_test, test_pred))
    print(separator, separator, sep="\n")

CountVectorizer Model with unigram
Testing with validation data:
              precision    recall  f1-score   support

           0       0.89      0.90      0.90      6361
           1       0.90      0.89      0.90      6659

    accuracy                           0.90     13020
   macro avg       0.90      0.90      0.90     13020
weighted avg       0.90      0.90      0.90     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      6361
           1       0.90      0.91      0.91      6660

    accuracy                           0.90     13021
   macro avg       0.90      0.90      0.90     13021
weighted avg       0.90      0.90      0.90     13021

------------------------------------------
------------------------------------------
CountVectorizer Model with unigram and bigram
Testing with validation data:
              precision    recall  f1-score   suppo

## Using Tf-Idf

In [25]:
# Repeat the n-gram comparison with TF-IDF weighted features.
# NOTE(review): BernoulliNB binarizes its input at 0.0 by default, so the TF-IDF
# weights are reduced to term presence/absence. The recorded unigram report is
# identical to the CountVectorizer one, which is consistent with that. Confirm
# whether a weight-aware model was intended here.
tfidf_params = {'unigram':(1,1), 'unigram and bigram': (1,2), 'bigram':(2,2)}
separator = "------------------------------------------"

for ngram, values in tfidf_params.items():
    # IDF statistics and vocabulary come from the training articles only.
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=values)
    tfidf_vectorizer.fit(X_train_text)
    X_train, X_val, X_test = (
        tfidf_vectorizer.transform(corpus)
        for corpus in (X_train_text, X_val_text, X_test_text)
    )

    # A fresh classifier is created for every configuration.
    naive_bayes_clf = BernoulliNB()
    print(f"Model with {ngram}")
    naive_bayes_clf.fit(X_train, y_train)

    # Per-class metrics on the validation split.
    print("Testing using validation data:")
    y_val_pred = naive_bayes_clf.predict(X_val)
    print(classification_report(y_val, y_val_pred))
    print(separator)

    # Per-class metrics on the held-out test split.
    print("Testing using test data:")
    y_test_pred = naive_bayes_clf.predict(X_test)
    print(classification_report(y_test, y_test_pred))
    print(separator, separator, sep="\n")

Model with unigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.89      0.90      0.90      6361
           1       0.90      0.89      0.90      6659

    accuracy                           0.90     13020
   macro avg       0.90      0.90      0.90     13020
weighted avg       0.90      0.90      0.90     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      6361
           1       0.90      0.91      0.91      6660

    accuracy                           0.90     13021
   macro avg       0.90      0.90      0.90     13021
weighted avg       0.90      0.90      0.90     13021

------------------------------------------
------------------------------------------
Model with unigram and bigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.95   

# Feature Selection

In [34]:
# Grid over document-frequency cut-offs for the unigram+bigram CountVectorizer.
# Float values are proportions of training documents: terms that occur in fewer
# than min_df or in more than max_df of the documents are dropped.
min_threshold_config = [0.0, 0.05, 0.1, 0.15, 0.2]
max_threshold_config = [0.8, 0.85, 0.9, 0.95, 1.0]

for min_value in min_threshold_config:
    print('--------------------CountVectorizer--------------------')
    for max_value in max_threshold_config:
        vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=min_value, max_df=max_value)

        # fit_transform() learns the vocabulary and encodes the training set in a
        # single pass instead of tokenizing the (large) training corpus twice.
        X_train = vectorizer.fit_transform(X_train_text)
        X_val = vectorizer.transform(X_val_text)
        X_test = vectorizer.transform(X_test_text)

        print(f'CountVectorizer Model with min_df={min_value}, max_df={max_value}')
        naive_bayes_clf.fit(X_train, y_train)
        # Number of features kept by the thresholds. vocabulary_ is used because
        # get_feature_names() was removed in scikit-learn 1.2.
        print(len(vectorizer.vocabulary_))

        # NOTE(review): the validation report is disabled, so the thresholds are
        # being compared on the test split. Choosing hyperparameters on test data
        # biases the final test score - consider re-enabling the block below.
        # #Validation Data
        # print('Testing with validation data:')
        # val_pred = naive_bayes_clf.predict(X_val)
        # print(classification_report(y_val, val_pred))
        # print("------------------------------------------")

        # Test Data
        print('Testing using test data:')
        test_pred = naive_bayes_clf.predict(X_test)
        print(classification_report(y_test, test_pred))
        print("------------------------------------------")
        print("------------------------------------------")

--------------------CountVectorizer--------------------
CountVectorizer Model with min_df=0.0, max_df=0.8
4540301
Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.87      0.91      6361
           1       0.89      0.96      0.92      6660

    accuracy                           0.92     13021
   macro avg       0.92      0.92      0.92     13021
weighted avg       0.92      0.92      0.92     13021

------------------------------------------
------------------------------------------
CountVectorizer Model with min_df=0.0, max_df=0.85
4540301
Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.87      0.91      6361
           1       0.89      0.96      0.92      6660

    accuracy                           0.92     13021
   macro avg       0.92      0.92      0.92     13021
weighted avg       0.92      0.92      0.92     13021

-------------------------------------

In [43]:
# Sweep the lower document-frequency cut-off for the unigram+bigram TF-IDF model.
# Only min_df is varied here; max_df is left at the TfidfVectorizer default.
for min_value in min_threshold_config:
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=min_value)

    # fit_transform() learns vocabulary + IDF and encodes the training set in a
    # single pass instead of tokenizing the training corpus twice.
    X_train = tfidf_vectorizer.fit_transform(X_train_text)
    X_val = tfidf_vectorizer.transform(X_val_text)
    X_test = tfidf_vectorizer.transform(X_test_text)

    print(f'TF-IDF Model with min_df={min_value}')
    naive_bayes_clf.fit(X_train, y_train)
    # Number of features kept by the threshold. vocabulary_ is used because
    # get_feature_names() was removed in scikit-learn 1.2.
    print(len(tfidf_vectorizer.vocabulary_))

    # NOTE(review): the validation report is disabled, so min_df is being
    # compared on the test split - consider re-enabling the block below.
    # #Validation Data
    # print('Testing with validation data:')
    # val_pred = naive_bayes_clf.predict(X_val)
    # print(classification_report(y_val, val_pred))
    # print("------------------------------------------")

    # Test Data
    print('Testing using test data:')
    test_pred = naive_bayes_clf.predict(X_test)
    print(classification_report(y_test, test_pred))
    print("------------------------------------------")
    print("------------------------------------------")

TF-IDF Model with min_df=0.0, max_df=0.8
4540301
Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.87      0.91      6361
           1       0.89      0.96      0.92      6660

    accuracy                           0.92     13021
   macro avg       0.92      0.92      0.92     13021
weighted avg       0.92      0.92      0.92     13021

------------------------------------------
------------------------------------------
TF-IDF Model with min_df=0.0, max_df=0.85
4540301
Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.87      0.91      6361
           1       0.89      0.96      0.92      6660

    accuracy                           0.92     13021
   macro avg       0.92      0.92      0.92     13021
weighted avg       0.92      0.92      0.92     13021

------------------------------------------
------------------------------------------
TF-IDF Model with min_df=

In [47]:
# Cap the unigram+bigram TF-IDF vocabulary at a fixed number of features
# instead of filtering by document frequency.
max_features = 10000
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_features=max_features)

# fit_transform() learns vocabulary + IDF and encodes the training set in a
# single pass instead of tokenizing the training corpus twice.
X_train = tfidf_vectorizer.fit_transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

# Label the run with the setting that is actually applied. The previous label
# printed `min_value`, a leftover loop variable from the min_df sweep, although
# this vectorizer does not set min_df at all.
print(f'TF-IDF Model with max_features={max_features}')
naive_bayes_clf.fit(X_train, y_train)
# vocabulary_ is used because get_feature_names() was removed in scikit-learn 1.2.
print(len(tfidf_vectorizer.vocabulary_))

# #Validation Data
# print('Testing with validation data:')
# val_pred = naive_bayes_clf.predict(X_val)
# print(classification_report(y_val, val_pred))
# print("------------------------------------------")

# Test Data
print('Testing using test data:')
test_pred = naive_bayes_clf.predict(X_test)
print(classification_report(y_test, test_pred))
print("------------------------------------------")
print("------------------------------------------")

TF-IDF Model with min_df=0.2
10000
Testing using test data:
              precision    recall  f1-score   support

           0       0.91      0.84      0.87      6361
           1       0.86      0.92      0.89      6660

    accuracy                           0.88     13021
   macro avg       0.88      0.88      0.88     13021
weighted avg       0.88      0.88      0.88     13021

------------------------------------------
------------------------------------------


# TruncatedSVD

Background reading on this technique: https://vitalv.github.io/projects/doc-clustering-topic-modeling.html

In [None]:
# Find a suitable number of components for TruncatedSVD by measuring how much
# of the variance in X_train each candidate size retains.
n_comp = [1500] # 4,10,15,20,50,100,150,200,500,700,800,900,1000,1500,2000,2500,3000,3500 list containing different values of components
explained = [] # total explained variance ratio for each entry of n_comp
for x in n_comp:
    # TruncatedSVD uses a randomized solver by default; a fixed seed makes the
    # reported variance reproducible across kernel restarts.
    svd = TruncatedSVD(n_components=x, random_state=424)
    svd.fit(X_train)
    # Sum once and reuse for both the plot data and the log line.
    explained_ratio = svd.explained_variance_ratio_.sum()
    explained.append(explained_ratio)
    print("Number of components = %r and explained variance = %r"%(x,explained_ratio))
# marker='o' keeps the plot readable when n_comp holds a single value: a line
# through one point would otherwise draw nothing.
plt.plot(n_comp, explained, marker='o')
plt.xlabel('Number of components')
plt.ylabel("Explained Variance")
plt.title("Plot of Number of components v/s explained variance")
plt.show()

# svd = TruncatedSVD(n_components=5, n_iter=7, random_state=424)
# svd.fit(X_train)

# print(svd.explained_variance_ratio_)

In [None]:
# Reduce the sparse document-term matrix to 1000 dense components.
truncatedSVD = TruncatedSVD(n_components=1000)
# The decomposition is learned from the training matrix only.
X_truncated = truncatedSVD.fit_transform(X_train)

# Project the test matrix onto the components learned from the training data.
# Calling fit_transform() here would re-fit the SVD on the test set, producing
# components that do not correspond to the ones the classifier is trained on
# (and leaking test data into the feature extraction step).
X_test_truncated = truncatedSVD.transform(X_test)

In [None]:
# Train Naive Bayes on the SVD-reduced training features, then report
# per-class metrics on the SVD-reduced test features.
# NOTE(review): BernoulliNB binarizes its input at 0.0 by default, so only the
# sign of each SVD component would reach the model - confirm this is intended.
naive_bayes_clf.fit(X=X_truncated, y=y_train)

y_pred = naive_bayes_clf.predict(X=X_test_truncated)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Feature Selection for Additional Features

## Re-evaluating performance of Naive Bayes after adding new features

In [55]:
# Refit Naive Bayes on the current training matrix and report per-class metrics
# on the validation split.
# NOTE(review): X_train / X_val are whatever the most recently executed
# vectorizer cell left behind; this cell does not build any features itself.
# Confirm which feature set is expected before relying on the numbers.
naive_bayes_clf.fit(X=X_train, y=y_train)

y_pred = naive_bayes_clf.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.96      0.85      0.91      6361
           1       0.87      0.97      0.92      6659

    accuracy                           0.91     13020
   macro avg       0.92      0.91      0.91     13020
weighted avg       0.92      0.91      0.91     13020

