In [1]:
# Packages for data 
import pandas as pd
import numpy as np
import pickle
from collections import Counter
from sklearn_pandas import DataFrameMapper

# Packages for machine learning modelling
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV



from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, accuracy_score
# precision_score, recall_score, f1_score


# Packages for visualisation 
import matplotlib.pyplot as plt

# Packages for NLP
import nltk

from sklearn.metrics import precision_recall_fscore_support




# Reading the data

In [2]:
train_data = pd.read_csv("new data/train_data.csv", index_col=1)
val_data = pd.read_csv("new data/validation_data.csv", index_col=1)
test_data = pd.read_csv("new data/test_data.csv", index_col=1)

In [3]:
X_train_text = train_data["text_preprocessed"].values
y_train = train_data["class_label"].values

X_val_text = val_data["text_preprocessed"].values
y_val = val_data["class_label"].values

X_test_text = test_data["text_preprocessed"].values
y_test = test_data["class_label"].values

# Creating Model using Logistic Regression

In [4]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
log_reg_clf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid)

# Feature Selection

## TF-IDF

### min_df = 0.01

In [5]:
print('--------------------TF-IDF--------------------')

tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.01)
tfidf_vectorizer.fit(X_train_text)

X_train = tfidf_vectorizer.transform(X_train_text)
# X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

print(f'TF-IDF Model with min_df=0.01')
log_reg_clf.fit(X_train, y_train)
num_features = len(tfidf_vectorizer.get_feature_names())
print(num_features)
# tfidf_numfeatures.append(num_features)

# #Validation Data
# print('Testing with validation data:')
# val_pred = log_reg_clf.predict(X_val)
# print(classification_report(y_val, val_pred))
# print("------------------------------------------")

# Test Data
print('Testing using test data:')
test_pred = log_reg_clf.predict(X_test)
report = classification_report(y_test, test_pred)
print(report)
print("------------------------------------------")
print("------------------------------------------")

print('Best Parameters : ',log_reg_clf.best_params_)

--------------------TF-IDF--------------------
TF-IDF Model with min_df=0.01


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

3373
Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      6361
           1       0.96      0.96      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021

------------------------------------------
------------------------------------------
Best Parameters :  {'C': 10}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### min_df = 0.15

In [6]:
print('--------------------TF-IDF--------------------')

tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.15)
tfidf_vectorizer.fit(X_train_text)

X_train = tfidf_vectorizer.transform(X_train_text)
# X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

print(f'TF-IDF Model with min_df=0.15')
log_reg_clf.fit(X_train, y_train)
num_features = len(tfidf_vectorizer.get_feature_names())
print(num_features)

# #Validation Data
# print('Testing with validation data:')
# val_pred = log_reg_clf.predict(X_val)
# print(classification_report(y_val, val_pred))
# print("------------------------------------------")

# Test Data
print('Testing using test data:')
test_pred = log_reg_clf.predict(X_test)
report = classification_report(y_test, test_pred)
print(report)
print("------------------------------------------")
print("------------------------------------------")

print('Best Parameters : ',log_reg_clf.best_params_)

--------------------TF-IDF--------------------
TF-IDF Model with min_df=0.15


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

134
Testing using test data:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      6361
           1       0.90      0.92      0.91      6660

    accuracy                           0.91     13021
   macro avg       0.91      0.91      0.91     13021
weighted avg       0.91      0.91      0.91     13021

------------------------------------------
------------------------------------------
Best Parameters :  {'C': 100}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Feature Selection for Additional Features

## With Added Features

In [7]:
train_data_features = pd.read_csv("Final datasets/train_data.csv")
# val_data_features = pd.read_csv("Final datasets/val_data.csv")
test_data_features = pd.read_csv("Final datasets/test_data.csv")

In [8]:
y_train = train_data_features["class_label"].values
y_test = test_data_features["class_label"].values
# y_val = val_data_features["class_label"].values

### All added features for min_df = 0.01 (3k)

In [9]:
# Initialise TfidfVectorizer with min_df = 0.01 as per feature selection
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.01)

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'sentence_count', 'prop_unique_words',
    'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
    'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
    'prop_discourse_relations', 'textblob_sentiment'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

# fit_transform mapper on train data with added features and transform test data with added features
X_train_added_features = mapper.fit_transform(train_data_features)
X_test_added_features = mapper.transform(test_data_features)
# X_val_added_features = mapper.transform(val_data_features)


In [10]:
#define logistic regression model
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
log_reg_clf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid)

In [11]:
log_reg_clf.fit(X_train_added_features, y_train)
y_pred = log_reg_clf.predict(X_test_added_features)
print(classification_report(y_test, y_pred))
print("------------------------------------------")
print("------------------------------------------")

print('Best Parameters : ',log_reg_clf.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      6361
           1       0.97      0.96      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021

------------------------------------------
------------------------------------------
Best Parameters :  {'C': 10}


### Selected added features for min_df = 0.01 (3k)

In [12]:
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'prop_unique_words', 'avg_sentence_length', 'prop_punctuations', 'prop_stopwords', 'prop_nouns'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

# fit_transform mapper on train data with added features and transform test data with added features
X_train_added_features = mapper.fit_transform(train_data_features)
X_test_added_features = mapper.transform(test_data_features)

log_reg_clf.fit(X_train_added_features, y_train)
y_pred = log_reg_clf.predict(X_test_added_features)
print(classification_report(y_test, y_pred))
print("------------------------------------------")
print("------------------------------------------")

print('Best Parameters : ',log_reg_clf.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      6361
           1       0.98      0.94      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021

------------------------------------------
------------------------------------------
Best Parameters :  {'C': 10}


### All added features for min_df = 0.15 (134)

In [13]:
# Initialise TfidfVectorizer with min_df = 0.01 as per feature selection
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.15)

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'sentence_count', 'prop_unique_words',
    'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
    'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
    'prop_discourse_relations', 'textblob_sentiment'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

# fit_transform mapper on train data with added features and transform test data with added features
X_train_added_features = mapper.fit_transform(train_data_features)
X_test_added_features = mapper.transform(test_data_features)

log_reg_clf.fit(X_train_added_features, y_train)
y_pred = log_reg_clf.predict(X_test_added_features)
print(classification_report(y_test, y_pred))
print("------------------------------------------")
print("------------------------------------------")

print('Best Parameters : ',log_reg_clf.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

              precision    recall  f1-score   support

           0       0.94      0.89      0.91      6361
           1       0.90      0.94      0.92      6660

    accuracy                           0.92     13021
   macro avg       0.92      0.92      0.92     13021
weighted avg       0.92      0.92      0.92     13021

------------------------------------------
------------------------------------------
Best Parameters :  {'C': 1000}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Selected added features for min_df = 0.15 (134)

In [14]:
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'prop_unique_words', 'avg_sentence_length', 'prop_punctuations', 'prop_stopwords', 'prop_nouns'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

# fit_transform mapper on train data with added features and transform test data with added features
X_train_added_features = mapper.fit_transform(train_data_features)
X_test_added_features = mapper.transform(test_data_features)

log_reg_clf.fit(X_train_added_features, y_train)
y_pred = log_reg_clf.predict(X_test_added_features)
print(classification_report(y_test, y_pred))
print("------------------------------------------")
print("------------------------------------------")

print('Best Parameters : ',log_reg_clf.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

              precision    recall  f1-score   support

           0       0.86      0.95      0.90      6361
           1       0.95      0.85      0.90      6660

    accuracy                           0.90     13021
   macro avg       0.90      0.90      0.90     13021
weighted avg       0.90      0.90      0.90     13021

------------------------------------------
------------------------------------------
Best Parameters :  {'C': 100}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
