# Project 3: Web APIs and Classification: Model Benchmarks

In [178]:
#Imports:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import requests
import re
from bs4 import BeautifulSoup as bs
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import random
import time

%matplotlib inline

In [179]:
# Set the graph style
plt.style.use('ggplot')

## Reading the dataframe

In [180]:
final_df = pd.read_csv('./datasets/final_df.csv')
final_df

Unnamed: 0,text,label
0,driven individual rushing towards dream ever s...,1.0
1,reduce bounce rate webpage,1.0
2,made animated summary lean start eric ries hop...,1.0
3,skate ramp business,1.0
4,help getting textile prototype created,1.0
...,...,...
2161,trying learn various ing strategy came across ...,0.0
2162,pretend know lot finance economics sold positi...,0.0
2163,bill ackman bet market recovery despite covid ...,0.0
2164,news covid vaccine drugmaker pfizer pfe partne...,0.0


## Train test split

Split the model into their train and test set before transforming the text using the count vectorizer

In [181]:
X = final_df['text']
y = final_df['label']

In [182]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Reset the indexes
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [183]:
print(X_train.shape)
print(X_test.shape)

(1624,)
(542,)


## Transforming the text using `countvectorizer`

In [184]:
# Import CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate the "CountVectorizer" object
cvec = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = None) 

In [185]:
# Transform the words to tokenize the words  
X_train_vec = cvec.fit_transform(X_train)
X_test_vec = cvec.transform(X_test)

In [186]:
# Convert X_train into a DataFrame.

X_train_df = pd.DataFrame(X_train_vec.toarray(),
                          columns=cvec.get_feature_names())
X_train_df

Unnamed: 0,aa,aaa,aapl,aar,aaron,aaxn,aaz,ab,abandon,abbv,...,zm,zoetis,zone,zoo,zookeeper,zoom,zts,zuck,zuckerberg,zweig
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1619,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1620,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [187]:
# Convert X_test into a DataFrame.
X_test_df = pd.DataFrame(X_test_vec.toarray(),
                         columns=cvec.get_feature_names())

X_test_df

Unnamed: 0,aa,aaa,aapl,aar,aaron,aaxn,aaz,ab,abandon,abbv,...,zm,zoetis,zone,zoo,zookeeper,zoom,zts,zuck,zuckerberg,zweig
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
538,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
539,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
540,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Baseline Accuracy

In [188]:
y_test.value_counts(normalize=True)

1.0    0.538745
0.0    0.461255
Name: label, dtype: float64

The baseline accuracy is required to check if the model performs better than the default model.

The baseline accuracy is the majority class fall which is the entrepreneur subreddit which is the class 1.

## Training on the logistics regression model

In [189]:
# Import the logistic regression model
from sklearn.linear_model import LogisticRegression

In [190]:
# Instantiate the Logistic Regression model, setting max_iter to a higher value to prevent convergence warning.
lr = LogisticRegression(solver='newton-cg',max_iter=500)

# Fit the model to the training data
lr.fit(X_train_vec, y_train)

# Evaluate the model on the training set
lr.score(X_train_vec, y_train)

0.9956896551724138

In [191]:
# Evaluate the model on the test set
lr.score(X_test_vec, y_test)

0.9059040590405905

It seems the model is overfitting comparing against the train and test set as the train set has a higher accuracy score compared to the test test.

I can reduce the number of features in the model to reduce the variance which will decrease the overfitting and help improve the accuracy score.

I can also increase regularization strength of the model to reduce the overfitting.

In [192]:
from sklearn.metrics import confusion_matrix

y_pred = lr.predict(X_test_vec)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print("True Negative:", tn)
print("False Positive:", fp)
print("True Positive:", tp)
print("False Negative:", fn)

True Negative: 219
False Positive: 31
True Positive: 272
False Negative: 20


In [193]:
confusion_matrix(y_test, y_pred)

array([[219,  31],
       [ 20, 272]], dtype=int64)

In [194]:
specificity = tn / (tn+fp) # How accurately can the model predict for the negative class
sensitivity = tp / (tp+fn) # How accurately can the model predict for the positive class

print('Specficity:', round(specificity,2))
print('Sensitivity:',round(sensitivity,2))

Specficity: 0.88
Sensitivity: 0.93


The Sensitivity is slightly higher compared to the Specificity, which means the model is slightly more likely to accurately predict the positive class compared to the negative class. However, since I'm trying to predict whether the model is able to accurately predict the subreddit where the post belongs to, optimizing for Sensitivity and Specificity would not be a good measure. Accuracy would be a better performance metric.

In [195]:
# AUC ROC Curve

## Logistics Regression: Identifying posts that were misclassified

In [196]:
# Index of Misclassified posts for the test set
index_misclassified_post = y_test[y_pred != y_test].index
index_misclassified_post

Int64Index([  1,   2,   5,   8,  12,  24,  33,  43,  56,  61,  66,  67,  79,
             83,  90, 103, 108, 138, 151, 153, 178, 211, 223, 232, 246, 247,
            266, 289, 295, 316, 319, 326, 337, 343, 359, 370, 384, 385, 405,
            418, 421, 437, 445, 464, 465, 478, 487, 499, 500, 503, 505],
           dtype='int64')

In [197]:
# Creating the dataframe for the misclassified posts
misclass_posts = pd.DataFrame(X_test[index_misclassified_post])
misclass_posts

Unnamed: 0,text
1,covid evaluation matrix
2,good career path mechanical engineer data scie...
5,covid
8,know wrong world constant money like end day m...
12,digital advertisement market reach b
24,competitor popping like crazy discussion
33,sien sientra going tit
43,worthwhile learning ment outside taking action
56,ace v icln
61,amazon start amazon pharmacy w free delivery p...


In [198]:
# Setting the index values for y_pred
y_pred_series = pd.Series(y_pred, index=X_test.index)
y_pred_series

0      1.0
1      1.0
2      0.0
3      0.0
4      1.0
      ... 
537    0.0
538    1.0
539    0.0
540    0.0
541    1.0
Length: 542, dtype: float64

In [199]:
# Adding the predicted and actual values for the labels
misclass_posts['y_true'] =  y_test[index_misclassified_post]
misclass_posts['y_pred'] = y_pred_series[index_misclassified_post]

In [200]:
# Length of the text
misclass_posts['length_of_text'] = misclass_posts['text'].map(lambda x:len(x.split()))
misclass_posts.head()

Unnamed: 0,text,y_true,y_pred,length_of_text
1,covid evaluation matrix,0.0,1.0,3
2,good career path mechanical engineer data scie...,1.0,0.0,7
5,covid,0.0,1.0,1
8,know wrong world constant money like end day m...,1.0,0.0,71
12,digital advertisement market reach b,0.0,1.0,5


In [201]:
# Posts that were misclassified as entrepreneur subreddit
misclass_entre = misclass_posts[misclass_posts['y_pred'] == 1].sort_values('length_of_text')
misclass_entre

Unnamed: 0,text,y_true,y_pred,length_of_text
5,covid,0.0,1.0,1
178,stm,0.0,1.0,1
370,nndm arkq,0.0,1.0,2
138,short penumbra,0.0,1.0,2
1,covid evaluation matrix,0.0,1.0,3
487,black rock monopoly,0.0,1.0,3
56,ace v icln,0.0,1.0,3
418,hpq short analysis,0.0,1.0,3
153,scam academy private,0.0,1.0,3
343,looking amzn alternative,0.0,1.0,3


In [202]:
# Majority of the misclassified class for entreg is 74% 
(misclass_entre['length_of_text'].map(lambda x:x<10).sum() / len(misclass_entre)) * 100

74.19354838709677

It's seems the majority of the posts that were misclassified as investing were have a length of less than 10 which is about 74%.

In [203]:
# Getting the frequency of the words that are misclassified as entrepreneur label
misclass_entre_words = " ".join(misclass_entre['text'].values)

misclass_entre_words_dict = {}
for word in misclass_entre_words.split():
    misclass_entre_words_dict[word] = misclass_entre_words.count(word)

In [204]:
pd.DataFrame(misclass_entre_words_dict.values(), 
             index=misclass_entre_words_dict.keys(),
             columns=['Frequency']).sort_values('Frequency',ascending=False).head(10)

Unnamed: 0,Frequency
e,377
b,53
w,47
v,35
ing,34
ed,26
go,12
day,9
men,8
hi,8


In [205]:
# Posts that were misclassified as investing subreddit
misclass_investing = misclass_posts[misclass_posts['y_pred'] == 0].sort_values('length_of_text')
misclass_investing

Unnamed: 0,text,y_true,y_pred,length_of_text
232,freight company,1.0,0.0,2
83,company formation,1.0,0.0,2
505,feel like paralyasis,1.0,0.0,3
503,freight transportation company,1.0,0.0,3
405,apply increase conversion rate,1.0,0.0,4
465,anyone opinion real estate wholesale,1.0,0.0,5
24,competitor popping like crazy discussion,1.0,0.0,5
211,st time importer home gym china,1.0,0.0,6
43,worthwhile learning ment outside taking action,1.0,0.0,6
79,cost efficient source branded eco friendly cup,1.0,0.0,7


In [206]:
# Majority of the misclassified class for investing is 74% 
(misclass_investing['length_of_text'].map(lambda x:x<10).sum() / len(misclass_investing)) * 100

80.0

It's seems the majority of the posts that were misclassified as investing were have a length of less than 10 which is about 80%.

## Combining the misclassified posts and obtaining the probability of the posts being classified

In [207]:
# The actual probability of the class being predicted by the model
combined_mis = pd.DataFrame(lr.predict_proba(X_test_vec[index_misclassified_post]), columns=['Invest prob', 'Entrepreneur prob'])
combined_mis = pd.concat([combined_mis, X_test[index_misclassified_post].reset_index(drop=True)], axis=1)
combined_mis

Unnamed: 0,Invest prob,Entrepreneur prob,text
0,0.481165,0.518835,covid evaluation matrix
1,0.520048,0.479952,good career path mechanical engineer data scie...
2,0.480389,0.519611,covid
3,0.74998,0.25002,know wrong world constant money like end day m...
4,0.448877,0.551123,digital advertisement market reach b
5,0.562297,0.437703,competitor popping like crazy discussion
6,0.49071,0.50929,sien sientra going tit
7,0.769746,0.230254,worthwhile learning ment outside taking action
8,0.482642,0.517358,ace v icln
9,0.219636,0.780364,amazon start amazon pharmacy w free delivery p...


The model misclassified some posts as most of the words fall under the opposite class.

## Interpreting the coefficients for the Logisitics Regression

In [208]:
# Top coefficients for the positive class, the entrepreneur subreddit
lr_coef = pd.DataFrame(np.exp(lr.coef_[0]),
                          index=vectorizer.get_feature_names(),
                          columns=['Coefficients']).sort_values('Coefficients',ascending=False)
lr_coef.head(10)

Unnamed: 0,Coefficients
business,7.405037
idea,5.7705
product,3.912808
startup,3.892313
marketing,3.593916
name,2.623289
com,2.408979
free,2.39842
usa,2.264786
start,2.19602


As the logistic regression coefficients represent the log odds that an observation is in target class, 1, given the values of it X variables, the log odd coefficients need to be converted to regular odds to make sense of them. This is done through exponentiating the log odds coefficients.

For example:

For every one-unit in `business`, the odds that the observation is in entrepreneur class is 7.4 times as large as the odds that the observation is not in the entrepreneur class provided all other variables are constant.

In [209]:
# Top bottom coefficients for the negative class, for the investing subreddit
lr_coef.tail(10)

Unnamed: 0,Coefficients
growth,0.32888
tesla,0.32778
roku,0.309921
portfolio,0.300534
thought,0.292947
palantir,0.263532
ment,0.236508
etf,0.214843
ing,0.101773
stock,0.059345


These coefficients are least likely to represent the entrepreneur subreddit class and more likely to represent the investing subreddit class given their low coefficients.

## Training on the Multi-nominal Bayes Model

I'll be using Multi-nominal Bayes as the X column is filled with the integer counts of the terms in each document.

In [210]:
# Import the Multinominal Naive bayes
from sklearn.naive_bayes import MultinomialNB

In [212]:
# Instiate the model
mnb = MultinomialNB()
# Fit the training set
mnb.fit(X_train_vec, y_train) 

# Accuracy score of the training set
mnb.score(X_train_vec, y_train)

0.9624384236453202

In [213]:
# Accuracy score of the test set
mnb.score(X_test_vec, y_test)

0.9428044280442804

There a slight overfitting of the model on the test set but Multi-nominal Naive Bayes seems to generalize better than Logistics Regression and scores better than the baseline model.

## Interpreting the coefficients for the Multi-nominal Bayes Model

In [214]:
#prob for positive class
pos_class_prob_sorted = mnb.feature_log_prob_[1, :].argsort()
#prob for negative class
neg_class_prob_sorted = mnb.feature_log_prob_[0, :].argsort()
#getting the top features 
neg_top_features = np.take(cvec.get_feature_names(), neg_class_prob_sorted)
pos_top_features = np.take(cvec.get_feature_names(), pos_class_prob_sorted)

In [230]:
mnb.coef_

array([[-10.32900036, -11.02214754, -11.02214754, ...,  -9.63585318,
         -9.63585318, -11.02214754]])

In [235]:
pos_class_prob_sorted

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Int64Index([10550,  7388,  7386,  7385,  3343,\n            ...\n             4022,  5429, 10440,  6799,  1274],\n           dtype='int64', length=10009). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

## Futher Model evaluation

### Optimizing Logisitics Regression model

Using GridSearch CV on the CountVectorizer, Logreg, mnb.

Using GridSearchCV on the TFIFD, Logreg, mnb 

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Pipe to add the count vectorizer and logistic regression
pipe_logreg = Pipeline([
    ('countvec', CountVectorizer(lowercase=False)), # Already coverted to lowercase
    ('logreg', LogisticRegression(max_iter=500))
])

# Parameters to test the different hyper parameters
params_log_reg = {
    'countvec__ngram_range': [(1,1),(2,2)], # Testing using unigrams and unigrams and bigrams
    'countvec__max_features': [5000, 6000, 7000, 8000], # Since features are about 10,551, I'll try to use lower features
    'countvec__min_df': [2,3], # Minimum number of documents to include token
    'countvec__max_df': [.8, .9], # Maximum number of documents to include token
    'logreg__solver': ['newton-cg', 'liblinear'], # Testing different algorithms
}

In [24]:
# Instantiate the GridSearchCV

gs_log_reg = GridSearchCV(pipe_logreg,
                 param_grid=params_log_reg,
                 cv=5)

In [25]:
gs_log_reg.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('countvec',
                                        CountVectorizer(lowercase=False)),
                                       ('logreg',
                                        LogisticRegression(max_iter=500))]),
             param_grid={'countvec__max_df': [0.8, 0.9],
                         'countvec__max_features': [5000, 6000, 7000, 8000],
                         'countvec__min_df': [2, 3],
                         'countvec__ngram_range': [(1, 1), (2, 2)],
                         'logreg__solver': ['newton-cg', 'liblinear']})

In [26]:
gs_log_reg.best_params_

{'countvec__max_df': 0.8,
 'countvec__max_features': 5000,
 'countvec__min_df': 2,
 'countvec__ngram_range': (1, 1),
 'logreg__solver': 'liblinear'}

In [27]:
gs_log_reg.best_score_

0.9014852801519467

In [28]:
gs_log_reg.score(X_train, y_train)

0.9926108374384236

In [29]:
gs_log_reg.score(X_test, y_test)

0.8985239852398524

In [30]:
y_pred = gs_log_reg.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print("True Negative:", tn)
print("False Positive:", fp)
print("True Positive:", tp)
print("False Negative:", fn)

True Negative: 218
False Positive: 32
True Positive: 269
False Negative: 23


In [31]:
confusion_matrix(y_test, y_pred)

array([[218,  32],
       [ 23, 269]], dtype=int64)

In [32]:
specificity = tn / (tn+fp) # How accurately can the model predict for the negative class
sensitivity = tp / (tp+fn) # How accurately can the model predict for the positive class

print('Specficity:', round(specificity,2))
print('Sensitivity:',round(sensitivity,2))

Specficity: 0.87
Sensitivity: 0.92


## Stopped here

### Optimizing Multi-nominal Naive Bayes model

In [122]:
# Pipe to add the count vectorizer and Multi-nominal Bayes model
pipe_mnb = Pipeline([
    ('countvec', CountVectorizer(lowercase=False)), # Already coverted to lowercase
    ('mnb', MultinomialNB())
])

# Parameters to test the different hyper parameters
params_mnb = {
    'countvec__ngram_range': [(1,1),(2,2)], # Testing using unigrams bigrams
    'countvec__max_features': [8000, 9000, 10000], # Since features are about 10,551, I'll try to use lower features
    'countvec__min_df': [1,2], # Minimum number of documents to include token
    'countvec__max_df': [.9, .95], # Maximum number of documents to include token
    'mnb__alpha': [0.1,0.2], # Testing different alpha values
}

In [123]:
# Instantiate the GridSearchCV

gs_mnb = GridSearchCV(pipe_mnb,
                 param_grid=params_mnb,
                 cv=5)

In [111]:
gs_mnb.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('countvec',
                                        CountVectorizer(lowercase=False)),
                                       ('mnb', MultinomialNB())]),
             param_grid={'countvec__max_df': [0.9, 0.95],
                         'countvec__max_features': [8000, 9000, 10000],
                         'countvec__min_df': [1, 2],
                         'countvec__ngram_range': [(1, 1), (2, 2)],
                         'mnb__alpha': [0.1, 0.2]})

In [112]:
gs_mnb.best_params_

{'countvec__max_df': 0.9,
 'countvec__max_features': 9000,
 'countvec__min_df': 1,
 'countvec__ngram_range': (1, 1),
 'mnb__alpha': 0.2}

In [113]:
gs_mnb.best_score_

0.926727445394112

In [114]:
gs_mnb.score(X_train, y_train)

0.9655172413793104

In [115]:
gs_mnb.score(X_test, y_test)

0.9464944649446494

## Using TFTID 

In [145]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [146]:
# Import CountVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

tvec = TfidfVectorizer(lowercase=False)

In [147]:
X_train = tvec.fit_transform(X_train)
X_test = tvec.transform(X_test)

In [135]:
# Convert X_train and X_test into their DataFrames.

X_train_df = pd.DataFrame(X_train.toarray(),
                          columns=tvec.get_feature_names())
X_test_df = pd.DataFrame(X_test.toarray(),
                          columns=tvec.get_feature_names())

In [136]:
X_train_df.head()

Unnamed: 0,aa,aaa,aapl,aar,aaron,aaxn,aaz,ab,abandon,abbv,...,zm,zoetis,zone,zoo,zookeeper,zoom,zts,zuck,zuckerberg,zweig
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.080284,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [137]:
X_test_df.head()

Unnamed: 0,aa,aaa,aapl,aar,aaron,aaxn,aaz,ab,abandon,abbv,...,zm,zoetis,zone,zoo,zookeeper,zoom,zts,zuck,zuckerberg,zweig
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Training on the logistics regression model

In [148]:
# Instantiate the Logistic Regression model, setting max_iter to a higher value to prevent convergence warning.
lr = LogisticRegression(solver='newton-cg',max_iter=500)

# Fit the model to the training data
lr.fit(X_train, y_train)

# Evaluate the model on the training set
lr.score(X_train, y_train)

0.9858374384236454

In [149]:
# Evaluate the model on the test set
lr.score(X_test, y_test)

0.940959409594096

# Need to check
It seems the model is overfitting comparing against the train and test set as the train set has a higher accuracy score compared to the test test.

I can reduce the number of features in the model to reduce the variance which will decrease the overfitting and help improve the accuracy score.

I can also increase regularization strength of the model to reduce the overfitting.

In [150]:
from sklearn.metrics import confusion_matrix

y_pred = lr.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print("True Negative:", tn)
print("False Positive:", fp)
print("True Positive:", tp)
print("False Negative:", fn)

True Negative: 233
False Positive: 17
True Positive: 277
False Negative: 15


In [151]:
confusion_matrix(y_test, y_pred)

array([[233,  17],
       [ 15, 277]], dtype=int64)

In [152]:
specificity = tn / (tn+fp) # How accurately can the model predict for the negative class
sensitivity = tp / (tp+fn) # How accurately can the model predict for the positive class

print('Specficity:', round(specificity,2))
print('Sensitivity:',round(sensitivity,2))

Specficity: 0.93
Sensitivity: 0.95


# Need to check

The Sensitivity is slightly higher compared to the Specificity, which means the model is slightly more likely to accurately predict the positive class compared to the negative class. However, since I'm trying to predict whether the model is able to accurately predict the subreddit where the post belongs to, optimizing for Sensitivity and Specificity would not be a good measure. Accuracy would be a better performance metric.

## Training on the Multi-nominal Bayes Model

I'll be using Multi-nominal Bayes as the X column is filled with the integer counts of the terms in each document.

In [153]:
# Import the Multinominal Naive bayes
from sklearn.naive_bayes import MultinomialNB

In [154]:
# Instiate the model
mnb = MultinomialNB()
# Fit the training set
mnb.fit(X_train, y_train) 

# Accuracy score of the training set
mnb.score(X_train, y_train)

0.9741379310344828

In [155]:
# Accuracy score of the test set
mnb.score(X_test, y_test)

0.940959409594096

There a slight overfitting of the model on the test set but Multi-nominal Naive Bayes seems to generalize better than Logistics Regression and scores better than the baseline model.

In [55]:
# AUC ROC Curve