# Project 3: Web APIs and Classification: Model Benchmarks

In [1]:
#Imports:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import requests
import re
from bs4 import BeautifulSoup as bs
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import random
import time

%matplotlib inline

In [2]:
# Set the graph style
plt.style.use('ggplot')

## Reading the dataframe

In [3]:
final_df = pd.read_csv('./datasets/final_df.csv')
final_df

Unnamed: 0,text,label
0,driven individual rushing towards dream ever s...,1.0
1,reduce bounce rate webpage,1.0
2,made animated summary lean start eric ries hop...,1.0
3,skate ramp business,1.0
4,help getting textile prototype created,1.0
...,...,...
2161,trying learn various ing strategy came across ...,0.0
2162,pretend know lot finance economics sold positi...,0.0
2163,bill ackman bet market recovery despite covid ...,0.0
2164,news covid vaccine drugmaker pfizer pfe partne...,0.0


## Train test split

Split the model into their train and test set before transforming the text using the count vectorizer

In [68]:
X = final_df['text']
y = final_df['label']

In [69]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [70]:
print(X_train.shape)
print(X_test.shape)

(1624,)
(542,)


## Transforming the text using `countvectorizer`

In [71]:
# Import CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate the "CountVectorizer" object
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = None) 

In [72]:
# Transform the words to tokenize the words  
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [73]:
# Convert X_train into a DataFrame.

X_train_df = pd.DataFrame(X_train_vec.toarray(),
                          columns=vectorizer.get_feature_names())
X_train_df

Unnamed: 0,aa,aaa,aapl,aar,aaron,aaxn,aaz,ab,abandon,abbv,...,zm,zoetis,zone,zoo,zookeeper,zoom,zts,zuck,zuckerberg,zweig
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1619,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1620,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
# Convert X_test into a DataFrame.
X_test_df = pd.DataFrame(X_test_vec.toarray(),
                         columns=vectorizer.get_feature_names())

X_test_df

Unnamed: 0,aa,aaa,aapl,aar,aaron,aaxn,aaz,ab,abandon,abbv,...,zm,zoetis,zone,zoo,zookeeper,zoom,zts,zuck,zuckerberg,zweig
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
538,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
539,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
540,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Baseline Accuracy

In [77]:
y_test.value_counts(normalize=True)

1.0    0.538745
0.0    0.461255
Name: label, dtype: float64

The baseline accuracy is required to check if the model performs better than the default model.

The baseline accuracy is the majority class fall which is the entrepreneur subreddit which is the class 1.

## Training on the logistics regression model

In [78]:
# Import the logistic regression model
from sklearn.linear_model import LogisticRegression

In [83]:
# Instantiate the Logistic Regression model, setting max_iter to a higher value to prevent convergence warning.
lr = LogisticRegression(solver='newton-cg',max_iter=500)

# Fit the model to the training data
lr.fit(X_train_vec, y_train)

# Evaluate the model on the training set
lr.score(X_train_vec, y_train)

0.9956896551724138

In [84]:
# Evaluate the model on the test set
lr.score(X_test_vec, y_test)

0.9059040590405905

It seems the model is overfitting comparing against the train and test set as the train set has a higher accuracy score compared to the test test.

I can reduce the number of features in the model to reduce the variance which will decrease the overfitting and help improve the accuracy score.

I can also increase regularization strength of the model to reduce the overfitting.

In [85]:
from sklearn.metrics import confusion_matrix

y_pred = lr.predict(X_test_vec)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print("True Negative:", tn)
print("False Positive:", fp)
print("True Positive:", tp)
print("False Negative:", fn)

True Negative: 219
False Positive: 31
True Positive: 272
False Negative: 20


In [86]:
confusion_matrix(y_test, y_pred)

array([[219,  31],
       [ 20, 272]], dtype=int64)

In [87]:
specificity = tn / (tn+fp) # How accurately can the model predict for the negative class
sensitivity = tp / (tp+fn) # How accurately can the model predict for the positive class

print('Specficity:', round(specificity,2))
print('Sensitivity:',round(sensitivity,2))

Specficity: 0.88
Sensitivity: 0.93


The Sensitivity is slightly higher compared to the Specificity, which means the model is slightly more likely to accurately predict the positive class compared to the negative class. However, since I'm trying to predict whether the model is able to accurately predict the subreddit where the post belongs to, optimizing for Sensitivity and Specificity would not be a good measure. Accuracy would be a better performance metric.

In [55]:
# AUC ROC Curve

## Investigating posts that were misclassified for Logistics Regression

In [88]:
# Index of Misclassified posts for the test set
index_misclassified_post = y_test[y_pred != y_test].index
index_misclassified_post

Int64Index([1356,   53, 1484, 1041, 1207,  157, 1258,  363, 1419, 1462, 1551,
             872,  346,  512, 1425, 1828,  370, 1229,    6, 1399, 1214,  235,
             722,   17, 1733,  975, 1506,  195, 2006, 1622,  726, 1301, 1933,
            1540, 1543, 1328, 1391, 1857,  177, 1368,  287, 1728, 1633, 1337,
             150, 1988, 1288, 2123, 1594,  419,  320],
           dtype='int64')

In [91]:
# Creating the dataframe for the misclassified posts
misclass_posts = pd.DataFrame(X_test[index_misclassified_post])
misclass_posts

Unnamed: 0,text
1356,covid evaluation matrix
53,good career path mechanical engineer data scie...
1484,covid
1041,know wrong world constant money like end day m...
1207,digital advertisement market reach b
157,competitor popping like crazy discussion
1258,sien sientra going tit
363,worthwhile learning ment outside taking action
1419,ace v icln
1462,amazon start amazon pharmacy w free delivery p...


In [97]:
# Setting the index values for y_pred
y_pred_series = pd.Series(y_pred, index=X_test.index)
y_pred_series

672     1.0
1356    1.0
53      0.0
1806    0.0
1118    1.0
       ... 
1949    0.0
875     1.0
1658    0.0
1896    0.0
549     1.0
Length: 542, dtype: float64

In [98]:
# Adding the predicted and actual values for the labels
misclass_posts['y_true'] =  y_test[index_misclassified_post]
misclass_posts['y_pred'] = y_pred_series[index_misclassified_post]

In [99]:
# Length of the text
misclass_posts['length_of_text'] = misclass_posts['text'].map(lambda x:len(x.split()))
misclass_posts.head()

Unnamed: 0,text,y_true,y_pred,length_of_text
1356,covid evaluation matrix,0.0,1.0,3
53,good career path mechanical engineer data scie...,1.0,0.0,7
1484,covid,0.0,1.0,1
1041,know wrong world constant money like end day m...,1.0,0.0,71
1207,digital advertisement market reach b,0.0,1.0,5


In [108]:
# Posts that were misclassified as entrepreneur subreddit
misclass_investing = misclass_posts[misclass_posts['y_pred'] == 1].sort_values('length_of_text')
misclass_investing

Unnamed: 0,text,y_true,y_pred,length_of_text
1484,covid,0.0,1.0,1
1214,stm,0.0,1.0,1
1328,nndm arkq,0.0,1.0,2
1229,short penumbra,0.0,1.0,2
1356,covid evaluation matrix,0.0,1.0,3
1288,black rock monopoly,0.0,1.0,3
1419,ace v icln,0.0,1.0,3
1368,hpq short analysis,0.0,1.0,3
1399,scam academy private,0.0,1.0,3
1540,looking amzn alternative,0.0,1.0,3


In [124]:
# Majority of the misclassified class for investing is 74% 
(misclass_investing['length_of_text'].map(lambda x:x<10).sum() / len(misclass_investing)) * 100

74.19354838709677

It's seems the majority of the posts that were misclassified as investing were have a length of less than 10 which is about 74%.

In [189]:
# Getting the frequency of the words that are misclassified as 
misclass_investing_words = " ".join(misclass_investing['text'].values)

misclass_investing_words_dict = {}
for word in misclass_investing_words.split():
    misclass_investing_words_dict[word] = misclass_investing_words.count(word)

In [208]:
pd.DataFrame(misclass_investing_words_dict.values(), 
             index=misclass_investing_words_dict.keys(),
             columns=['Frequency']).sort_values('Frequency',ascending=False).head(10)

Unnamed: 0,Frequency
e,377
b,53
w,47
v,35
ing,34
ed,26
go,12
day,9
men,8
hi,8


In [125]:
# Posts that were misclassified as investing subreddit
misclass_entre = misclass_posts[misclass_posts['y_pred'] == 0].sort_values('length_of_text')
misclass_entre

Unnamed: 0,text,y_true,y_pred,length_of_text
17,freight company,1.0,0.0,2
512,company formation,1.0,0.0,2
320,feel like paralyasis,1.0,0.0,3
419,freight transportation company,1.0,0.0,3
177,apply increase conversion rate,1.0,0.0,4
150,anyone opinion real estate wholesale,1.0,0.0,5
157,competitor popping like crazy discussion,1.0,0.0,5
235,st time importer home gym china,1.0,0.0,6
363,worthwhile learning ment outside taking action,1.0,0.0,6
346,cost efficient source branded eco friendly cup,1.0,0.0,7


In [126]:
# Majority of the misclassified class for entre is 74% 
(misclass_entre['length_of_text'].map(lambda x:x<10).sum() / len(misclass_entre)) * 100

80.0

It's seems the majority of the posts that were misclassified as investing were have a length of less than 10 which is about 80%.

## Interpreting the coefficients for the Logisitics Regression

In [156]:
# Top coefficients for the positive class
lr_coef = pd.DataFrame(lr.coef_[0],
                          index=vectorizer.get_feature_names(),
                          columns=['Coefficients']).sort_values('Coefficients',ascending=False)
lr_coef.head()

Unnamed: 0,Coefficients
business,2.002161
idea,1.752758
product,1.364253
startup,1.359004
marketing,1.279243


In [157]:
# Top negative coefficients for the negative class
lr_coef.tail()

Unnamed: 0,Coefficients
palantir,-1.333579
ment,-1.441775
etf,-1.537848
ing,-2.285011
stock,-2.824384


## Training on the Multi-nominal Bayes Model

I'll be using Multi-nominal Bayes as the X column is filled with the integer counts of the terms in each document.

In [19]:
# Import the Multinominal Naive bayes
from sklearn.naive_bayes import MultinomialNB

In [20]:
# Instiate the model
mnb = MultinomialNB()
# Fit the training set
mnb.fit(X_train, y_train) 

# Accuracy score of the training set
mnb.score(X_train, y_train)

0.9624384236453202

In [21]:
# Accuracy score of the test set
mnb.score(X_test, y_test)

0.9428044280442804

There a slight overfitting of the model on the test set but Multi-nominal Naive Bayes seems to generalize better than Logistics Regression and scores better than the baseline model.

## Futher Model evaluation

### Optimizing Logisitics Regression model

Using GridSearch CV on the CountVectorizer, Logreg, mnb.

Using GridSearchCV on the TFIFD, Logreg, mnb 

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Pipe to add the count vectorizer and logistic regression
pipe_logreg = Pipeline([
    ('countvec', CountVectorizer(lowercase=False)), # Already coverted to lowercase
    ('logreg', LogisticRegression(max_iter=500))
])

# Parameters to test the different hyper parameters
params_log_reg = {
    'countvec__ngram_range': [(1,1),(2,2)], # Testing using unigrams and unigrams and bigrams
    'countvec__max_features': [5000, 6000, 7000, 8000], # Since features are about 10,551, I'll try to use lower features
    'countvec__min_df': [2,3], # Minimum number of documents to include token
    'countvec__max_df': [.8, .9], # Maximum number of documents to include token
    'logreg__solver': ['newton-cg', 'liblinear'], # Testing different algorithms
}

In [24]:
# Instantiate the GridSearchCV

gs_log_reg = GridSearchCV(pipe_logreg,
                 param_grid=params_log_reg,
                 cv=5)

In [25]:
gs_log_reg.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('countvec',
                                        CountVectorizer(lowercase=False)),
                                       ('logreg',
                                        LogisticRegression(max_iter=500))]),
             param_grid={'countvec__max_df': [0.8, 0.9],
                         'countvec__max_features': [5000, 6000, 7000, 8000],
                         'countvec__min_df': [2, 3],
                         'countvec__ngram_range': [(1, 1), (2, 2)],
                         'logreg__solver': ['newton-cg', 'liblinear']})

In [26]:
gs_log_reg.best_params_

{'countvec__max_df': 0.8,
 'countvec__max_features': 5000,
 'countvec__min_df': 2,
 'countvec__ngram_range': (1, 1),
 'logreg__solver': 'liblinear'}

In [27]:
gs_log_reg.best_score_

0.9014852801519467

In [28]:
gs_log_reg.score(X_train, y_train)

0.9926108374384236

In [29]:
gs_log_reg.score(X_test, y_test)

0.8985239852398524

In [30]:
y_pred = gs_log_reg.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print("True Negative:", tn)
print("False Positive:", fp)
print("True Positive:", tp)
print("False Negative:", fn)

True Negative: 218
False Positive: 32
True Positive: 269
False Negative: 23


In [31]:
confusion_matrix(y_test, y_pred)

array([[218,  32],
       [ 23, 269]], dtype=int64)

In [32]:
specificity = tn / (tn+fp) # How accurately can the model predict for the negative class
sensitivity = tp / (tp+fn) # How accurately can the model predict for the positive class

print('Specficity:', round(specificity,2))
print('Sensitivity:',round(sensitivity,2))

Specficity: 0.87
Sensitivity: 0.92


## Stopped here

### Optimizing Multi-nominal Naive Bayes model

In [122]:
# Pipe to add the count vectorizer and Multi-nominal Bayes model
pipe_mnb = Pipeline([
    ('countvec', CountVectorizer(lowercase=False)), # Already coverted to lowercase
    ('mnb', MultinomialNB())
])

# Parameters to test the different hyper parameters
params_mnb = {
    'countvec__ngram_range': [(1,1),(2,2)], # Testing using unigrams bigrams
    'countvec__max_features': [8000, 9000, 10000], # Since features are about 10,551, I'll try to use lower features
    'countvec__min_df': [1,2], # Minimum number of documents to include token
    'countvec__max_df': [.9, .95], # Maximum number of documents to include token
    'mnb__alpha': [0.1,0.2], # Testing different alpha values
}

In [123]:
# Instantiate the GridSearchCV

gs_mnb = GridSearchCV(pipe_mnb,
                 param_grid=params_mnb,
                 cv=5)

In [111]:
gs_mnb.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('countvec',
                                        CountVectorizer(lowercase=False)),
                                       ('mnb', MultinomialNB())]),
             param_grid={'countvec__max_df': [0.9, 0.95],
                         'countvec__max_features': [8000, 9000, 10000],
                         'countvec__min_df': [1, 2],
                         'countvec__ngram_range': [(1, 1), (2, 2)],
                         'mnb__alpha': [0.1, 0.2]})

In [112]:
gs_mnb.best_params_

{'countvec__max_df': 0.9,
 'countvec__max_features': 9000,
 'countvec__min_df': 1,
 'countvec__ngram_range': (1, 1),
 'mnb__alpha': 0.2}

In [113]:
gs_mnb.best_score_

0.926727445394112

In [114]:
gs_mnb.score(X_train, y_train)

0.9655172413793104

In [115]:
gs_mnb.score(X_test, y_test)

0.9464944649446494

## Using TFTID 

In [145]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [146]:
# Import CountVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

tvec = TfidfVectorizer(lowercase=False)

In [147]:
X_train = tvec.fit_transform(X_train)
X_test = tvec.transform(X_test)

In [135]:
# Convert X_train and X_test into their DataFrames.

X_train_df = pd.DataFrame(X_train.toarray(),
                          columns=tvec.get_feature_names())
X_test_df = pd.DataFrame(X_test.toarray(),
                          columns=tvec.get_feature_names())

In [136]:
X_train_df.head()

Unnamed: 0,aa,aaa,aapl,aar,aaron,aaxn,aaz,ab,abandon,abbv,...,zm,zoetis,zone,zoo,zookeeper,zoom,zts,zuck,zuckerberg,zweig
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.080284,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [137]:
X_test_df.head()

Unnamed: 0,aa,aaa,aapl,aar,aaron,aaxn,aaz,ab,abandon,abbv,...,zm,zoetis,zone,zoo,zookeeper,zoom,zts,zuck,zuckerberg,zweig
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Training on the logistics regression model

In [148]:
# Instantiate the Logistic Regression model, setting max_iter to a higher value to prevent convergence warning.
lr = LogisticRegression(solver='newton-cg',max_iter=500)

# Fit the model to the training data
lr.fit(X_train, y_train)

# Evaluate the model on the training set
lr.score(X_train, y_train)

0.9858374384236454

In [149]:
# Evaluate the model on the test set
lr.score(X_test, y_test)

0.940959409594096

# Need to check
It seems the model is overfitting comparing against the train and test set as the train set has a higher accuracy score compared to the test test.

I can reduce the number of features in the model to reduce the variance which will decrease the overfitting and help improve the accuracy score.

I can also increase regularization strength of the model to reduce the overfitting.

In [150]:
from sklearn.metrics import confusion_matrix

y_pred = lr.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print("True Negative:", tn)
print("False Positive:", fp)
print("True Positive:", tp)
print("False Negative:", fn)

True Negative: 233
False Positive: 17
True Positive: 277
False Negative: 15


In [151]:
confusion_matrix(y_test, y_pred)

array([[233,  17],
       [ 15, 277]], dtype=int64)

In [152]:
specificity = tn / (tn+fp) # How accurately can the model predict for the negative class
sensitivity = tp / (tp+fn) # How accurately can the model predict for the positive class

print('Specficity:', round(specificity,2))
print('Sensitivity:',round(sensitivity,2))

Specficity: 0.93
Sensitivity: 0.95


# Need to check

The Sensitivity is slightly higher compared to the Specificity, which means the model is slightly more likely to accurately predict the positive class compared to the negative class. However, since I'm trying to predict whether the model is able to accurately predict the subreddit where the post belongs to, optimizing for Sensitivity and Specificity would not be a good measure. Accuracy would be a better performance metric.

## Training on the Multi-nominal Bayes Model

I'll be using Multi-nominal Bayes as the X column is filled with the integer counts of the terms in each document.

In [153]:
# Import the Multinominal Naive bayes
from sklearn.naive_bayes import MultinomialNB

In [154]:
# Instiate the model
mnb = MultinomialNB()
# Fit the training set
mnb.fit(X_train, y_train) 

# Accuracy score of the training set
mnb.score(X_train, y_train)

0.9741379310344828

In [155]:
# Accuracy score of the test set
mnb.score(X_test, y_test)

0.940959409594096

There a slight overfitting of the model on the test set but Multi-nominal Naive Bayes seems to generalize better than Logistics Regression and scores better than the baseline model.

In [55]:
# AUC ROC Curve