In [1]:
import requests
import time
import nltk
import pandas as pd
import regex as re
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('../csv_vectorized/subreddit_count_vec.csv')

In [3]:
df.shape

(3201, 18001)

In [4]:
def statistics_df_row(cm, model_name):
    
        d = {'Model': [model_name]}
        df = pd.DataFrame(data = d)
        
        model = cm
        
        TN = model.iloc[1,1]
        FP = model.iloc[0, 1]
        FN = model.iloc[1, 0]
        TP = model.iloc[0, 0]
        
        df['Accuracy'] = (TP + TN) / (TP+FP+TN+FN)
        df['Misclassification Rate'] = (FP + FN) / (TP +FP+TN+FN)
        df['Sensitivity'] = TP/(FN + TP)
        df['Specificity'] = TN/(TN+FP)
        df['Precision'] = TP / (TP + FP)
        
        
        return df

In [5]:
features = list(df.columns)
del_list = ['subreddit_topic', 'subreddit_body', 'target', 'subreddit_body_clean' ]
features = [i for i in features if i not in del_list]

In [6]:
#set X and y
X = df[features]
y = df['target']

In [7]:
#TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [8]:
#modeling

In [9]:
#Logistic Regression

In [10]:
lr = LogisticRegression(solver = 'liblinear')

In [11]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [12]:
lr.fit(X_train_sc, y_train)

LogisticRegression(solver='liblinear')

In [13]:
cross_val_score(lr, X_train_sc, y_train).mean()

0.44708333333333333

In [14]:
lr.score(X_train_sc, y_train)

0.9995833333333334

In [15]:
lr.score(X_test_sc, y_test)

0.8352059925093633

In [16]:
pred_lr = lr.predict(X_test_sc)

In [17]:
cm_1 = confusion_matrix(y_test, pred_lr)

In [18]:
cm_lr = pd.DataFrame(cm_1, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_lr

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,462,129
actual non-DadJokes,3,207


In [19]:
lr_stats = statistics_df_row(cm_lr, model_name ='linear_regression')
lr_stats

df name: linear_regression_accuracy


Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,linear_regression,0.835206,0.164794,0.993548,0.616071,0.781726


In [20]:
lr.coef_

array([[ 1.00298052e-02,  0.00000000e+00,  2.24967272e-02, ...,
         8.77659209e-04, -5.44667124e-05, -6.36729980e-03]])

In [21]:
#gridsearch on logistic regression

In [22]:
lr = LogisticRegression(solver = 'liblinear')

In [23]:
my_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.5, 1.0, 25],
    
}

In [24]:
grid = GridSearchCV(lr, param_grid=my_params, cv=5)

In [25]:
grid.fit(X_train_sc, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [0.5, 1.0, 25], 'penalty': ['l1', 'l2']})

In [26]:
grid.score(X_train_sc, y_train)

0.9991666666666666

In [27]:
grid.score(X_test_sc, y_test)

0.9413233458177278

In [28]:
pred_lrgrid = grid.predict(X_test_sc)

In [29]:
cm_2 = confusion_matrix(y_test, pred_lrgrid)

In [30]:
cm_lrgrid = pd.DataFrame(cm_2, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_lrgrid

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,549,42
actual non-DadJokes,5,205


In [31]:
lrgrid_stats = statistics_df_row(cm_lrgrid, model_name ='linear_regression_with_grid')
lrgrid_stats

df name: linear_regression_with_grid_accuracy


Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,linear_regression_with_grid,0.941323,0.058677,0.990975,0.82996,0.928934


In [32]:
#K Nearest Neighbors

In [33]:
knn = KNeighborsClassifier()

In [34]:
knn.fit(X_train_sc, y_train)

KNeighborsClassifier()

In [35]:
cross_val_score(knn, X_train_sc, y_train).mean()

0.30125

In [36]:
knn.score(X_train_sc, y_train)

0.33708333333333335

In [37]:
knn.score(X_test_sc, y_test)

0.30337078651685395

In [38]:
pred_knn = knn.predict(X_test_sc)

In [39]:
cm_3 = confusion_matrix(y_test, pred_knn)

In [40]:
cm_knn = pd.DataFrame(cm_3, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_knn

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,34,557
actual non-DadJokes,1,209


In [41]:
knn_stats = statistics_df_row(cm_knn, model_name ='k_nearest_neighbor')
knn_stats

df name: k_nearest_neighbor_accuracy


Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,k_nearest_neighbor,0.303371,0.696629,0.971429,0.272846,0.05753


In [42]:
#gridsearch on K Nearest Neighbors

In [43]:
knn = KNeighborsClassifier()

In [44]:
my_params = {
    'n_neighbors': [5, 10, 25],
    'weights': ['uniform', 'distance'],
    
}

In [45]:
grid = GridSearchCV(knn, param_grid=my_params, cv=5)

In [46]:
grid.fit(X_train_sc, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [5, 10, 25],
                         'weights': ['uniform', 'distance']})

In [47]:
grid.score(X_test_sc, y_test)

0.30337078651685395

In [48]:
pred_knngrid = grid.predict(X_test_sc)

In [49]:
cm_4 = confusion_matrix(y_test, pred_knngrid)

In [50]:
cm_knngrid = pd.DataFrame(cm_4, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_knngrid

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,34,557
actual non-DadJokes,1,209


In [51]:
knngrid_stats = statistics_df_row(cm_knngrid, model_name ='k_nearest_neighbor_with_grid')
knngrid_stats

df name: k_nearest_neighbor_with_grid_accuracy


Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,k_nearest_neighbor_with_grid,0.303371,0.696629,0.971429,0.272846,0.05753


In [52]:
#Random Forests:

In [53]:
rf = RandomForestClassifier()

In [54]:
rf.fit(X_train_sc, y_train)

RandomForestClassifier()

In [55]:
cross_val_score(rf, X_train_sc, y_train).mean()

0.9491666666666667

In [56]:
rf.score(X_train_sc, y_train)

0.9995833333333334

In [57]:
rf.score(X_test_sc, y_test)

0.947565543071161

In [58]:
pred_rf = rf.predict(X_test_sc)

In [59]:
cm_5 = confusion_matrix(y_test, pred_rf)

In [60]:
cm_rf = pd.DataFrame(cm_5, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_rf

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,556,35
actual non-DadJokes,7,203


In [61]:
rf_stats = statistics_df_row(cm_rf, model_name ='random_forrest')
rf_stats

df name: random_forrest_accuracy


Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,random_forrest,0.947566,0.052434,0.987567,0.852941,0.940778


In [62]:
#Gridsearch on Random Forests

In [63]:
rf = RandomForestClassifier()

In [64]:
my_params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [18, 20, 25],
    'max_depth': [4, 10, 20],
    'max_features': ['auto', 1.0, 2, 3]
    
}

In [65]:
grid = GridSearchCV(rf, param_grid = my_params, cv = 5)

In [66]:
grid.fit(X_train_sc, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 10, 20],
                         'max_features': ['auto', 1.0, 2, 3],
                         'n_estimators': [18, 20, 25]})

In [67]:
grid.score(X_train_sc, y_train)

0.9479166666666666

In [68]:
grid.score(X_test_sc, y_test)

0.9263420724094882

In [69]:
pred_rfgrid = grid.predict(X_test_sc)

In [70]:
cm_6 = confusion_matrix(y_test, pred_rfgrid)

In [71]:
cm_rfgrid = pd.DataFrame(cm_6, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_rfgrid

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,577,14
actual non-DadJokes,45,165


In [72]:
rfgrid_stats = statistics_df_row(cm_rfgrid, model_name ='random_forrest_with_grid')
rfgrid_stats

df name: random_forrest_with_grid_accuracy


Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,random_forrest_with_grid,0.926342,0.073658,0.927653,0.921788,0.976311


In [73]:
#Extra Trees

In [74]:
et = ExtraTreesClassifier()

In [75]:
et.fit(X_train_sc, y_train)

ExtraTreesClassifier()

In [76]:
cross_val_score(et, X_train_sc, y_train).mean()

0.9483333333333333

In [77]:
et.score(X_test_sc, y_test)

0.9500624219725343

In [79]:
pred_et = et.predict(X_test_sc)

In [80]:
cm_7 = confusion_matrix(y_test, pred_et)

In [81]:
cm_et = pd.DataFrame(cm_7, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_et

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,558,33
actual non-DadJokes,7,203


In [82]:
et_stats = statistics_df_row(cm_et, model_name ='extra_trees')
et_stats

df name: extra_trees_accuracy


Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,extra_trees,0.950062,0.049938,0.987611,0.860169,0.944162


In [None]:
#gridsearch on extra trees

In [83]:
my_params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [18, 20, 25],
    'max_depth': [4, 10, 20],
    'max_features': ['auto', 1.0, 2, 3]
    
}

In [84]:
grid = GridSearchCV(et, param_grid = my_params, cv = 5)

In [85]:
grid.fit(X_train_sc, y_train)

GridSearchCV(cv=5, estimator=ExtraTreesClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 10, 20],
                         'max_features': ['auto', 1.0, 2, 3],
                         'n_estimators': [18, 20, 25]})

In [86]:
grid.score(X_train_sc, y_train)

0.9

In [87]:
grid.score(X_test_sc, y_test)

0.8589263420724095

In [88]:
cm_8 = confusion_matrix(y_test, pred_et)

In [92]:
cm_etgrid = pd.DataFrame(cm_8, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_etgrid

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,558,33
actual non-DadJokes,7,203


In [93]:
etgrid_stats = statistics_df_row(cm_etgrid, model_name ='extra_trees_wit_grid')
etgrid_stats

df name: extra_trees_wit_grid_accuracy


Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,extra_trees_wit_grid,0.950062,0.049938,0.987611,0.860169,0.944162


In [94]:
lr_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,linear_regression,0.835206,0.164794,0.993548,0.616071,0.781726


In [95]:
lrgrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,linear_regression_with_grid,0.941323,0.058677,0.990975,0.82996,0.928934


In [96]:
knn_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,k_nearest_neighbor,0.303371,0.696629,0.971429,0.272846,0.05753


In [97]:
knngrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,k_nearest_neighbor_with_grid,0.303371,0.696629,0.971429,0.272846,0.05753


In [98]:
rf_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,random_forrest,0.947566,0.052434,0.987567,0.852941,0.940778


In [99]:
rfgrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,random_forrest_with_grid,0.926342,0.073658,0.927653,0.921788,0.976311


In [100]:
et_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,extra_trees,0.950062,0.049938,0.987611,0.860169,0.944162


In [101]:
etgrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,extra_trees_wit_grid,0.950062,0.049938,0.987611,0.860169,0.944162


In [111]:
stat_tables = [lrgrid_stats, knn_stats, knngrid_stats, rf_stats, rfgrid_stats, et_stats, etgrid_stats]

cvec_stats = lr_stats.append(stat_tables).reset_index(drop = True)

In [114]:
cvec_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,linear_regression,0.835206,0.164794,0.993548,0.616071,0.781726
1,linear_regression_with_grid,0.941323,0.058677,0.990975,0.82996,0.928934
2,k_nearest_neighbor,0.303371,0.696629,0.971429,0.272846,0.05753
3,k_nearest_neighbor_with_grid,0.303371,0.696629,0.971429,0.272846,0.05753
4,random_forrest,0.947566,0.052434,0.987567,0.852941,0.940778
5,random_forrest_with_grid,0.926342,0.073658,0.927653,0.921788,0.976311
6,extra_trees,0.950062,0.049938,0.987611,0.860169,0.944162
7,extra_trees_wit_grid,0.950062,0.049938,0.987611,0.860169,0.944162


In [115]:

cvec_stats.to_csv('../csv_model_statistics/cvec_model_statistics.csv', index = False)