In [1]:
import requests
import time
import nltk
import pandas as pd
import regex as re
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('../csv_vectorized/subreddit_tfidif.csv')

In [3]:
df.shape

(3201, 18001)

In [4]:
def statistics_df_row(cm, model_name):
    
        d = {'Model': [model_name]}
        df = pd.DataFrame(data = d)
        
        model = cm
        
        TN = model.iloc[1,1]
        FP = model.iloc[0, 1]
        FN = model.iloc[1, 0]
        TP = model.iloc[0, 0]
        
        df['Accuracy'] = (TP + TN) / (TP+FP+TN+FN)
        df['Misclassification Rate'] = (FP + FN) / (TP +FP+TN+FN)
        df['Sensitivity'] = TP/(FN + TP)
        df['Specificity'] = TN/(TN+FP)
        df['Precision'] = TP / (TP + FP)
        
        
        return df

In [5]:
features = list(df.columns)
del_list = ['subreddit_topic', 'subreddit_body', 'target', 'subreddit_body_clean' ]
features = [i for i in features if i not in del_list]

In [6]:
#set X and y
X = df[features]
y = df['target']

In [7]:
#TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [8]:
#modeling

In [9]:
#Logistic Regression

In [10]:
lr = LogisticRegression(solver = 'liblinear')

In [11]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [12]:
lr.fit(X_train_sc, y_train)

LogisticRegression(solver='liblinear')

In [13]:
cross_val_score(lr, X_train_sc, y_train).mean()

0.3983333333333333

In [14]:
lr.score(X_train_sc, y_train)

0.9995833333333334

In [15]:
lr.score(X_test_sc, y_test)

0.9188514357053683

In [16]:
pred_lr = lr.predict(X_test_sc)

In [17]:
cm_1 = confusion_matrix(y_test, pred_lr)

In [18]:
cm_lr = pd.DataFrame(cm_1, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_lr

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,537,54
actual non-DadJokes,11,199


In [19]:
lr_stats = statistics_df_row(cm_lr, model_name ='linear_regression')
lr_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,linear_regression,0.918851,0.081149,0.979927,0.786561,0.908629


In [20]:
lr.coef_

array([[-0.0160189 ,  0.        ,  0.00893043, ...,  0.00114716,
        -0.00049938, -0.00550497]])

In [21]:
#gridsearch on logistic regression

In [22]:
lr = LogisticRegression(solver = 'liblinear')

In [23]:
my_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.5, 1.0, 25],
    
}

In [24]:
grid = GridSearchCV(lr, param_grid=my_params, cv=5)

In [25]:
grid.fit(X_train_sc, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [0.5, 1.0, 25], 'penalty': ['l1', 'l2']})

In [26]:
grid.score(X_train_sc, y_train)

0.9995833333333334

In [27]:
grid.score(X_test_sc, y_test)

0.9338327091136079

In [28]:
pred_lrgrid = grid.predict(X_test_sc)

In [29]:
cm_2 = confusion_matrix(y_test, pred_lrgrid)

In [30]:
cm_lrgrid = pd.DataFrame(cm_2, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_lrgrid

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,565,26
actual non-DadJokes,27,183


In [31]:
lrgrid_stats = statistics_df_row(cm_lrgrid, model_name ='linear_regression_with_grid')
lrgrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,linear_regression_with_grid,0.933833,0.066167,0.954392,0.875598,0.956007


In [32]:
#K Nearest Neighbors

In [33]:
knn = KNeighborsClassifier()

In [34]:
knn.fit(X_train_sc, y_train)

KNeighborsClassifier()

In [35]:
cross_val_score(knn, X_train_sc, y_train).mean()

0.27999999999999997

In [36]:
knn.score(X_train_sc, y_train)

0.28208333333333335

In [37]:
knn.score(X_test_sc, y_test)

0.27715355805243447

In [38]:
pred_knn = knn.predict(X_test_sc)

In [39]:
cm_3 = confusion_matrix(y_test, pred_knn)

In [40]:
cm_knn = pd.DataFrame(cm_3, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_knn

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,12,579
actual non-DadJokes,0,210


In [41]:
knn_stats = statistics_df_row(cm_knn, model_name ='k_nearest_neighbor')
knn_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,k_nearest_neighbor,0.277154,0.722846,1.0,0.26616,0.020305


In [42]:
#gridsearch on K Nearest Neighbors

In [43]:
knn = KNeighborsClassifier()

In [44]:
my_params = {
    'n_neighbors': [5, 10, 25],
    'weights': ['uniform', 'distance'],
    
}

In [45]:
grid = GridSearchCV(knn, param_grid=my_params, cv=5)

In [46]:
grid.fit(X_train_sc, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [5, 10, 25],
                         'weights': ['uniform', 'distance']})

In [47]:
grid.score(X_test_sc, y_test)

0.27715355805243447

In [48]:
pred_knngrid = grid.predict(X_test_sc)

In [49]:
cm_4 = confusion_matrix(y_test, pred_knngrid)

In [50]:
cm_knngrid = pd.DataFrame(cm_4, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_knngrid

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,12,579
actual non-DadJokes,0,210


In [51]:
knngrid_stats = statistics_df_row(cm_knngrid, model_name ='k_nearest_neighbor_with_grid')
knngrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,k_nearest_neighbor_with_grid,0.277154,0.722846,1.0,0.26616,0.020305


In [52]:
#Random Forests:

In [53]:
rf = RandomForestClassifier()

In [54]:
rf.fit(X_train_sc, y_train)

RandomForestClassifier()

In [55]:
cross_val_score(rf, X_train_sc, y_train).mean()

0.95375

In [56]:
rf.score(X_train_sc, y_train)

0.9995833333333334

In [57]:
rf.score(X_test_sc, y_test)

0.9500624219725343

In [58]:
pred_rf = rf.predict(X_test_sc)

In [59]:
cm_5 = confusion_matrix(y_test, pred_rf)

In [60]:
cm_rf = pd.DataFrame(cm_5, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_rf

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,560,31
actual non-DadJokes,9,201


In [61]:
rf_stats = statistics_df_row(cm_rf, model_name ='random_forrest')
rf_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,random_forrest,0.950062,0.049938,0.984183,0.866379,0.947547


In [62]:
#Gridsearch on Random Forests

In [63]:
rf = RandomForestClassifier()

In [64]:
my_params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [18, 20, 25],
    'max_depth': [4, 10, 20],
    'max_features': ['auto', 1.0, 2, 3]
    
}

In [65]:
grid = GridSearchCV(rf, param_grid = my_params, cv = 5)

In [66]:
grid.fit(X_train_sc, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 10, 20],
                         'max_features': ['auto', 1.0, 2, 3],
                         'n_estimators': [18, 20, 25]})

In [67]:
grid.score(X_train_sc, y_train)

0.9725

In [68]:
grid.score(X_test_sc, y_test)

0.9425717852684145

In [69]:
pred_rfgrid = grid.predict(X_test_sc)

In [70]:
cm_6 = confusion_matrix(y_test, pred_rfgrid)

In [71]:
cm_rfgrid = pd.DataFrame(cm_6, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_rfgrid

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,556,35
actual non-DadJokes,11,199


In [72]:
rfgrid_stats = statistics_df_row(cm_rfgrid, model_name ='random_forrest_with_grid')
rfgrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,random_forrest_with_grid,0.942572,0.057428,0.9806,0.850427,0.940778


In [73]:
#Extra Trees

In [3]:
et = ExtraTreesClassifier()

NameError: name 'ExtraTreesClassifier' is not defined

In [75]:
et.fit(X_train_sc, y_train)

ExtraTreesClassifier()

In [76]:
cross_val_score(et, X_train_sc, y_train).mean()

0.9591666666666667

In [2]:
et.score(X_test_sc, y_test)

NameError: name 'et' is not defined

In [1]:
pred_et = et.predict(X_test_sc)

NameError: name 'et' is not defined

In [79]:
cm_7 = confusion_matrix(y_test, pred_et)

In [80]:
cm_et = pd.DataFrame(cm_7, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_et

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,569,22
actual non-DadJokes,7,203


In [81]:
et_stats = statistics_df_row(cm_et, model_name ='extra_trees')
et_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,extra_trees,0.963795,0.036205,0.987847,0.902222,0.962775


In [82]:
#gridsearch on extra trees

In [83]:
my_params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [18, 20, 25],
    'max_depth': [4, 10, 20],
    'max_features': ['auto', 1.0, 2, 3]
    
}

In [84]:
grid = GridSearchCV(et, param_grid = my_params, cv = 5)

In [85]:
grid.fit(X_train_sc, y_train)

GridSearchCV(cv=5, estimator=ExtraTreesClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 10, 20],
                         'max_features': ['auto', 1.0, 2, 3],
                         'n_estimators': [18, 20, 25]})

In [86]:
grid.score(X_train_sc, y_train)

0.975

In [87]:
grid.score(X_test_sc, y_test)

0.9450686641697877

In [88]:
cm_8 = confusion_matrix(y_test, pred_et)

In [89]:
cm_etgrid = pd.DataFrame(cm_8, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_etgrid

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,569,22
actual non-DadJokes,7,203


In [90]:
etgrid_stats = statistics_df_row(cm_etgrid, model_name ='extra_trees_wit_grid')
etgrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,extra_trees_wit_grid,0.963795,0.036205,0.987847,0.902222,0.962775


In [91]:
lr_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,linear_regression,0.918851,0.081149,0.979927,0.786561,0.908629


In [92]:
lrgrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,linear_regression_with_grid,0.933833,0.066167,0.954392,0.875598,0.956007


In [93]:
knn_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,k_nearest_neighbor,0.277154,0.722846,1.0,0.26616,0.020305


In [94]:
knngrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,k_nearest_neighbor_with_grid,0.277154,0.722846,1.0,0.26616,0.020305


In [95]:
rf_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,random_forrest,0.950062,0.049938,0.984183,0.866379,0.947547


In [96]:
rfgrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,random_forrest_with_grid,0.942572,0.057428,0.9806,0.850427,0.940778


In [97]:
et_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,extra_trees,0.963795,0.036205,0.987847,0.902222,0.962775


In [98]:
etgrid_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,extra_trees_wit_grid,0.963795,0.036205,0.987847,0.902222,0.962775


In [99]:
stat_tables = [lrgrid_stats, knn_stats, knngrid_stats, rf_stats, rfgrid_stats, et_stats, etgrid_stats]

tfidif_stats = lr_stats.append(stat_tables).reset_index(drop = True)

In [100]:
tfidif_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,linear_regression,0.918851,0.081149,0.979927,0.786561,0.908629
1,linear_regression_with_grid,0.933833,0.066167,0.954392,0.875598,0.956007
2,k_nearest_neighbor,0.277154,0.722846,1.0,0.26616,0.020305
3,k_nearest_neighbor_with_grid,0.277154,0.722846,1.0,0.26616,0.020305
4,random_forrest,0.950062,0.049938,0.984183,0.866379,0.947547
5,random_forrest_with_grid,0.942572,0.057428,0.9806,0.850427,0.940778
6,extra_trees,0.963795,0.036205,0.987847,0.902222,0.962775
7,extra_trees_wit_grid,0.963795,0.036205,0.987847,0.902222,0.962775


In [102]:

tfidif_stats.to_csv('../csv_model_statistics/tfidif_model_statistics.csv', index = False)