In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification # this is a library to make classification datasets
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('../csv_vectorized/subreddit_count_vec.csv')

In [3]:
df.head()

Unnamed: 0,subreddit_topic,subreddit_body,target,subreddit_body_clean,subreddit_body_clean_aa,subreddit_body_clean_aaa,subreddit_body_clean_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,subreddit_body_clean_aaaaaaahhhhhhhhhhhhhhhhhhhhhhhh,subreddit_body_clean_aate,subreddit_body_clean_ab,...,subreddit_body_clean_zombie,subreddit_body_clean_zone,subreddit_body_clean_zoo,subreddit_body_clean_zoolander,subreddit_body_clean_zoom,subreddit_body_clean_zoroastrianism,subreddit_body_clean_zorro,subreddit_body_clean_zosimus,subreddit_body_clean_zu,subreddit_body_clean_zulu
0,80sRock,Hey All Just wanted to share my acoustic ins...,0,hey wanted share acoustic instrumental version...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,80sRock,My workplace has one of those pumped in music ...,0,workplace ha one pumped music playlist pm bloc...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,80sRock,People of the world I just thought I d shar...,0,people world thought share attempt song includ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,80sRock,I want to listen to a song by them but don t k...,0,want listen song know start,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,80sRock,Title says it but my dad went to the Now and Z...,0,title say dad went zen tour wa younger got shi...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def statistics_df_row(cm, model_name):
    
        d = {'Model': [model_name]}
        df = pd.DataFrame(data = d)
        
        model = cm
        
        TN = model.iloc[1,1]
        FP = model.iloc[0, 1]
        FN = model.iloc[1, 0]
        TP = model.iloc[0, 0]
        
        df['Accuracy'] = (TP + TN) / (TP+FP+TN+FN)
        df['Misclassification Rate'] = (FP + FN) / (TP +FP+TN+FN)
        df['Sensitivity'] = TP/(FN + TP)
        df['Specificity'] = TN/(TN+FP)
        df['Precision'] = TP / (TP + FP)
        
        
        return df

In [5]:
features = list(df.columns)
del_list = ['subreddit_topic', 'subreddit_body', 'target', 'subreddit_body_clean' ]
features = [i for i in features if i not in del_list]

In [6]:
#set X and y
X = df[features]
y = df['target']

In [7]:
#TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [8]:
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2))

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2))

In [9]:
vote = VotingClassifier([
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('tree', DecisionTreeClassifier())
    
])
params = {
    'ada__n_estimators' : [95, 100, 110],
    'gb__n_estimators' : [15, 25, 35],
    'tree__max_depth' : [6]
    
}
gs = GridSearchCV(vote, param_grid=params, cv=5, n_jobs=8)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.9166666666666667


{'ada__n_estimators': 95, 'gb__n_estimators': 15, 'tree__max_depth': 6}

In [10]:
pred = gs.predict(X_test)

In [11]:
cm = confusion_matrix(y_test, pred)

In [12]:
cm_voting = pd.DataFrame(cm, columns=['pred DadJokes', 'pred non-DadJokes'], index=['actual DadJokes', 'actual non-DadJokes'])
cm_voting

Unnamed: 0,pred DadJokes,pred non-DadJokes
actual DadJokes,564,27
actual non-DadJokes,35,175


In [13]:
boost_stats = statistics_df_row(cm_voting, model_name ='voting_class_boost')
boost_stats

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,voting_class_boost,0.922597,0.077403,0.941569,0.866337,0.954315


In [14]:
df_2 = pd.read_csv('../csv_model_statistics/cvec_model_statistics.csv')

In [15]:
df_2

Unnamed: 0,Model,Accuracy,Misclassification Rate,Sensitivity,Specificity,Precision
0,linear_regression,0.835206,0.164794,0.993548,0.616071,0.781726
1,linear_regression_with_grid,0.941323,0.058677,0.990975,0.82996,0.928934
2,k_nearest_neighbor,0.303371,0.696629,0.971429,0.272846,0.05753
3,k_nearest_neighbor_with_grid,0.303371,0.696629,0.971429,0.272846,0.05753
4,random_forrest,0.947566,0.052434,0.987567,0.852941,0.940778
5,random_forrest_with_grid,0.926342,0.073658,0.927653,0.921788,0.976311
6,extra_trees,0.950062,0.049938,0.987611,0.860169,0.944162
7,extra_trees_wit_grid,0.950062,0.049938,0.987611,0.860169,0.944162


In [16]:
final = df_2.append(boost_stats)

In [17]:
final.to_csv('../csv_model_statistics/cvec_model_statistics_with_boost.csv', index = False)