In [1]:
%matplotlib inline

# Model Building Part 2

Code for building the models
_Author: Jimmy Charité_  
_Email: jimmy.charite@gmail.com_  

Following up on model building part one, I experiment with using the word vectors and with class imbalance corrections such as oversampling with SMOTE and undersampling via Tomek link removal.

I limit myself to the random forest, the best performing model in part 1, to streamline the exercise.

In [2]:
import os
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import Image
from IPython.core.display import HTML

In [3]:
# Step up one directory so the relative paths used below
# (e.g. './clean_data/clean_data.pkl') resolve.
# Guarded so that re-running this cell, or starting the kernel in a directory
# that already contains ./clean_data, does not climb one level higher each time.
retval = None
if not os.path.isdir('./clean_data'):
    retval = os.chdir("..")

In [4]:
# Read the pickled data frame; the path is relative to the working directory
# selected by the os.chdir call earlier in the notebook.
clean_data_path = './clean_data/clean_data.pkl'
clean_data = pd.read_pickle(clean_data_path)

In [5]:
# Preview the first rows to sanity-check the loaded columns.
clean_data.head()

Unnamed: 0,helpful,num_sents,num_words,readability,neg_senti,pos_senti,neu_senti,comp_senti,text_lemma,vec0,...,vec290,vec291,vec292,vec293,vec294,vec295,vec296,vec297,vec298,vec299
0,0.0,0.693147,3.610918,6.742881,0.079,0.068,0.853,-0.1027,product arrive label peanut actually small siz...,0.033346,...,-0.023125,-0.005069,0.007344,-0.045929,-0.017832,-0.018206,-0.017281,0.01241,0.020198,-0.002511
1,0.0,1.386294,3.555348,6.734948,0.0,0.448,0.552,0.9468,great taffy great price wide assortment yummy ...,0.037825,...,-0.015524,0.009058,0.020853,-0.058746,-0.001076,-0.013715,-0.035464,0.006317,0.023066,0.012566
2,0.0,1.609438,4.49981,6.743588,0.029,0.163,0.809,0.883,get wild hair taffy order pound bag taffy enjo...,0.039023,...,-0.011637,0.008717,0.007918,-0.046595,-0.012542,-0.028316,-0.036677,0.015261,0.016227,0.00893
3,0.0,1.609438,4.143135,6.742527,0.034,0.273,0.693,0.9346,saltwater taffy great flavor soft chewy candy ...,0.038912,...,-0.01044,0.006156,0.007695,-0.039642,-0.01208,-0.026868,-0.018743,0.009134,0.021543,0.016047
4,0.0,1.609438,3.526361,6.737915,0.0,0.48,0.52,0.9487,taffy good soft chewy flavor amazing definitel...,0.043776,...,-0.010004,-0.003239,0.014308,-0.050601,-0.0241,-0.023046,-0.017151,0.017009,0.010729,0.004194


In [6]:
# Remove the lemmatised text column: the feature matrix built below takes every
# column after the target, so a string column must not remain in the frame.
# Using drop(..., errors='ignore') instead of `del` keeps the cell re-runnable:
# a second execution is a no-op rather than a KeyError.
clean_data = clean_data.drop('text_lemma', axis=1, errors='ignore')

In [7]:
# List the columns that remain after removing 'text_lemma'.
clean_data.columns

Index(['helpful', 'num_sents', 'num_words', 'readability', 'neg_senti',
       'pos_senti', 'neu_senti', 'comp_senti', 'vec0', 'vec1',
       ...
       'vec290', 'vec291', 'vec292', 'vec293', 'vec294', 'vec295', 'vec296',
       'vec297', 'vec298', 'vec299'],
      dtype='object', length=308)

## Training and Testing Split

In [8]:
# Shared random seed and the fraction of rows held out for testing.
my_rand_state, test_size = 0, 0.25

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Column 0 is the 'helpful' target; every remaining column is a feature.
# DataFrame.as_matrix() was deprecated and then removed in pandas 1.0, so
# `.values` is used to obtain the same NumPy array on old and new versions.
X = clean_data.iloc[:, 1:].values
y = clean_data.iloc[:, 0].tolist()

In [11]:
# Hold out `test_size` of the rows for the final evaluation; the fixed seed
# keeps the partition reproducible between runs.
split_parts = train_test_split(X, y, test_size=test_size,
                               random_state=my_rand_state)
X_train, X_test, y_train, y_test = split_parts

## Feature Selection

In [12]:
from sklearn.feature_selection import VarianceThreshold

In [13]:
# Variance-based feature filter; its cut-off is tuned in the grid searches below.
vt = VarianceThreshold()

# Candidate cut-offs written as Bernoulli variances p*(1-p).
# NOTE(review): the features here look continuous (sentiment scores, word
# vectors), so the binary-variable interpretation may not apply - confirm that
# these thresholds are still meaningful for this feature set.
threshold = list(map(lambda p: p * (1 - p), (0, 0.05, 0.1, 0.15)))

Note, since the formula for the variance of binary variables is p*(1-p), where p is the proportion of times that the binary variable is 1, I use the proportion to define the variance thresholds. The max variance is 0.25 at p=0.5.

## Scaling

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Standardising transformer used as the 'scale' step of every pipeline below.
std_scale=StandardScaler()

## Class Imbalance Corrections

In [16]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks



In [17]:
# Over-sampler, seeded so the synthetic samples are reproducible.
smote = SMOTE(random_state=my_rand_state)

# Under-sampler based on Tomek-link removal. Its `random_state` argument was
# deprecated in imbalanced-learn 0.4 and removed in 0.6 (passing it there
# raises a TypeError), so it is omitted here.
tl = TomekLinks()

## Classification Models

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Seed the forest like the other stochastic components (train/test split,
# SMOTE) so that repeated runs of the grid search give the same results.
rf_clf = RandomForestClassifier(random_state=my_rand_state)

# Hyper-parameter candidates explored by the grid searches.
n_estimators = [100, 200]       # number of trees
max_features = [.1, .3, .5]     # fraction of features considered per split

In [20]:
# Class-weight candidates: sklearn's 'balanced' heuristic plus explicit
# weights of 1, 2 and 10 on class 1.
class_weight = ['balanced'] + [{1: w} for w in (1, 2, 10)]

## Creating Pipelines

In [21]:
from imblearn import pipeline #needed if mixing imblearn with sklearn classes
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

I plan on using imblearn classes in later iterations, so I use its pipeline from the beginning for convenience.

In [22]:
# Number of parallel workers passed to every GridSearchCV below.
n_jobs=4

In [23]:
n_folds = 10

# Unshuffled stratified folds. With shuffle=False a random_state has no effect,
# and scikit-learn >= 0.24 raises a ValueError when one is supplied, so it is
# left out; the resulting folds are the same as before.
skfold = StratifiedKFold(n_splits=n_folds, shuffle=False)

#### Random Forest Estimators

In [24]:
# Baseline pipeline: variance filter -> standardise -> random forest.
rf_clf_b = pipeline.Pipeline(steps=[
    ('vt', vt),
    ('scale', std_scale),
    ('clf', rf_clf),
])

# Grid search over the filter threshold and forest settings, cross-validated
# with `skfold` and scored by ROC AUC.
rf_clf_est_b_wv = GridSearchCV(
    estimator=rf_clf_b,
    cv=skfold,
    scoring='roc_auc',
    n_jobs=n_jobs,
    param_grid={
        'vt__threshold': threshold,
        'clf__n_estimators': n_estimators,
        'clf__max_features': max_features,
        'clf__class_weight': class_weight,
    },
)

In [25]:
# SMOTE variant: variance filter -> SMOTE over-sampling -> standardise -> forest.
rf_clf_b_smote = pipeline.Pipeline(steps=[
    ('vt', vt),
    ('smote', smote),
    ('scale', std_scale),
    ('clf', rf_clf),
])

# Same hyper-parameter grid, folds and ROC AUC scoring as the baseline search.
rf_clf_est_b_wv_smote = GridSearchCV(
    estimator=rf_clf_b_smote,
    cv=skfold,
    scoring='roc_auc',
    n_jobs=n_jobs,
    param_grid={
        'vt__threshold': threshold,
        'clf__n_estimators': n_estimators,
        'clf__max_features': max_features,
        'clf__class_weight': class_weight,
    },
)

In [26]:
# Tomek-link variant: variance filter -> Tomek-link under-sampling ->
# standardise -> forest.
rf_clf_b_tl = pipeline.Pipeline(steps=[
    ('vt', vt),
    ('tl', tl),
    ('scale', std_scale),
    ('clf', rf_clf),
])

# Same hyper-parameter grid, folds and ROC AUC scoring as the baseline search.
rf_clf_est_b_wv_tl = GridSearchCV(
    estimator=rf_clf_b_tl,
    cv=skfold,
    scoring='roc_auc',
    n_jobs=n_jobs,
    param_grid={
        'vt__threshold': threshold,
        'clf__n_estimators': n_estimators,
        'clf__max_features': max_features,
        'clf__class_weight': class_weight,
    },
)

## Fitting Estimators

In [None]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back to the vendored
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

Basic Estimators: no bag of words or PCA

In [None]:
# Make sure the output folder exists before the grid search starts, so the
# fitted search cannot be lost to a failed dump after the fit has finished.
if not os.path.isdir('./other_output'):
    os.makedirs('./other_output')

# Fit the baseline grid search on the training split and persist it to disk.
rf_clf_est_b_wv.fit(X_train, y_train)
joblib.dump(rf_clf_est_b_wv, './other_output/rf_clf_est_b_wv.pkl')

In [None]:
# Make sure the output folder exists before the grid search starts, so the
# fitted search cannot be lost to a failed dump after the fit has finished.
if not os.path.isdir('./other_output'):
    os.makedirs('./other_output')

# Fit the SMOTE grid search on the training split and persist it to disk.
rf_clf_est_b_wv_smote.fit(X_train, y_train)
joblib.dump(rf_clf_est_b_wv_smote, './other_output/rf_clf_est_b_wv_smote.pkl')

In [None]:
# Make sure the output folder exists before the grid search starts, so the
# fitted search cannot be lost to a failed dump after the fit has finished.
if not os.path.isdir('./other_output'):
    os.makedirs('./other_output')

# Fit the Tomek-link grid search on the training split and persist it to disk.
rf_clf_est_b_wv_tl.fit(X_train, y_train)
joblib.dump(rf_clf_est_b_wv_tl, './other_output/rf_clf_est_b_wv_tl.pkl')

## Testing Estimators

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# Reload the three fitted grid searches from disk so the evaluation cells can
# run without repeating the fits.
saved_searches = (
    './other_output/rf_clf_est_b_wv.pkl',
    './other_output/rf_clf_est_b_wv_smote.pkl',
    './other_output/rf_clf_est_b_wv_tl.pkl',
)
rf_clf_est_b_wv, rf_clf_est_b_wv_smote, rf_clf_est_b_wv_tl = map(
    joblib.load, saved_searches)

Basic Estimators: no bag of words or PCA

In [None]:
def _test_roc(estimator):
    """Return ``(fpr, tpr, roc_auc)`` for ``estimator`` on the test split.

    The score passed to ``roc_curve`` is column 1 of ``predict_proba(X_test)``.
    The thresholds returned by ``roc_curve`` are discarded locally, so the
    notebook-level ``_`` (IPython's last-output variable) is not overwritten.
    """
    scores = estimator.predict_proba(X_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, scores)
    return fpr, tpr, auc(fpr, tpr)


# One ROC curve and area per fitted grid search.
rf_fpr, rf_tpr, rf_roc_auc = _test_roc(rf_clf_est_b_wv)
rf_fpr_smote, rf_tpr_smote, rf_roc_auc_smote = _test_roc(rf_clf_est_b_wv_smote)
rf_fpr_tl, rf_tpr_tl, rf_roc_auc_tl = _test_roc(rf_clf_est_b_wv_tl)

In [None]:
# One entry per model: (false-positive rates, true-positive rates, colour, legend label).
roc_series = [
    (rf_fpr, rf_tpr, 'cyan',
     'RF (area = %0.2f)' % rf_roc_auc),
    (rf_fpr_smote, rf_tpr_smote, 'indigo',
     'RF w/ SMOTE (area = %0.2f)' % rf_roc_auc_smote),
    (rf_fpr_tl, rf_tpr_tl, 'seagreen',
     'RF w/ Tomek Links (area = %0.2f)' % rf_roc_auc_tl),
]
for series_fpr, series_tpr, series_color, series_label in roc_series:
    plt.plot(series_fpr, series_tpr, color=series_color, linestyle='--',
             label=series_label, lw=2)

# Diagonal reference line for a chance-level classifier.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', label='Luck')

# A small margin around the unit square keeps the curves off the frame.
axis_limits = [-0.05, 1.05]
plt.xlim(axis_limits)
plt.ylim(axis_limits)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves of Basic Models')
plt.legend(loc="lower right")

# Save before show() so the written file contains the rendered figure.
plt.savefig('./plots/ROC_Part_2.png', bbox_inches='tight')
plt.show()