In [1]:
# Render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline

# Experimental Model Building

Code for building the models  
_Author: Jimmy Charité_  
_Email: jimmy.charite@gmail.com_  

Experimenting with TensorFlow.

In [1]:
import os
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import Image
from IPython.core.display import HTML
import tensorflow as tf

In [2]:
# Move the working directory up one level so the relative paths below resolve.
# NOTE: this is a relative move, so re-running the cell climbs one more level
# each time; run it exactly once per kernel session.
retval = os.chdir("..")

In [3]:
# Load the pre-cleaned data set from its pickle file.
# Unpickling executes arbitrary code, so only load files produced by this project.
clean_data = pd.read_pickle('./clean_data/clean_data.pkl')

In [4]:
# Preview the first rows to sanity-check the loaded columns.
clean_data.head()

Unnamed: 0,helpful,num_sents,num_words,readability,neg_senti,pos_senti,neu_senti,comp_senti,text_lemma,vec0,...,vec290,vec291,vec292,vec293,vec294,vec295,vec296,vec297,vec298,vec299
0,0.0,0.693147,3.610918,6.742881,0.079,0.068,0.853,-0.1027,product arrive label peanut actually small siz...,0.033346,...,-0.023125,-0.005069,0.007344,-0.045929,-0.017832,-0.018206,-0.017281,0.01241,0.020198,-0.002511
1,0.0,1.386294,3.555348,6.734948,0.0,0.448,0.552,0.9468,great taffy great price wide assortment yummy ...,0.037825,...,-0.015524,0.009058,0.020853,-0.058746,-0.001076,-0.013715,-0.035464,0.006317,0.023066,0.012566
2,0.0,1.609438,4.49981,6.743588,0.029,0.163,0.809,0.883,get wild hair taffy order pound bag taffy enjo...,0.039023,...,-0.011637,0.008717,0.007918,-0.046595,-0.012542,-0.028316,-0.036677,0.015261,0.016227,0.00893
3,0.0,1.609438,4.143135,6.742527,0.034,0.273,0.693,0.9346,saltwater taffy great flavor soft chewy candy ...,0.038912,...,-0.01044,0.006156,0.007695,-0.039642,-0.01208,-0.026868,-0.018743,0.009134,0.021543,0.016047
4,0.0,1.609438,3.526361,6.737915,0.0,0.48,0.52,0.9487,taffy good soft chewy flavor amazing definitel...,0.043776,...,-0.010004,-0.003239,0.014308,-0.050601,-0.0241,-0.023046,-0.017151,0.017009,0.010729,0.004194


In [7]:
# Label column first, followed by every column from position 9 onward
# (presumably the vec0..vec299 columns shown in the preview; verify if the
# column layout of clean_data changes).
kept_cols = ['helpful'] + list(clean_data.columns[9:])

## Training and Testing Split

In [8]:
# Seed shared by the stochastic steps, and the fraction of rows held out for testing.
my_rand_state, test_size = 0, 0.25

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Feature matrix: every kept column except the first (the 'helpful' label).
# `.values` replaces DataFrame.as_matrix(), which is deprecated and was removed
# in pandas 1.0; both return the same NumPy array.
X = clean_data[kept_cols].iloc[:, 1:].values
# Target: the first kept column ('helpful') as a plain Python list.
y = clean_data[kept_cols].iloc[:, 0].tolist()

In [11]:
# Hold out `test_size` of the rows for final evaluation; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=test_size,
    random_state=my_rand_state,
)

## Setting Up TensorFlow

In [25]:
# One real-valued feature column spanning every column of X.
# X.shape[1] reads the column count directly instead of slicing out row 0,
# so it does not raise when X has no rows.
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=X.shape[1])]

In [31]:
# Feed-forward classifier with three hidden layers (10, 20 and 10 units).
# Checkpoints and summaries are written under model_dir.
dnn_clf = tf.contrib.learn.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[10, 20, 10],
    model_dir='./other_output/tf_model',
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_task_id': 0, '_save_summary_steps': 100, '_keep_checkpoint_every_n_hours': 10000, '_tf_random_seed': None, '_model_dir': None, '_is_chief': True, '_master': '', '_keep_checkpoint_max': 5, '_evaluation_master': '', '_num_ps_replicas': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fdd54f55e10>, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_environment': 'local', '_task_type': None, '_num_worker_replicas': 0}


In [48]:
class PassData(object):
    '''
    Callable input function that hands a fixed data set to tensorflow.

    The raw arrays are stored at construction time and only converted to
    tensors when the object is called. Estimators invoke ``input_fn`` inside
    the graph they build for training, so the tensors have to be created
    there; tensors created in ``__init__`` would live in whatever graph was
    the default when the object was constructed.

    Parameters
    ----------
    X : array-like
        Feature values.
    y : array-like
        Target values, aligned with the rows of ``X``.
    '''

    def __init__(self,X,y):
        # Keep the raw data; no tensorflow ops are created here.
        self.X=X
        self.y=y

    def __call__(self):
        '''Return ``(features, labels)`` as constant tensors in the current graph.'''
        return tf.constant(self.X), tf.constant(self.y)

In [49]:
# Train only on the training split so X_test / y_test stay unseen by the model.
train_data = PassData(X_train, y_train)

In [50]:
# Bound the training run: with steps=None, tf.contrib.learn keeps training
# for as long as input_fn yields data, and a constant input never runs out.
# 2000 is a starting point; tune as needed.
dnn_clf.fit(input_fn=train_data, steps=2000)

ValueError: Partitioned variable with name dnn/hiddenlayer_0/weights already exists. Did you mean to set reuse=True in VarScope?

In [11]:
from sklearn.feature_selection import VarianceThreshold

In [12]:
# Drops low-variance numerical features; the cut-off is tuned in the grid search.
vt = VarianceThreshold()
# Candidate cut-offs written as Bernoulli variances p * (1 - p).
threshold = [p * (1 - p) for p in (0, 0.05, 0.1, 0.15)]

## Scaling

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
# Standardises the numerical features (default StandardScaler settings).
std_scale = StandardScaler()

## Dimension Reduction

In [15]:
from sklearn.decomposition import TruncatedSVD

In [16]:
# TruncatedSVD uses a randomized solver by default, so seed it with the
# notebook-wide seed to make the reduced text features reproducible.
tsvd = TruncatedSVD(random_state=my_rand_state)
n_components = [100]  # limited to keep the training size manageable

## Text

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# TF-IDF bag-of-words vectoriser; lower-casing is switched off.
tfidf = TfidfVectorizer(lowercase=False)

## Custom Feature Separator

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin

In [20]:
class ExtractText(BaseEstimator, TransformerMixin):
    """
    Splits a 2-D feature array into its text column and its other columns.

    Parameters
    ----------
    text : bool
        If truthy, ``transform`` returns only the text column (1-D);
        otherwise it returns every column except the text column.
    n_text : int, default -1
        Positional index of the text column in ``X``.
    """
    def __init__(self, text,n_text=-1):
        self.text = text
        self.n_text=n_text

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X, y=None):
        """
        Return the text column of ``X`` when ``self.text`` is truthy,
        otherwise ``X`` without the text column.
        """
        if self.text:
            return X[:,self.n_text]
        if self.n_text == -1:
            # Default layout (text is the last column): plain slice, as before.
            return X[:,:-1]
        # Text column elsewhere: remove just that column. Slicing with
        # X[:, :n_text] would also drop every column to its right
        # (and return nothing at all for n_text=0).
        return np.delete(X, self.n_text, axis=1)

In [21]:
from sklearn.pipeline import FeatureUnion

## Classification Models

In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

Although tuning is not necessary for Naive Bayes, I pass the default parameters of that model to GridSearchCV anyway so that I can do a direct pair-wise comparison with the other models across the different steps of cross-validation.  

In the interest of time, I didn't use the SVM classifier.

In [23]:
# Gaussian Naive Bayes; the single-value grid keeps the default (data-derived) priors.
nb_clf = GaussianNB()
priors = [None]

In [24]:
# Quadratic discriminant analysis and the regularisation strengths to search.
qda_clf = QuadraticDiscriminantAnalysis()
reg_param = [0.0, 0.25, 0.5, 0.75]

In [25]:
# L2-penalised logistic regression and the inverse-regularisation values to search.
log_clf = LogisticRegression(penalty='l2')
C = [0.001, 0.01, 10, 100, 1000]

In [26]:
# Random forest, seeded with the notebook-wide seed so repeated runs of the
# grid search give the same scores.
rf_clf = RandomForestClassifier(random_state=my_rand_state)
# Grid: number of trees, and fraction of features tried at each split.
n_estimators = [100, 200]
max_features = [.1, .3, .5]

In [27]:
# Fully grown decision tree used as the bagging base learner.
dtree = DecisionTreeClassifier(max_depth=None, min_samples_split=2)
# Seed the bagging ensemble so its bootstrap samples are reproducible.
bagTree_clf = BaggingClassifier(base_estimator=dtree, random_state=my_rand_state)
# Grid: fraction of the training rows drawn for each bagged tree.
max_samples = [.3, .6]

In [28]:
# Class-weight candidates: sklearn's 'balanced' heuristic plus explicit
# weights of 1, 2 and 10 on class 1.
class_weight = ['balanced'] + [{1: w} for w in (1, 2, 10)]

## Creating Pipelines

In [29]:
from imblearn import pipeline #needed if mixing imblearn with sklearn classes
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

I plan on using imblearn classes for later iterations, so I use its pipeline from the beginning for convenience.

In [30]:
# Number of parallel workers handed to every GridSearchCV below.
n_jobs = 4

In [31]:
n_folds = 10
# Unshuffled stratified folds are already deterministic. random_state has no
# effect when shuffle=False, and newer scikit-learn releases reject that
# combination, so it is not passed here; the folds are the same as before.
skfold = StratifiedKFold(n_splits=n_folds, shuffle=False)

#### Main Feature Union

In [32]:
# Two parallel branches whose outputs are concatenated column-wise:
#   text_pipe: text column -> TF-IDF -> truncated SVD
#   numb_pipe: remaining columns -> variance filter -> standard scaling
ft_union = FeatureUnion(
    transformer_list=[
        ('text_pipe', pipeline.Pipeline([
            ('extract', ExtractText(text=True)),
            ('tfidf', tfidf),
            ('tsvd', tsvd),
        ])),
        ('numb_pipe', pipeline.Pipeline([
            ('extract', ExtractText(text=False)),
            ('vt', vt),
            ('scale', std_scale),
        ])),
    ],
)

#### Naive Bayes Estimators

In [33]:
# Naive Bayes on top of the shared feature union, tuned by ROC AUC.
nb_clf_b = pipeline.Pipeline(steps=[('union', ft_union), ('clf', nb_clf)])
nb_clf_est_b = GridSearchCV(
    estimator=nb_clf_b,
    param_grid={
        'union__text_pipe__tsvd__n_components': n_components,
        'union__numb_pipe__vt__threshold': threshold,
        'clf__priors': priors,
    },
    scoring='roc_auc',
    cv=skfold,
    n_jobs=n_jobs,
)

#### QDA Estimators

In [34]:
# QDA on top of the shared feature union, tuned by ROC AUC.
qda_clf_b = pipeline.Pipeline(steps=[('union', ft_union), ('clf', qda_clf)])
qda_clf_est_b = GridSearchCV(
    estimator=qda_clf_b,
    param_grid={
        'union__text_pipe__tsvd__n_components': n_components,
        'union__numb_pipe__vt__threshold': threshold,
        'clf__reg_param': reg_param,
    },
    scoring='roc_auc',
    cv=skfold,
    n_jobs=n_jobs,
)

#### Logistic Estimators

In [35]:
# Logistic regression on top of the shared feature union, tuned by ROC AUC.
log_clf_b = pipeline.Pipeline(steps=[('union', ft_union), ('clf', log_clf)])
log_clf_est_b = GridSearchCV(
    estimator=log_clf_b,
    param_grid={
        'union__text_pipe__tsvd__n_components': n_components,
        'union__numb_pipe__vt__threshold': threshold,
        'clf__C': C,
        'clf__class_weight': class_weight,
    },
    scoring='roc_auc',
    cv=skfold,
    n_jobs=n_jobs,
)

#### Random Forest Estimators

In [36]:
# Random forest on top of the shared feature union, tuned by ROC AUC.
rf_clf_b = pipeline.Pipeline(steps=[('union', ft_union), ('clf', rf_clf)])
rf_clf_est_b = GridSearchCV(
    estimator=rf_clf_b,
    param_grid={
        'union__text_pipe__tsvd__n_components': n_components,
        'union__numb_pipe__vt__threshold': threshold,
        'clf__n_estimators': n_estimators,
        'clf__max_features': max_features,
        'clf__class_weight': class_weight,
    },
    scoring='roc_auc',
    cv=skfold,
    n_jobs=n_jobs,
)

## Fitting Estimators

In [37]:
from sklearn.externals import joblib

Basic Estimators: TF-IDF bag of words reduced with truncated SVD, merged with the scaled numerical features

In [40]:
# Run the Naive Bayes grid search on the training split, then persist the
# fitted search object so the evaluation cells can reload it.
nb_clf_est_b.fit(X_train, y_train)
joblib.dump(value=nb_clf_est_b,
            filename='./other_output/merged/nb_clf_est_b.pkl')

['./other_output/merged/nb_clf_est_b.pkl']

In [41]:
# Run the QDA grid search on the training split, then persist the fitted
# search object so the evaluation cells can reload it.
qda_clf_est_b.fit(X_train, y_train)
joblib.dump(value=qda_clf_est_b,
            filename='./other_output/merged/qda_clf_est_b.pkl')

['./other_output/merged/qda_clf_est_b.pkl']

In [None]:
# Run the logistic-regression grid search on the training split, then persist
# the fitted search object so the evaluation cells can reload it.
log_clf_est_b.fit(X_train, y_train)
joblib.dump(value=log_clf_est_b,
            filename='./other_output/merged/log_clf_est_b.pkl')

['./other_output/merged/log_clf_est_b.pkl']

In [None]:
# Run the random-forest grid search on the training split, then persist the
# fitted search object so the evaluation cells can reload it.
rf_clf_est_b.fit(X_train, y_train)
joblib.dump(value=rf_clf_est_b,
            filename='./other_output/merged/rf_clf_est_b.pkl')

## Testing Estimators

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# Reload the fitted grid searches from disk so evaluation can run without refitting.
# joblib.load unpickles its input: only load files written by this notebook.
nb_clf_est_b = joblib.load('./other_output/merged/nb_clf_est_b.pkl')
qda_clf_est_b = joblib.load('./other_output/merged/qda_clf_est_b.pkl')
log_clf_est_b = joblib.load('./other_output/merged/log_clf_est_b.pkl')
rf_clf_est_b = joblib.load('./other_output/merged/rf_clf_est_b.pkl')

Basic Estimators: TF-IDF bag of words reduced with truncated SVD, merged with the scaled numerical features

In [None]:
# For each fitted search: score the held-out rows with the predicted
# probability of class 1 (column 1 of predict_proba), trace the ROC curve,
# and integrate it to get the AUC.

# Naive Bayes
nb_fpr, nb_tpr, _ = roc_curve(y_test, nb_clf_est_b.predict_proba(X_test)[:, 1])
nb_roc_auc = auc(nb_fpr, nb_tpr)

# QDA
qda_fpr, qda_tpr, _ = roc_curve(y_test, qda_clf_est_b.predict_proba(X_test)[:, 1])
qda_roc_auc = auc(qda_fpr, qda_tpr)

# Logistic regression
log_fpr, log_tpr, _ = roc_curve(y_test, log_clf_est_b.predict_proba(X_test)[:, 1])
log_roc_auc = auc(log_fpr, log_tpr)

# Random forest
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_clf_est_b.predict_proba(X_test)[:, 1])
rf_roc_auc = auc(rf_fpr, rf_tpr)

In [None]:
# One dashed ROC curve per model, each labelled with its test-set AUC.
plt.plot(nb_fpr, nb_tpr, linestyle='--', lw=2, color='cyan',
         label='NB (area = %0.2f)' % nb_roc_auc)
plt.plot(qda_fpr, qda_tpr, linestyle='--', lw=2, color='indigo',
         label='QDA (area = %0.2f)' % qda_roc_auc)
plt.plot(log_fpr, log_tpr, linestyle='--', lw=2, color='seagreen',
         label='LOG (area = %0.2f)' % log_roc_auc)
plt.plot(rf_fpr, rf_tpr, linestyle='--', lw=2, color='blue',
         label='RF (area = %0.2f)' % rf_roc_auc)

# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], color='k', linestyle='--', lw=2, label='Luck')

# A small margin around the unit square keeps curves off the frame.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves of Basic Models Using BOW & Macro-Text Stats')
plt.legend(loc="lower right")

# Save before show(), so the file is written while the figure is still current.
plt.savefig('./plots/ROC_Basic_BOW_MERGED.png', bbox_inches='tight')
plt.show()