## Model Selection

Import libraries and load the current modeling data for `AmazonReviews`

In [1]:
from nltk.corpus import words, stopwords
from nltk import SnowballStemmer
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import StratifiedKFold

from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score, accuracy_score


import numpy as np
import pandas as pd
import pickle

stemmer = SnowballStemmer('english')

def english_corpus(doc, tkpat=re.compile('\\b[a-z][a-z]+\\b')):
    return [stemmer.stem(w) for w in tkpat.findall(doc)]

MODELING_PATH = '../data/modeling/'
PATH = '../data/amazon_reviews_us_Toys_v1_00.tsv'

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from AmazonReviews import AmazonReviews

ar = AmazonReviews()

In [5]:
ar.load_data(PATH)
ar.calc_trend_score()
ar.create_observations()
ar.create_train_test_split()

Read from pickle...


## Setup

Create inital dictionaries of standard parameters to be utilized in the different algorithms. 

In [6]:
skf = StratifiedKFold(n_splits = 5, random_state=ar.RANDOM_STATE)

count_vectorizer_parameters = {
    'stop_words': set(stopwords.words()), 
    'tokenizer': english_corpus, 
    'min_df': 100,
    'max_df': 0.2,
    'ngram_range': (1,2)
}

lda_parameters = {
    'n_components': 5,
    'learning_decay': 0.6,
    'batch_size': 256,
    'learning_offset': 1024,
    'learning_method': 'online',
    'topic_word_prior': 0.005,
    'n_jobs': -1,
    'random_state': ar.RANDOM_STATE
}

xgb_parameters = {
    'random_state': ar.RANDOM_STATE,
    'n_jobs': -1,
    'objective': 'binary:logistic'
}

bayes_cv_parameters = {
    'n_iter': 10,
    'scoring': 'roc_auc',
    'n_jobs': -1,
    'cv': skf,
    'random_state': ar.RANDOM_STATE
}


Create the document transformation pipeline.

In [17]:
doc_pipeline = Pipeline(
    [
        ('cnt_vector', CountVectorizer(**count_vectorizer_parameters)),
        ('lda', LatentDirichletAllocation(**lda_parameters)),
        ('log_tansform', FunctionTransformer(np.log))
    ]
)

Transform `X_train`.

In [18]:
ar.X_train.head()

43652     this is great for your hair  and it chainges c...
110963    Not worth   bucks!!!!! This toy feels really c...
148853    I saw this same size at Walmart for $ . .  Grr...
193505    There is nothing wrong with the quality of the...
141197    I purchased this boat as a RTR boat it ran awe...
Name: review_body, dtype: object

In [19]:
X_train_transformed = doc_pipeline.fit_transform(ar.X_train)
pd.DataFrame(X_train_transformed).head()

Unnamed: 0,0,1,2,3,4
0,-1.171857,-1.519457,-2.252676,-3.803085,-1.067233
1,-3.386933,-0.663706,-5.518817,-5.505281,-0.813796
2,-3.552461,-0.675773,-3.555348,-0.904099,-3.536834
3,-0.74688,-1.060022,-2.414319,-6.074681,-2.430635
4,-4.932876,-0.375,-1.234507,-4.922828,-4.926993


In [28]:
# save progress
def save(obj, obj_name):
    f = MODELING_PATH + obj_name
    pickle.dump(obj, open(f, 'wb'))

def load(obj_name):
    f = MODELING_PATH + obj_name
    return pickle.load(open(f, 'rb'))

In [30]:
save(X_train_transformed, 'baseline_X_train')

Create the parameters to perform `BayesSeachCV` over.

In [10]:
xgb_search_params = {
    'max_depth': Integer(2, 6),
    'learning_rate': Real(0, 0.5),
    'n_estimators': Integer(100, 1000),
    'gamma': Real(0, 0.5)
}

Fit the baseline model.

In [11]:
xgb = XGBClassifier(**xgb_parameters)

In [33]:
bayes_search = BayesSearchCV(xgb, xgb_search_params, **bayes_cv_parameters)
bayes_search.fit(X_train_transformed, ar.y_train)

BayesSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params=None, iid=True, n_iter=10, n_jobs=-1, n_points=1,
       optimizer_kwargs=None, pre_dispatch='2*n_jobs', random_state=42,
       refit=True, return_train_score=False, scoring='roc_auc',
       search_spaces={'max_depth': Integer(low=2, high=6), 'learning_rate': Real(low=0, high=0.5, prior='uniform', transform='identity'), 'n_estimators': Integer(low=100, high=1000), 'gamma': Real(low=0, high=0.5, prior='uniform', transform='identity')},
       verbose=0)

In [100]:
y_pred = bayes_search.best_estimator_.predict_proba(X_train_transformed)[:,1]

In [68]:
bayes_search.best_params_

{'gamma': 0.4061979941786817,
 'learning_rate': 0.08593578069828035,
 'max_depth': 4,
 'n_estimators': 822}

In [101]:
ar.log_score(ar.y_train, y_pred, 'baseline')
ar.results

Unnamed: 0,Precision,Recall,F1,Accuracy,AUC
baseline,1.0,0.006575,0.013064,0.989508,0.91225


In [None]:
save(bayes_search.best_estimator_, 'baeline_estimator.pkl')

In [116]:
save(ar.results, 'results_df.pkl')
save(ar.y_scores, 'y_scores.pkl')

Pretty high AUC, but the confusion matrix metrics are most likely skewed with the default cutoff of 50%.

## Next model

Let's now try up-sampling to see if this can improve the classification.

In [105]:
sm = SMOTE(random_state=ar.RANDOM_STATE, n_jobs=-1)

X_train_SMOTE, y_train_SMOTE = sm.fit_sample(X_train_transformed, ar.y_train)

save(X_train_SMOTE, 'X_train_baseline_SMOTE.pkl')
save(y_train_SMOTE, 'y_train_baseline_SMOTE.pkl')

In [14]:
# add some logging in the BayesSearchCV
bayes_cv_parameters['verbose'] = 2
bayes_cv_parameters['n_jobs'] = 1

In [None]:
bayes_search = BayesSearchCV(xgb, xgb_search_params, **bayes_cv_parameters)
bayes_search.fit(X_train_SMOTE, y_train_SMOTE)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] gamma=0.20505197942665693, learning_rate=0.36386287158866254, max_depth=6, n_estimators=384 
[CV]  gamma=0.20505197942665693, learning_rate=0.36386287158866254, max_depth=6, n_estimators=384, total=  59.7s
[CV] gamma=0.20505197942665693, learning_rate=0.36386287158866254, max_depth=6, n_estimators=384 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   59.7s remaining:    0.0s


[CV]  gamma=0.20505197942665693, learning_rate=0.36386287158866254, max_depth=6, n_estimators=384, total= 1.0min
[CV] gamma=0.20505197942665693, learning_rate=0.36386287158866254, max_depth=6, n_estimators=384 
[CV]  gamma=0.20505197942665693, learning_rate=0.36386287158866254, max_depth=6, n_estimators=384, total= 1.1min
[CV] gamma=0.20505197942665693, learning_rate=0.36386287158866254, max_depth=6, n_estimators=384 
[CV]  gamma=0.20505197942665693, learning_rate=0.36386287158866254, max_depth=6, n_estimators=384, total= 1.0min
[CV] gamma=0.20505197942665693, learning_rate=0.36386287158866254, max_depth=6, n_estimators=384 


## Model runs

We are going to first fit the model with 10% of the training data, but socred on the entire unsampled train data set.

In [28]:
lda_grid = BayesSearchCV(lda_pipeline, lda_rf_params, **bayes_cv_parameters)
ar.run_model(lda_grid, 'lda_rf_10_cnt_v_1', pre_models[0], 0.1)

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backen

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backen

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


In [32]:
ar.results

Unnamed: 0,mvp,lda_rf_10_cnt_v_1
Accuracy,0.229066,0.832942
Precision,0.009297,0.030742
Recall,0.682008,0.485356
F1 Score,0.018344,0.057822
AUC,0.45312,0.661004


In [35]:
pd.DataFrame(ar.models['lda_rf_10_cnt_v_1']['cv_results']).sort_values('mean_test_score', ascending=False)

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lda__learning_decay,param_lda__n_components,param_rf__max_depth,param_rf__n_estimators,params
9,0.857817,0.801573,0.82092,0.826771,0.023332,1,140.353823,12.70332,1.714957,0.212995,0.501815,17,4,276,"{'lda__learning_decay': 0.5018151536273716, 'l..."
0,0.769689,0.741657,0.752783,0.75471,0.011525,1,103.080245,2.314802,1.022536,0.134233,0.705052,16,5,295,"{'lda__learning_decay': 0.705051979426657, 'ld..."
8,0.712352,0.717764,0.720894,0.717003,0.003528,1,115.556525,0.50718,1.606475,0.273019,0.977739,16,5,325,"{'lda__learning_decay': 0.9777389931549642, 'l..."
3,0.70306,0.703625,0.718804,0.708496,0.007292,1,88.096165,0.242528,1.607255,0.129377,0.906198,8,4,441,"{'lda__learning_decay': 0.9061979941786817, 'l..."
4,0.69718,0.693569,0.704954,0.698567,0.00475,1,98.910181,0.18424,1.368057,0.14386,0.899777,12,4,415,"{'lda__learning_decay': 0.8997767208035865, 'l..."
6,0.667693,0.681364,0.672151,0.673736,0.005693,1,109.595652,1.697966,1.618853,0.542314,0.80854,17,3,462,"{'lda__learning_decay': 0.8085396792511581, 'l..."
7,0.663577,0.66933,0.669697,0.667534,0.002803,1,117.815482,1.137846,1.534261,0.58842,0.771702,19,3,451,"{'lda__learning_decay': 0.7717015338451563, 'l..."
1,0.649434,0.663032,0.662975,0.65848,0.006397,1,129.023766,0.599657,1.835318,0.305419,0.918694,18,3,485,"{'lda__learning_decay': 0.9186941777766422, 'l..."
5,0.639168,0.582403,0.648415,0.623327,0.029184,1,115.871979,0.3548,1.482222,0.088353,0.867014,19,2,257,"{'lda__learning_decay': 0.8670140089927842, 'l..."
2,0.594002,0.594373,0.599904,0.596093,0.002699,1,119.594627,0.944643,1.514466,0.534633,0.722416,19,2,330,"{'lda__learning_decay': 0.7224162561505759, 'l..."


In [36]:
import pickle

In [38]:
pickle.dump(ar.models, open('../data/model_pickles/model_20181106_1011.pkl', 'wb'))

In [46]:
lda_grid_bi = BayesSearchCV(lda_pipeline, lda_rf_params, **bayes_cv_parameters)
ar.run_model(lda_grid_bi, 'lda_rf_10_cnt_v_2', pre_models[1], 0.1)

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backen

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


KeyboardInterrupt: 

In [45]:
pre_models[1]

'cnt_v_2_gram_sm'