In [15]:
import numpy as np
import pandas as pd
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from mlxtend.classifier import StackingClassifier
from datetime import datetime
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_validate
import sklearn
from skmultilearn.ensemble import MajorityVotingClassifier
from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble

In [2]:
%load_ext autoreload
%autoreload 2

## Data formatting for classification

In [3]:
# Unpickle data 
with open('/Users/greenapple/project3/data/processed/house_bal.pkl', 'rb') as f:
    house_bal = pickle.load(f)

In [4]:
house_bal.head()

Unnamed: 0,video_id_list,y,y_name,0,1,2,3,4,5,6,...,630,631,632,633,634,635,636,637,638,639
5,b'--ZhevVpy1s',375,toothbrush,117,35,163,90,198,103,63,...,0,1,202,9,116,0,247,72,44,166
20,b'-2hQKCE-oTI',53,footsteps,30,162,44,7,216,116,206,...,213,109,50,88,19,46,54,154,42,211
28,b'-3pPrlCm6gg',198,clarinet,179,190,122,19,0,114,255,...,58,15,207,0,108,43,97,57,42,0
63,b'-70wVF5u-gg',366,chopping_food,0,114,186,34,87,250,58,...,0,122,144,63,110,255,139,138,59,154
82,b'-ASYwidRD7M',43,snoring,53,100,144,84,223,68,95,...,56,78,153,65,208,207,200,255,255,66


In [5]:
house_bal.shape

(45717, 643)

In [6]:
len(house_bal.y_name.value_counts())

30

In [7]:
house_bal.y_name.value_counts()

speech            4042
music             3781
laughter          3772
snoring           3370
vacuum_cleaner    3054
typing            2644
dishes_pots       2560
frying_food       2102
blender           1884
toilet_flush      1882
door              1868
whoop             1736
footsteps         1492
baby_cry          1414
screeming         1116
whispering         972
clarinet           960
crying             918
microwave          894
television         866
hair_dryer         772
video_games        592
shaving            552
bathtab            472
water_tap          458
chopping_food      410
meow               388
dog                358
purr               304
toothbrush          84
Name: y_name, dtype: int64

In [8]:
two_class_list = [
    'footsteps',
    'purr'
]

In [9]:
house_2_classes = house_bal.loc[house_bal.y_name.isin(two_class_list)]

In [10]:
house_2_classes.shape

(1796, 643)

In [11]:
house_2_classes.head()

Unnamed: 0,video_id_list,y,y_name,0,1,2,3,4,5,6,...,630,631,632,633,634,635,636,637,638,639
20,b'-2hQKCE-oTI',53,footsteps,30,162,44,7,216,116,206,...,213,109,50,88,19,46,54,154,42,211
138,b'-G_hnfp4a0M',53,footsteps,141,93,96,107,139,0,220,...,250,104,46,0,98,12,234,61,81,133
150,b'-IWlQN6cfe4',53,footsteps,78,146,188,30,200,44,120,...,130,96,33,255,83,228,123,120,58,236
281,b'-blH_CYo09w',53,footsteps,190,217,227,113,96,140,43,...,255,255,163,45,183,31,39,0,54,196
562,b'0EsJvIfMx0w',53,footsteps,50,106,202,91,186,156,46,...,42,4,58,140,214,143,47,100,14,255


In [12]:
# Assign features X and target y
X = house_2_classes[house_2_classes.columns[3:643]]
y = house_2_classes.y_name

In [13]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1436, 640), (1436,), (360, 640), (360,))

In [14]:
# Look up attributes
RandomOverSampler('self').get_params().keys()

dict_keys(['random_state', 'ratio', 'return_indices', 'sampling_strategy'])

## Build and evaluate models

### Logistic regression

In [21]:
model_fldr = '/Users/greenapple/project3/models/'

In [17]:
# Train and fit logistic regression with  RandomizedSearchCV:  1st set of parameteres
from src.models import model_sel_rand_search

# Function arguments:
model = LogisticRegression(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
solver = ['liblinear', 'saga']

param_distributions = dict(
        model__C = C, 
        model__penalty = penalty,
        model__solver=solver)

# Call parameter selection function
logreg1_2_cls_scores_params, logreg1_2_cls_model = model_sel_rand_search.train_fit_time(model,
                                                                               param_distributions,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)
# Pickle results
with open(os.path.join(model_fldr, 'logreg1_2_cls_scores_params_rand.pkl'), 'wb') as f:
    pickle.dump(logreg1_2_cls_scores_params, f)
    
# Pickle model
with open(os.path.join(model_fldr, 'logreg1_2_cls_model_rand.pkl'), 'wb') as f:
    pickle.dump(logreg1_2_cls_model, f)



In [24]:
# Load parameteres and scores
pickling_out = open(os.path.join(model_fldr, 'logreg1_2_cls_scores_params_rand.pkl'), 'rb')
logreg1_2_cls_scores_params = pickle.load(pickling_out)
logreg1_2_cls_scores_params

{'best_train_score': 0.9178272980501393,
 'best_test_score': 0.9055555555555556,
 'time_sec': 28.675217,
 'time_best_fit_sec': 1.0755522,
 'best_params': {'model__solver': 'saga',
  'model__penalty': 'l1',
  'model__C': 1.0},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
                                     fit_intercept=True, intercept_scaling=1,
                                     l1_ratio=None, max_iter=100,
                                     multi_class='warn', n_jobs=None,
                                     penalty='l1', random_state=3, solver='saga',
                                     tol=0.0001, verbose=0, warm_start=False))],
          verbose=False),
 'bes

In [25]:
# Load best model
pickling_out = open(os.path.join(model_fldr, 'logreg1_2_cls_model_rand.pkl'), 'rb')
logreg1_2_cls_model = pickle.load(pickling_out)
logreg1_2_cls_model

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              LogisticRegression(C=1.0,
                                                                 class_weight=None,
                                                                 dual=False,
                                                                 fit_intercept=True,
                                                                 intercept_scaling=1,
                                 

In [208]:
# Train and fit logistic regression with  RandomizedSearchCV: 2nd set of parameteres
from src.models import model_sel_rand_search

# Function arguments:
model = LogisticRegression(random_state=3, multi_class='multinomial', solver='saga')
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
penalty = ['elasticnet']
C = np.logspace(0, 4, 10)
solver = 'saga'
multiclass='multinomial'
l1_1ratio = list(np.arange(0, 1, 0.1))

param_distributions = dict(
        model__C = C, 
        model__penalty = penalty,
        model__l1_ratio=l1_ratio)


# Call parameter selection function
logreg2_2_cls_scores_params, logreg2_2_cls_model = model_sel_rand_search.train_fit_time(model,
                                                                               param_distributions,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)
# Pickle results
with open('/Users/greenapple/project3/models/logreg2_2_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(logreg2_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/logreg2_2_cls_model_rand.pkl', 'wb') as f:
    pickle.dump(logreg2_2_cls_model, f)



{'best_train_score': 0.9143454038997214,
 'best_test_score': 0.9055555555555556,
 'time_sec': 65.068219,
 'time_best_fit_sec': 1.7317128,
 'best_params': {'model__penalty': 'elasticnet',
  'model__l1_ratio': 0.7000000000000001,
  'model__C': 21.544346900318832},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  LogisticRegression(C=21.544346900318832, class_weight=None,
                                     dual=False, fit_intercept=True,
                                     intercept_scaling=1,
                                     l1_ratio=0.7000000000000001, max_iter=100,
                                     multi_class='multinomial', n_jobs=None,
                                     penalty='elasticnet', random_state=3,
         

In [212]:
# Train and fit logistic regression with  RandomizedSearchCV: 3rd set of parameteres
from src.models import model_sel_rand_search

# Function arguments:
model = LogisticRegression(random_state=3, multi_class='multinomial')
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
penalty = ['l2']
C = np.logspace(0, 4, 10)
solver = ['sag', 'lbfgs', 'newton-cg']

param_distributions = dict(
        model__C = C, 
        model__penalty = penalty,
        model__solver=solver
)

# Call parameter selection function
logreg3_2_cls_scores_params, logreg3_2_cls_model = model_sel_rand_search.train_fit_time(model,
                                                                               param_distributions,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)
# Pickle results
with open('/Users/greenapple/project3/models/logreg3_2_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(logreg3_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/logreg3_2_cls_model_rand.pkl', 'wb') as f:
    pickle.dump(logreg3_2_cls_model, f)  



{'best_train_score': 0.9157381615598886,
 'best_test_score': 0.9083333333333333,
 'time_sec': 59.726693,
 'time_best_fit_sec': 0.6185069999999999,
 'best_params': {'model__solver': 'lbfgs',
  'model__penalty': 'l2',
  'model__C': 1291.5496650148827},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  LogisticRegression(C=1291.5496650148827, class_weight=None,
                                     dual=False, fit_intercept=True,
                                     intercept_scaling=1, l1_ratio=None,
                                     max_iter=100, multi_class='multinomial',
                                     n_jobs=None, penalty='l2', random_state=3,
                                     solver='lbfgs', tol=0.0001, verbose=0,
    

In [73]:
# Unpickle results
unpicking_out = open('/Users/greenapple/project3/models/logreg_2_cls_scores_params_rand.pkl', 'rb')
logreg_2_cls_scores_params_rand = pickle.load(unpicking_out)
logreg_2_cls_scores_params_rand

{'best_train_score': 0.9136490250696379,
 'best_test_score': 0.8944444444444444,
 'time_sec': 11.059135,
 'time_best_fit_sec': 0.2037918,
 'best_params': {'model__penalty': 'l1', 'model__C': 3593.813663804626},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  LogisticRegression(C=3593.813663804626, class_weight=None,
                                     dual=False, fit_intercept=True,
                                     intercept_scaling=1, l1_ratio=None,
                                     max_iter=100, multi_class='warn',
                                     n_jobs=None, penalty='l1', random_state=3,
                                     solver='warn', tol=0.0001, verbose=0,
                                     warm_start=False

In [80]:
# Unpickle results
unpicking_out = open('/Users/greenapple/project3/models/logreg_2_cls_model_rand.pkl', 'rb')
logreg_2_cls_model = pickle.load(unpicking_out)
logreg_2_cls_model

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              LogisticRegression(C=1.0,
                                                                 class_weight=None,
                                                                 dual=False,
                                                                 fit_intercept=True,
                                                                 intercept_scaling=1,
                                 

In [None]:
# Pick best logreg model

### K-Nearest Neighbors

In [46]:
# Train and fit KNN with  RandomizedSearchCV
from src.models import model_sel_rand_search

# Function arguments:
model = KNeighborsClassifier()
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
k_range = list(range(1, 10, 1))

param_grid = dict(model__n_neighbors=k_range)

# Call parameter selection function
KNN_2_cls_scores_params, KNN_2_cls_models = model_sel_rand_search.train_fit_time(model,
                                                                               param_grid,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)

# Pickle results
with open('/Users/greenapple/project3/models/KNN_2_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(KNN_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/KNN_2_cls_models_rand.pkl', 'wb') as f:
    pickle.dump(KNN_2_cls_models, f)



In [74]:
# Unpickle results
unpicking_out = open('/Users/greenapple/project3/models/KNN_2_cls_scores_params_rand.pkl', 'rb')
KNN_2_cls_scores_params_rand = pickle.load(unpicking_out)
KNN_2_cls_scores_params_rand

{'best_train_score': 0.903899721448468,
 'best_test_score': 0.9,
 'time_sec': 14.115907,
 'time_best_fit_sec': 0.0217182,
 'best_params': {'model__n_neighbors': 2},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                       metric='minkowski', metric_params=None,
                                       n_jobs=None, n_neighbors=2, p=2,
                                       weights='uniform'))],
          verbose=False),
 'best_test_proba': array([[0. , 1. ],
        [1. , 0. ],
        [1. , 0. ],
        [1. , 0. ],
        [1. , 0. ],
        [1. , 0. ],
        [1. , 0. ],
        [1. , 0. ],
        [1. , 0. ],
        [0. , 1. ],
        [1. , 0.

In [84]:
# Unpickle results
unpicking_out = open('/Users/greenapple/project3/models/KNN_2_cls_models_rand.pkl', 'rb')
KNN_2_cls_model_rand = pickle.load(unpicking_out)
KNN_2_cls_model_rand

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              KNeighborsClassifier(algorithm='auto',
                                                                   leaf_size=30,
                                                                   metric='minkowski',
                                                                   metric_params=None,
                                                                   n_jobs=None,
                 

### Naive Bayes MultiNomial

In [50]:
# Train and fit naive Bayes MultiNomial with RandomizedSearchCV
from src.models import model_sel_rand_search

# Function arguments:
model = MultinomialNB()
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
alphas = [1, 10, 100]
# Selects the min alpha. Keep alpha = 1 to make sure the model can take data it has not seen before 
# from the test set.

param_grid = dict(model__alpha=alphas)

# Call parameter selection function
NBmultinomial_2_cls_scores_params, NBmultinomial_2_cls_models = model_sel_rand_search.train_fit_time(model,
                                                                               param_grid,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)

# Pickle results
with open('/Users/greenapple/project3/models/NBmultinomial_2_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(NBmultinomial_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/NBmultinomial_2_cls_models_rand.pkl', 'wb') as f:
    pickle.dump(NBmultinomial_2_cls_models, f)



In [123]:
# Unpickle results
unpicking_out = open('/Users/greenapple/project3/models/NBmultinomial_2_cls_scores_params_rand.pkl', 'rb')
NBmultinomial_2_cls_scores_params_rand = pickle.load(unpicking_out)
NBmultinomial_2_cls_scores_params_rand

{'best_train_score': 0.8704735376044568,
 'best_test_score': 0.8916666666666667,
 'time_sec': 4.496852,
 'time_best_fit_sec': 0.0111012,
 'best_params': {'model__alpha': 1},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  MultinomialNB(alpha=1, class_prior=None, fit_prior=True))],
          verbose=False),
 'best_test_proba': array([[0.00000000e+000, 1.00000000e+000],
        [5.00000000e-001, 5.00000000e-001],
        [1.00000000e+000, 8.63638130e-095],
        [1.00000000e+000, 1.25560528e-063],
        [1.00000000e+000, 0.00000000e+000],
        [1.00000000e+000, 0.00000000e+000],
        [1.00000000e+000, 0.00000000e+000],
        [1.00000000e+000, 0.00000000e+000],
        [1.00000000e+000, 0.00000000e+000],
        [0.00000

In [124]:
# Unpickle best model
unpicking_out = open('/Users/greenapple/project3/models/NBmultinomial_2_cls_models_rand.pkl', 'rb')
NBmultinomial_2_cls_models = pickle.load(unpicking_out)
NBmultinomial_2_cls_models

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              MultinomialNB(alpha=1.0,
                                                            class_prior=None,
                                                            fit_prior=True))],
                                      verbose=False),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'model__alpha': [1, 10, 100]},
                   pre_dispatch

### Support Vector Machines

In [61]:
# Train and fit SVC with RandomizedSearchCV
from src.models import model_sel_rand_search

# Function arguments:
model = SVC(probability=True)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]
gamma = [0.01, 0.1, 1, 10]
degree = [2, 3, 4]

param_grid = dict(model__C=C,
                 model__kernel=kernel,
                 model__gamma=gamma,
                 model__degree=degree
                 )

# Call parameter selection function
SVC_2_cls_scores_params, SVC_2_cls_model = model_sel_rand_search.train_fit_time(model,
                                                                               param_grid,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)

# Pickle results
with open('/Users/greenapple/project3/models/SVC_2_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(SVC_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/SVC_2_cls_model_rand.pkl', 'wb') as f:
    pickle.dump(SVC_2_cls_model, f)

In [76]:
# Unpickle results
unpicking_out = open('/Users/greenapple/project3/models/SVC_2_cls_scores_params_rand.pkl', 'rb')
SVC_2_cls_scores_params_rand = pickle.load(unpicking_out)
SVC_2_cls_scores_params_rand

{'best_train_score': 0.9282729805013927,
 'best_test_score': 0.9194444444444444,
 'time_sec': 179.059009,
 'time_best_fit_sec': 7.195515,
 'best_params': {'model__kernel': 'poly',
  'model__gamma': 0.01,
  'model__degree': 4,
  'model__C': 1},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
                      decision_function_shape='ovr', degree=4, gamma=0.01,
                      kernel='poly', max_iter=-1, probability=True,
                      random_state=None, shrinking=True, tol=0.001,
                      verbose=False))],
          verbose=False),
 'best_test_proba': array([[1.85874190e-01, 8.14125810e-01],
        [9.99982674e-01, 1.73259139e-05],
        [9.9

In [151]:
# Unpickle model
unpicking_out = open('/Users/greenapple/project3/models/SVC_2_cls_model_rand.pkl', 'rb')
SVC_2_cls_model = pickle.load(unpicking_out)
SVC_2_cls_model

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              SVC(C=1.0, cache_size=200,
                                                  class_weight=None, coef0=0.0,
                                                  decision_function_shape='ovr',
                                                  degree=3,
                                                  gamma='auto_deprecated',
                                                  kernel='rbf', max_

### Random Forest

In [63]:
# Train and fit Random Forest
from src.models import model_sel_rand_search

# Function arguments:
model = RandomForestClassifier(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
n_estimators = [10, 50, 100, 300, 500]
criterion = ['gini', 'entropy']
max_depth = [5, 10, 50, 100, None]
min_samples_split = [2, 5, 10, 50]
max_features = [5, 10, 25, 50, 100]
bootstrap = [True, False]

                
param_grid = dict(model__n_estimators=n_estimators,
                  model__criterion=criterion,
                  model__max_depth=max_depth,
                  model__min_samples_split=min_samples_split,
                  model__max_features=max_features,
                  model__bootstrap=bootstrap
                 )

# Call parameter selection function
RF_2_cls_scores_params, RF_2_cls_model = model_sel_rand_search.train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)

# Pickle results
with open('/Users/greenapple/project3/models/RF_2_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(RF_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/RF_2_cls_model_rand.pkl', 'wb') as f:
    pickle.dump(RF_2_cls_model, f)

In [77]:
# Unpickle results
unpicking_out = open('/Users/greenapple/project3/models/RF_2_cls_scores_params_rand.pkl', 'rb')
RF_2_cls_scores_params_rand = pickle.load(unpicking_out)
RF_2_cls_scores_params_rand

{'best_train_score': 0.9115598885793872,
 'best_test_score': 0.9222222222222223,
 'time_sec': 50.344088,
 'time_best_fit_sec': 1.3690788,
 'best_params': {'model__n_estimators': 300,
  'model__min_samples_split': 10,
  'model__max_features': 10,
  'model__max_depth': 10,
  'model__criterion': 'gini',
  'model__bootstrap': True},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  RandomForestClassifier(bootstrap=True, class_weight=None,
                                         criterion='gini', max_depth=10,
                                         max_features=10, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                

In [None]:
# Unpickle model
unpicking_out = open('/Users/greenapple/project3/models/RF_2_cls_model_rand.pkl', 'rb')
RF_2_cls_model = pickle.load(unpicking_out)
RF_2_cls_model

### Gradient Boosting Classifier

In [65]:
# Train and fit Gradient Boosting
from src.models import model_sel_rand_search

# Function arguments:
model = GradientBoostingClassifier(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
n_estimators = [50, 100, 300, 500]
max_depth = [5, 10, 50, 100, None]
min_samples_split = [2, 5, 10, 20, 50]
max_features = [5, 10, 25, 50, 100]


param_grid = dict(model__n_estimators=n_estimators,
                  model__max_depth=max_depth,
                  model__min_samples_split=min_samples_split,
                  model__max_features=max_features,
                 )

# Call parameter selection function
GBM_2_cls_scores_params, GBM_2_cls_model = model_sel_rand_search.train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)

# Pickle results
with open('/Users/greenapple/project3/models/GBM_2_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(GBM_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/GBM_2_cls_model_rand.pkl', 'wb') as f:
    pickle.dump(GBM_2_cls_model, f)

In [78]:
# Unpickle results
unpicking_out = open('/Users/greenapple/project3/models/GBM_2_cls_scores_params_rand.pkl', 'rb')
GBM_2_cls_scores_params_rand = pickle.load(unpicking_out)
GBM_2_cls_scores_params_rand

{'best_train_score': 0.9220055710306406,
 'best_test_score': 0.9333333333333333,
 'time_sec': 64.064161,
 'time_best_fit_sec': 2.9993064,
 'best_params': {'model__n_estimators': 500,
  'model__min_samples_split': 20,
  'model__max_features': 50,
  'model__max_depth': 5},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss='deviance',
                                             max_depth=5, max_features=50,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_spl

In [144]:
# Unpickle model
unpicking_out = open('/Users/greenapple/project3/models/GBM_2_cls_model_rand.pkl', 'rb')
GBM_2_cls_model = pickle.load(unpicking_out)
GBM_2_cls_model

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              GradientBoostingClassifier(criterion='friedman_mse',
                                                                         init=None,
                                                                         learning_rate=0.1,
                                                                         loss='deviance',
                                                                        

### Dummy classifier

In [67]:
# Train and fit Dummy classifier
from src.models import model_sel_rand_search

# Function arguments:
model = DummyClassifier(random_state=3, strategy='stratified')
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params

param_grid = dict()

# Call parameter selection function
Dummy_2_cls_scores_params, Dummy_2_cls_model = model_sel_rand_search.train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)

# Pickle results
with open('/Users/greenapple/project3/models/Dummy_2_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(Dummy_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/Dummy_2_cls_model_rand.pkl', 'wb') as f:
    pickle.dump(Dummy_2_cls_model, f)



In [68]:
Dummy_2_cls_scores_params

{'best_train_score': 0.4742339832869081,
 'best_test_score': 0.4888888888888889,
 'time_sec': 4.429952,
 'time_best_fit_sec': 0.0040272,
 'best_params': {},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  DummyClassifier(constant=None, random_state=3,
                                  strategy='stratified'))],
          verbose=False),
 'best_test_proba': array([[1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
    

In [69]:
# Dummy classifier
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
f1_dummy = f1_score(y_test, dummy.predict(X_test), average='micro')
accuracy_dummy = accuracy_score(y_test, dummy.predict(X_test))

In [70]:
print('Dummy classifier F1 score: ', f1_dummy)
print('Dummy classifier accuracy score: ', accuracy_dummy)

Dummy classifier F1 score:  0.6805555555555556
Dummy classifier accuracy score:  0.675


### Ensembling

In [None]:
%pylab inline
%config InlineBackend.figure_formats = ['retina']

In [161]:
model_list = [
    ('logreg', logreg_2_cls_model),
    ('KNN', KNN_2_cls_models_rand),
    ('NBmultinomial', NBmultinomial_2_cls_models),
    ('SVC', SVC_2_cls_model),
    ('RF', RF_2_cls_model),
    ('GBM', GBM_2_cls_model)
]

In [159]:
# Max voting classifier

# Max voting classifier
average_voting_classifer = VotingClassifier(estimators=model_list,
                                    voting='hard',
                                    n_jobs=-1)

f1_train = cross_val_score(max_voting_classifer, 
            X_train, y_train, scoring='f1_micro', cv=5).mean()

max_voting_classifer.fit(X_train, y_train)
y_hat = max_voting_classifer.predict(X_test) 

f1_test = f1_score(y_test, y_hat, average='micro')
# f1_test_s = f1_score(y_test, y_hat, average='samples')

In [160]:
f1_train, f1_test

(0.9234030197444831, 0.9138888888888888)

In [162]:
# Average voting classifier
average_voting_classifer = VotingClassifier(estimators=model_list,
                                    voting='soft',
                                    n_jobs=-1)

f1_train = cross_val_score(max_voting_classifer, 
            X_train, y_train, scoring='f1_micro', cv=5).mean()

max_voting_classifer.fit(X_train, y_train)
y_hat = max_voting_classifer.predict(X_test) 

f1_test = f1_score(y_test, y_hat, average='micro')
# f1_test_s = f1_score(y_test, y_hat, average='samples')

In [163]:
f1_train, f1_test

(0.9234030197444831, 0.9138888888888888)

In [164]:
# Stacked classifier
model = logreg_2_cls_model

stacked_classifier = StackingClassifier(classifiers=model_list, 
                                        meta_classifier=model, 
                                        use_probas=False)


f1_train = cross_val_score(max_voting_classifer, 
            X_train, y_train, scoring='f1_micro', cv=5).mean()

max_voting_classifer.fit(X_train, y_train)
y_hat = max_voting_classifer.predict(X_test) 

f1_test = f1_score(y_test, y_hat, average='micro')
# f1_test_s = f1_score(y_test, y_hat, average='samples')

In [165]:
f1_train, f1_test

(0.9227061556329849, 0.9138888888888888)

In [182]:
# Convert list of tuples onto a dictionary
classifier_list = [x[1] for x in model_list]
classifier_names = [x[0] for x in model_list]
classifier_dict = dict(zip(classifier_names, classifier_list))

In [183]:
for name, classifier in classifier_dict.items():
    
    # Stacked classifier
    model = classifier

    stacked_classifier = StackingClassifier(classifiers=model_list, 
                                        meta_classifier=model, 
                                        use_probas=False)


    f1_train = cross_val_score(max_voting_classifer, 
            X_train, y_train, scoring='f1_micro', cv=5).mean()

    max_voting_classifer.fit(X_train, y_train)
    y_hat = max_voting_classifer.predict(X_test) 

    f1_test = f1_score(y_test, y_hat, average='micro')
    # f1_test_s = f1_score(y_test, y_hat, average='samples')
    
    print(name, f1_train, f1_test)

logreg 0.9227061556329849 0.9138888888888888
KNN 0.9227061556329849 0.9138888888888888
NBmultinomial 0.9227061556329849 0.9138888888888888
SVC 0.9227061556329849 0.9138888888888888
RF 0.9234030197444831 0.9138888888888888
GBM 0.9227061556329849 0.9138888888888888


### Null accuracy

In [93]:
y_test.value_counts()

footsteps    291
purr          69
Name: y_name, dtype: int64

In [94]:
null_accuracy = y_test.value_counts().head(1) / len(y_test)
print('Null accuracy: ', null_accuracy)

Null accuracy:  footsteps    0.808333
Name: y_name, dtype: float64


## Visualize scores

### Model summary

In [131]:
model_score_table = pd.DataFrame(columns=['Models', 'F1_score_train'])
model_score_table

Unnamed: 0,Models,F1_score_train


In [132]:
model_score_table['Models'] = ['Logistis_Regression',
                              'KNN',
                              'Multinomial_NB',
                              'SVC_poly',
                              'Random_Forest',
                               'GBM',
                               'Dummy'
                              ]

In [139]:
model_score_table['F1_score_train'] = trainin

In [140]:
model_score_table

Unnamed: 0,Models,F1_score_train
0,Logistis_Regression,0.914345
1,KNN,0.9039
2,Multinomial_NB,0.870474
3,SVC_poly,0.926184
4,Random_Forest,0.916435
5,GBM,0.924095
6,Dummy,0.474234


In [141]:
# Pickle results
with open('/Users/greenapple/project3/reports/figures/model_score_table_MVP.pkl', 'wb') as f:
    pickle.dump(model_score_table, f)