In [1]:
import numpy as np
import pandas as pd
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from mlxtend.classifier import StackingClassifier
from datetime import datetime
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_validate
import sklearn
from skmultilearn.ensemble import MajorityVotingClassifier
from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
!pip install --editable ..

Obtaining file:///Users/greenapple/project3
Installing collected packages: src
  Found existing installation: src 0.1.0
    Uninstalling src-0.1.0:
      Successfully uninstalled src-0.1.0
  Running setup.py develop for src
Successfully installed src


## Data formatting for classification

In [4]:
# Unpickle data 
with open('/users/greenapple/project3/data/processed/house_bal.pkl', 'rb') as f:
    house_bal = pickle.load(f)

In [5]:
house_bal.head()

Unnamed: 0,video_id_list,y,y_name,0,1,2,3,4,5,6,...,630,631,632,633,634,635,636,637,638,639
5,b'--ZhevVpy1s',375,toothbrush,117,35,163,90,198,103,63,...,0,1,202,9,116,0,247,72,44,166
20,b'-2hQKCE-oTI',53,footsteps,30,162,44,7,216,116,206,...,213,109,50,88,19,46,54,154,42,211
28,b'-3pPrlCm6gg',198,clarinet,179,190,122,19,0,114,255,...,58,15,207,0,108,43,97,57,42,0
63,b'-70wVF5u-gg',366,chopping_food,0,114,186,34,87,250,58,...,0,122,144,63,110,255,139,138,59,154
82,b'-ASYwidRD7M',43,snoring,53,100,144,84,223,68,95,...,56,78,153,65,208,207,200,255,255,66


In [6]:
house_bal.shape

(45717, 643)

In [7]:
len(house_bal.y_name.value_counts())

30

In [8]:
house_bal.y_name.value_counts()

speech            4042
music             3781
laughter          3772
snoring           3370
vacuum_cleaner    3054
typing            2644
dishes_pots       2560
frying_food       2102
blender           1884
toilet_flush      1882
door              1868
whoop             1736
footsteps         1492
baby_cry          1414
screeming         1116
whispering         972
clarinet           960
crying             918
microwave          894
television         866
hair_dryer         772
video_games        592
shaving            552
bathtab            472
water_tap          458
chopping_food      410
meow               388
dog                358
purr               304
toothbrush          84
Name: y_name, dtype: int64

In [9]:
ten_class_list = [
    'speech',
    'music',
    'clarinet',
    'water_tap',
    'footsteps',
    'microwave',
    'door',
    'blender',
    'vacuum_cleaner',
    'meow'
]

In [10]:
house_10_classes = house_bal.loc[house_bal.y_name.isin(ten_class_list)]

In [11]:
house_10_classes.y_name.value_counts()

speech            4042
music             3781
vacuum_cleaner    3054
blender           1884
door              1868
footsteps         1492
clarinet           960
microwave          894
water_tap          458
meow               388
Name: y_name, dtype: int64

In [12]:
# Assign features X and target y
X = house_10_classes[house_10_classes.columns[3:643]]
y = house_10_classes.y_name

In [13]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((15056, 640), (15056,), (3765, 640), (3765,))

In [14]:
import numpy as np
import pandas as pd
import pickle
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.svm import SVC

from sklearn.metrics import f1_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from imblearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import sklearn
from datetime import datetime

from src.models.time_best_fit import time_best_fit

# Function for finding the best parameters
def train_fit_time(model,
                   param_distributions,
                   transformer,
                   X_train,
                   X_test,
                   y_train,
                   y_test,
                   CV):

    '''Function searches for the best model paramenters through a cross-validation on the train set
    and returns results of test set fit.

    Args:
        model - supervised learning classifier
        param_grid (dict) - set of model specific parameters
        transformer - data transformer
        X_train (DataFrame) - training data set
        y_train (DataFrame) - training target set
        X_test (DataFrame) - test data set
        y_test (DataFrame) - test traget set
        CV (int) - number of cross-validation folds

    Returns:
        dictionary with the following keys:
            train_score
            test_score
            time (sec) - how long this function runs for
            time_best_fit (sec)
            params
            estimator
            test_proba
            y_hat
            all_scores
        model
    '''
    # Time the function: record start time
    start_time = datetime.now()

    pipe = Pipeline([('transformer', transformer), ('model', model)])

    # Parameter grid
    grid = RandomizedSearchCV(pipe, param_distributions, cv=CV, scoring='f1_micro', refit=True, n_jobs=-1)
    grid.fit(X_train, y_train)

    # Time the function: record end time
    end_time = datetime.now()
    time = (end_time - start_time).total_seconds()

    # Parameters and scores
    params_scores_pred = {
        'best_train_score':grid.best_score_,
        'best_test_score': grid.score(X_test, y_test),
        'time_sec': time,
        'time_best_fit_sec': time_best_fit(grid, X_train, y_train),
        'best_params':grid.best_params_,
        'best_estimator':grid.best_estimator_,
        'best_test_proba': grid.predict_proba(X_test),
        'best_y_hat': grid.predict(X_test),
        'all_scores': grid.cv_results_

    }

    return params_scores_pred, grid


# Function for estimating the run time for the best fit
def time_best_fit(grid, X_train, y_train):

    '''Function returns the time of fit for the best parameters found by GridSearchCV.

    Args:
       best model from train_fit_time() - grid.

    Returns:
       time of fit in seconds.
    '''

    # Time the function: record start time
    start_time = datetime.now()

    pipe = grid.best_estimator_

    # Time the function: record start time
    start_time = datetime.now()

    # Fit
    pipe.fit(X_train, y_train)

    # Time the function: record end time
    end_time = datetime.now()
    fit_time = (end_time - start_time).total_seconds()

    if int(fit_time) < 5.0:
        time = 0
        for i in range(4):
            start_time_ = datetime.now()
            pipe.fit(X_train, y_train)
            start_time_ = datetime.now()
            time = (end_time - start_time).total_seconds()
            time += time
        fit_time = (time + fit_time)/5

    return fit_time


## Build and evaluate models

### Logistic regression

In [15]:
model_fldr = '/home/ubuntu/project3_aws/models/'

In [15]:
# Train and fit logistic regression with  RandomizedSearchCV:  1st set of parameteres
# from src.models import model_sel_rand_search

# Function arguments:
model = LogisticRegression(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
solver = ['liblinear', 'saga']

param_distributions = dict(
        model__C = C, 
        model__penalty = penalty,
        model__solver=solver)

# Call parameter selection function
logreg1_10_cls_scores_params, logreg1_10_cls_model = train_fit_time(model,
                                                                               param_distributions,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)
# Pickle results
with open(os.path.join(model_fldr,'logreg1_10_cls_scores_params_rand.pkl'), 'wb') as f:
    pickle.dump(logreg1_10_cls_scores_params, f)
    
# Pickle model
with open(os.path.join(model_fldr,'logreg1_10_cls_model_rand.pkl'), 'wb') as f:
    pickle.dump(logreg1_10_cls_model, f)



In [19]:
# Load parameters and scores
pickling_out = open(os.path.join(model_fldr, 'logreg1_10_cls_scores_params_rand.pkl'), 'rb')
logreg1_10_cls_scores_params = pickle.load(pickling_out)

In [20]:
logreg1_10_cls_scores_params

{'best_train_score': 0.6857066950053134,
 'best_test_score': 0.6783532536520585,
 'time_sec': 3817.727388,
 'time_best_fit_sec': 205.980575,
 'best_params': {'model__solver': 'saga',
  'model__penalty': 'l1',
  'model__C': 1.0},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
                                     fit_intercept=True, intercept_scaling=1,
                                     l1_ratio=None, max_iter=100,
                                     multi_class='warn', n_jobs=None,
                                     penalty='l1', random_state=3, solver='saga',
                                     tol=0.0001, verbose=0, warm_start=False))],
          verbose=False),
 '

In [None]:
# Load best model
pickling_out = open(os.path.join(model_fldr, 'logreg1_10_cls_model_rand.pkl'), 'rb')
logreg1_10_cls_model = pickle.load(pickling_out)
logreg1_10_cls_model

In [26]:
# Train and fit logistic regression with  RandomizedSearchCV: 2nd set of parameteres
# from src.models import model_sel_rand_search

# Function arguments:
model = LogisticRegression(random_state=3, multi_class='multinomial', solver='saga')
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
penalty = ['elasticnet']
C = np.logspace(0, 4, 10)
solver = 'saga'
multiclass='multinomial'
l1_ratio = list(np.arange(0, 1, 0.1))

param_distributions = dict(
        model__C = C, 
        model__penalty = penalty,
        model__l1_ratio=l1_ratio)


# Call parameter selection function
logreg2_10_cls_scores_params, logreg2_10_cls_model = train_fit_time(model,
                                                                               param_distributions,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)
# Pickle results
with open(os.path.join(model_fldr, 'logreg2_10_cls_scores_params_rand.pkl'), 'wb') as f:
    pickle.dump(logreg2_10_cls_scores_params, f)
    
# Pickle model
with open(os.path.join(model_fldr, 'logreg2_10_cls_model_rand.pkl'), 'wb') as f:
    pickle.dump(logreg2_10_cls_model, f)



In [27]:
# Load parameters and scores
pickling_out = open(os.path.join(model_fldr, 'logreg2_10_cls_scores_params_rand.pkl'), 'rb')
logreg2_10_cls_scores_params = pickle.load(pickling_out)
logreg2_10_cls_scores_params

{'best_train_score': 0.6667773645058448,
 'best_test_score': 0.6586985391766268,
 'time_sec': 1360.105574,
 'time_best_fit_sec': 183.314682,
 'best_params': {'model__penalty': 'elasticnet',
  'model__l1_ratio': 0.2,
  'model__C': 464.15888336127773},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  LogisticRegression(C=464.15888336127773, class_weight=None,
                                     dual=False, fit_intercept=True,
                                     intercept_scaling=1, l1_ratio=0.2,
                                     max_iter=100, multi_class='multinomial',
                                     n_jobs=None, penalty='elasticnet',
                                     random_state=3, solver='saga', tol=0.0001,
         

In [28]:
# Load best model
pickling_out = open(os.path.join(model_fldr, 'logreg2_10_cls_model_rand.pkl'), 'rb')
logreg2_10_cls_model = pickle.load(pickling_out)
logreg2_10_cls_model

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              LogisticRegression(C=1.0,
                                                                 class_weight=None,
                                                                 dual=False,
                                                                 fit_intercept=True,
                                                                 intercept_scaling=1,
                                 

In [29]:
# Train and fit logistic regression with  RandomizedSearchCV: 3rd set of parameteres
# from src.models import model_sel_rand_search

# Function arguments:
model = LogisticRegression(random_state=3, multi_class='multinomial')
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
penalty = ['l2']
C = np.logspace(0, 4, 10)
solver = ['sag', 'lbfgs', 'newton-cg']

param_distributions = dict(
        model__C = C, 
        model__penalty = penalty,
        model__solver=solver
)

# Call parameter selection function
logreg3_10_cls_scores_params, logreg3_10_cls_model = train_fit_time(model,
                                                                               param_distributions,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)
# Pickle results
with open(os.path.join(model_fldr,'logreg3_10_cls_scores_params_rand.pkl'), 'wb') as f:
    pickle.dump(logreg3_10_cls_scores_params, f)
    
# Pickle model
with open(os.path.join(model_fldr, 'logreg3_10_cls_model_rand.pkl'), 'wb') as f:
    pickle.dump(logreg3_10_cls_model, f)  



In [30]:
# Unpickle results
unpicking_out = open(os.path.join(model_fldr, 'logreg3_10_cls_scores_params_rand.pkl'), 'rb')
logreg3_10_cls_scores_params = pickle.load(unpicking_out)
logreg3_10_cls_scores_params

{'best_train_score': 0.6875664187035069,
 'best_test_score': 0.6799468791500664,
 'time_sec': 686.810264,
 'time_best_fit_sec': 1.8927684,
 'best_params': {'model__solver': 'lbfgs',
  'model__penalty': 'l2',
  'model__C': 7.742636826811269},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  LogisticRegression(C=7.742636826811269, class_weight=None,
                                     dual=False, fit_intercept=True,
                                     intercept_scaling=1, l1_ratio=None,
                                     max_iter=100, multi_class='multinomial',
                                     n_jobs=None, penalty='l2', random_state=3,
                                     solver='lbfgs', tol=0.0001, verbose=0,
              

In [31]:
# Unpickle best model
unpicking_out = open(os.path.join(model_fldr, 'logreg3_10_cls_model_rand.pkl'), 'rb')
logreg3_10_cls_model = pickle.load(unpicking_out)
logreg3_10_cls_model

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              LogisticRegression(C=1.0,
                                                                 class_weight=None,
                                                                 dual=False,
                                                                 fit_intercept=True,
                                                                 intercept_scaling=1,
                                 

### Summary: logistic regression classifiers

In [None]:
# Load model scores

model_fldr = '/Users/greenapple/project3/aws/models'

pickling_out = open(os.path.join(model_fldr, 'logreg1_10_cls_scores_params_rand.pkl'), 'rb')
logreg1_10_cls_scores_params_rand = pickle.load(pickling_out)
pickling_out.close()

pickling_out = open(os.path.join(model_fldr, 'logreg2_10_cls_scores_params_rand.pkl'), 'rb')
logreg2_10_cls_scores_params_rand = pickle.load(pickling_out)
pickling_out.close()

pickling_out = open(os.path.join(model_fldr, 'logreg3_10_cls_scores_params_rand.pkl'), 'rb')
logreg3_10_cls_scores_params_rand = pickle.load(pickling_out)
pickling_out.close()

In [None]:
# LoadMake df with model scores
logreg_table = pd.DataFrame()
logreg_table['model'] = ['logreg1', 'logreg2', 'logreg3']
logreg_table['f1_train'] = [
    logreg1_10_cls_scores_params_rand['best_train_score'],
    logreg2_10_cls_scores_params_rand['best_train_score'],
    logreg3_10_cls_scores_params_rand['best_train_score']   
]

logreg_table['f1_test'] = [
    logreg1_10_cls_scores_params_rand['best_test_score'],
    logreg2_10_cls_scores_params_rand['best_test_score'],
    logreg3_10_cls_scores_params_rand['best_test_score']   
]

logreg_table['train_time'] = [
    logreg1_10_cls_scores_params_rand['time_best_fit_sec'],
    logreg2_10_cls_scores_params_rand['time_best_fit_sec'],
    logreg3_10_cls_scores_params_rand['time_best_fit_sec']   
]

In [39]:
logreg_table

Unnamed: 0,model,f1_train,f1_test,train_time
0,logreg1,0.685707,0.678353,205.980575
1,logreg2,0.666777,0.658699,183.314682
2,logreg3,0.687566,0.679947,1.892768


In [None]:
# Best logreg model is logreg3_10_cls_scores_params. Will be using this model for ensembling models.

### K-Nearest Neighbors

In [32]:
# Train and fit KNN with  RandomizedSearchCV
# from src.models import model_os.path.join(model_fldr,sel_rand_search)

# Function arguments:
model = KNeighborsClassifier()
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
k_range = list(range(1, 10, 1))

param_grid = dict(model__n_neighbors=k_range)

# Call parameter selection function
KNN_30_cls_scores_params, KNN_30_cls_model = train_fit_time(model,
                                                                               param_grid,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)

# Pickle results
with open(os.path.join(model_fldr,'KNN_10_cls_scores_params_rand.pkl'), 'wb') as f:
    pickle.dump(KNN_10_cls_scores_params, f)
    
# Pickle model
with open(os.path.join(model_fldr,'KNN_10_cls_model_rand.pkl'), 'wb') as f:
    pickle.dump(KNN_10_cls_model, f)



NameError: name 'KNN_10_cls_scores_params' is not defined

In [33]:
# Pickle results
with open(os.path.join(model_fldr,'KNN_10_cls_scores_params_rand.pkl'), 'wb') as f:
    pickle.dump(KNN_30_cls_scores_params, f)
    
# Pickle model
with open(os.path.join(model_fldr,'KNN_10_cls_model_rand.pkl'), 'wb') as f:
    pickle.dump(KNN_30_cls_model, f)

In [34]:
# Unpickle results
unpicking_out = open(os.path.join(model_fldr,'KNN_10_cls_scores_params_rand.pkl'), 'rb')
KNN_10_cls_scores_params = pickle.load(unpicking_out)
KNN_10_cls_scores_params

{'best_train_score': 0.5996280552603613,
 'best_test_score': 0.6,
 'time_sec': 490.554191,
 'time_best_fit_sec': 0.2989806,
 'best_params': {'model__n_neighbors': 1},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                       metric='minkowski', metric_params=None,
                                       n_jobs=None, n_neighbors=1, p=2,
                                       weights='uniform'))],
          verbose=False),
 'best_test_proba': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
      

In [39]:
# Unpickle results
unpicking_out = open(os.path.join(model_fldr,'KNN_10_cls_model_rand.pkl'), 'rb')
KNN_10_cls_model = pickle.load(unpicking_out)
KNN_10_cls_model

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              KNeighborsClassifier(algorithm='auto',
                                                                   leaf_size=30,
                                                                   metric='minkowski',
                                                                   metric_params=None,
                                                                   n_jobs=None,
                 

### Naive Bayes MultiNomial

In [40]:
# Train and fit naive Bayes MultiNomial with RandomizedSearchCV
# from src.models import model_sel_rand_search

# Function arguments:
model = MultinomialNB()
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
alphas = [1, 10, 100]
# Selects the min alpha. Keep alpha = 1 to make sure the model can take data it has not seen before 
# from the test set.

param_grid = dict(model__alpha=alphas)

# Call parameter selection function
NBmultinomial_10_cls_scores_params, NBmultinomial_10_cls_model = train_fit_time(model,
                                                                               param_grid,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)

# Pickle results
with open(os.path.join(model_fldr,'NBmultinomial_10_cls_scores_params_rand.pkl'), 'wb') as f:
    pickle.dump(NBmultinomial_10_cls_scores_params, f)
    
# Pickle model
with open(os.path.join(model_fldr, 'NBmultinomial_10_cls_model_rand.pkl'), 'wb') as f:
    pickle.dump(NBmultinomial_10_cls_model, f)



In [41]:
# Unpickle results
unpicking_out = open(os.path.join(model_fldr, 'NBmultinomial_10_cls_scores_params_rand.pkl'), 'rb')
NBmultinomial_10_cls_scores_params = pickle.load(unpicking_out)
NBmultinomial_10_cls_scores_params

{'best_train_score': 0.6592720510095643,
 'best_test_score': 0.6398406374501993,
 'time_sec': 2.933915,
 'time_best_fit_sec': 0.0907902,
 'best_params': {'model__alpha': 1},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  MultinomialNB(alpha=1, class_prior=None, fit_prior=True))],
          verbose=False),
 'best_test_proba': array([[0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
         1.66292612e-178, 0.00000000e+000, 0.00000000e+000],
        [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
        [1.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
        ...,
        [0.00000000e+

In [42]:
# Unpickle best model
unpicking_out = open(os.path.join(model_fldr, 'NBmultinomial_10_cls_model_rand.pkl'), 'rb')
NBmultinomial_10_cls_model = pickle.load(unpicking_out)
NBmultinomial_10_cls_model

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              MultinomialNB(alpha=1.0,
                                                            class_prior=None,
                                                            fit_prior=True))],
                                      verbose=False),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'model__alpha': [1, 10, 100]},
                   pre_dispatch

### Support Vector Machines

In [None]:
# Train and fit SVC with RandomizedSearchCV
# from src.models import model_sel_rand_search

# Function arguments:
model = SVC(probability=True)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]
gamma = [0.01, 0.1, 1, 10]
degree = [2, 3, 4]

param_grid = dict(model__C=C,
                 model__kernel=kernel,
                 model__gamma=gamma,
                 model__degree=degree
                 )

# Call parameter selection function
SVC_10_cls_scores_params, SVC_10_cls_model =train_fit_time(model,
                                                                               param_grid,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)

# Pickle results
with open(os.path.join(model_fldr, 'SVC_10_cls_scores_params_rand.pkl'), 'wb') as f:
    pickle.dump(SVC_10_cls_scores_params, f)
    
# Pickle model
with open(os.path.join(model_fldr,'SVC_10_cls_model_rand.pkl'), 'wb') as f:
    pickle.dump(SVC_10_cls_model, f)

In [None]:
# Unpickle results
unpicking_out = open(os.path.join(model_fldr,'SVC_10_cls_scores_params_rand.pkl'), 'rb')
SVC_10_cls_scores_params = pickle.load(unpicking_out)
SVC_10_cls_scores_params

In [None]:
# Unpickle model
unpicking_out = open(os.path.join(model_fldr,'SVC_10_cls_model_rand.pkl'), 'rb')
SVC_10_cls_model = pickle.load(unpicking_out)
SVC_10_cls_model

### Random Forest

In [14]:
# Train and fit Random Forest
# from src.models import model_sel_rand_search

# Function arguments:
model = RandomForestClassifier(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
n_estimators = [10, 50, 100, 300, 500]
criterion = ['gini', 'entropy']
max_depth = [5, 10, 50, 100, None]
min_samples_split = [2, 5, 10, 50]
max_features = [5, 10, 25, 50, 100]
bootstrap = [True, False]

                
param_grid = dict(model__n_estimators=n_estimators,
                  model__criterion=criterion,
                  model__max_depth=max_depth,
                  model__min_samples_split=min_samples_split,
                  model__max_features=max_features,
                  model__bootstrap=bootstrap
                 )

# Call parameter selection function
RF_10_cls_scores_params, RF_10_cls_model = train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)

# Pickle results
with open(os.path.join(model_fldr,'RF_10_cls_scores_params_rand.pkl.pkl'), 'wb') as f:
    pickle.dump(RF_10_cls_scores_params, f)
    
# Pickle model
with open(os.path.join(model_fldr, 'RF_10_cls_model_rand.pkl.pkl'), 'wb') as f:
    pickle.dump(RF_10_cls_model, f)

NameError: name 'model_fldr' is not defined

In [24]:
# Pickle results
with open(os.path.join(model_fldr,'RF_10_cls_scores_params_rand.pkl'), 'wb') as f:
    pickle.dump(RF_10_cls_scores_params, f)
    
# Pickle model
with open(os.path.join(model_fldr, 'RF_10_cls_model_rand.pkl'), 'wb') as f:
    pickle.dump(RF_10_cls_model, f)

In [15]:
RF_10_cls_scores_params

{'best_train_score': 0.6880313496280552,
 'best_test_score': 0.6873837981407702,
 'time_sec': 590.899273,
 'time_best_fit_sec': 33.69221,
 'best_params': {'model__n_estimators': 300,
  'model__min_samples_split': 10,
  'model__max_features': 10,
  'model__max_depth': 50,
  'model__criterion': 'gini',
  'model__bootstrap': True},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  RandomForestClassifier(bootstrap=True, class_weight=None,
                                         criterion='gini', max_depth=50,
                                         max_features=10, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                

In [25]:
# Unpickle results
unpicking_out = open(os.path.join(model_fldr, 'RF_10_cls_scores_params_rand.pkl'), 'rb')
RF_10_cls_scores_params = pickle.load(unpicking_out)
RF_10_cls_scores_params

{'best_train_score': 0.6880313496280552,
 'best_test_score': 0.6873837981407702,
 'time_sec': 590.899273,
 'time_best_fit_sec': 33.69221,
 'best_params': {'model__n_estimators': 300,
  'model__min_samples_split': 10,
  'model__max_features': 10,
  'model__max_depth': 50,
  'model__criterion': 'gini',
  'model__bootstrap': True},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  RandomForestClassifier(bootstrap=True, class_weight=None,
                                         criterion='gini', max_depth=50,
                                         max_features=10, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                

In [26]:
# Unpickle model
unpicking_out = open(os.path.join(model_fldr,'RF_10_cls_model_rand.pkl'), 'rb')
RF_10_cls_model = pickle.load(unpicking_out)
RF_10_cls_model

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              RandomForestClassifier(bootstrap=True,
                                                                     class_weight=None,
                                                                     criterion='gini',
                                                                     max_depth=None,
                                                                     max_features='auto',
  

### Gradient Boosting Classifier

In [27]:
# Train and fit Gradient Boosting
# from src.models import model_sel_rand_search

# Function arguments:
model = GradientBoostingClassifier(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
n_estimators = [50, 100, 300, 500]
max_depth = [5, 10, 50, 100, None]
min_samples_split = [2, 5, 10, 20, 50]
max_features = [5, 10, 25, 50, 100]


param_grid = dict(model__n_estimators=n_estimators,
                  model__max_depth=max_depth,
                  model__min_samples_split=min_samples_split,
                  model__max_features=max_features,
                 )

# Call parameter selection function
GBM_10_cls_scores_params, GBM_10_cls_model = train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)
# Pickle results
with open(os.path.join(model_fldr,'GBM_10_cls_scores_params_rand.pkl.pkl.pkl'), 'wb') as f:
    pickle.dump(GBM_10_cls_scores_params, f)
    
# Pickle model
with open(os.path.join(model_fldr, 'GBM_10_cls_model_rand.pkl.pkl.pkl'), 'wb') as f:
    pickle.dump(GBM_10_cls_model, f)



In [17]:
# Unpickle results
unpicking_out = open(os.path.join(model_fldr,'GBM_10_cls_scores_params_rand.pkl.pkl.pkl'), 'rb')
GBM_10_cls_scores_params = pickle.load(unpicking_out)
GBM_10_cls_scores_params

{'best_train_score': 0.7307385759829969,
 'best_test_score': 0.7304116865869854,
 'time_sec': 2570.612416,
 'time_best_fit_sec': 129.15343,
 'best_params': {'model__n_estimators': 300,
  'model__min_samples_split': 10,
  'model__max_features': 10,
  'model__max_depth': 5},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss='deviance',
                                             max_depth=5, max_features=10,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_s

In [19]:
# Unpickle model
unpicking_out = open(os.path.join(model_fldr,'GBM_10_cls_model_rand.pkl.pkl.pkl'), 'rb')
GBM_10_cls_model = pickle.load(unpicking_out)
GBM_10_cls_model

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('transformer',
                                              RandomOverSampler(random_state=3,
                                                                ratio=None,
                                                                return_indices=False,
                                                                sampling_strategy='minority')),
                                             ('model',
                                              GradientBoostingClassifier(criterion='friedman_mse',
                                                                         init=None,
                                                                         learning_rate=0.1,
                                                                         loss='deviance',
                                                                        

### Dummy classifier

In [None]:
# Train and fit Dummy classifier
from src.models import model_sel_rand_search

# Function arguments:
model = DummyClassifier(random_state=3, strategy='stratified')
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params

param_grid = dict()

# Call parameter selection function
Dummy_30_cls_scores_params, Dummy_30_cls_model = model_sel_rand_search.train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)

# Pickle results
with open('/Users/greenapple/project3/models/Dummy_30_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(Dummy_30_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/Dummy_30_cls_model_rand.pkl', 'wb') as f:
    pickle.dump(Dummy_30_cls_scores_params, f)

In [None]:
Dummy_30_cls_scores_params

In [None]:
# Dummy classifier
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
f1_dummy = f1_score(y_test, dummy.predict(X_test), average='micro')
accuracy_dummy = accuracy_score(y_test, dummy.predict(X_test))

In [None]:
print('Dummy classifier F1 score: ', f1_dummy)
print('Dummy classifier accuracy score: ', accuracy_dummy)

### Ensembling

In [None]:
# Load models. 
# MAKE THIS MORE EFFICIENT LATER
model_fldr = '/Users/greenapple/project3/aws/models'

pickling_out = open(os.path.join(model_fldr, 'logreg1_10_cls_model_rand.pkl'), 'rb')
logreg1_10_cls_model_rand = pickle.load(pickling_out)
pickling_out.close()

pickling_out = open(os.path.join(model_fldr, 'logreg2_10_cls_model_rand.pkl'), 'rb')
logreg2_10_cls_model_rand = pickle.load(pickling_out)
pickling_out.close()

pickling_out = open(os.path.join(model_fldr, 'logreg3_10_cls_model_rand.pkl'), 'rb')
logreg3_10_cls_model_rand = pickle.load(pickling_out)
pickling_out.close()

pickling_out = open(os.path.join(model_fldr, 'KNN_10_cls_model_rand.pkl'), 'rb')
KNN_10_cls_model_rand = pickle.load(pickling_out)
pickling_out.close()

pickling_out = open(os.path.join(model_fldr, 'NBmultinomial_10_cls_model_rand.pkl'), 'rb')
NBmultinomial_10_cls_model_rand = pickle.load(pickling_out)
pickling_out.close()

pickling_out = open(os.path.join(model_fldr, 'RF_10_cls_model_rand.pkl'), 'rb')
RF_10_cls_model_rand = pickle.load(pickling_out)
pickling_out.close()

pickling_out = open(os.path.join(model_fldr, 'GBM_10_cls_model_rand.pkl'), 'rb')
GBM_10_cls_model_rand = pickle.load(pickling_out)
pickling_out.close()

In [15]:
%pylab inline
%config InlineBackend.figure_formats = ['retina']

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [5]:
# NEED TO BUILD THIS
# Load models for ensembling

model_fldr = '/Users/greenapple/project3/aws/models'

file_dict = {
    'logreg1_10_cls_model_rand':'logreg1_10_cls_model_rand.pkl',
    'logreg2_10_cls_model_rand':'logreg2_10_cls_model_rand.pkl',
    'logreg3_10_cls_model_rand':'logreg3_10_cls_model_rand.pkl',
    'KNN_10_cls_model_rand':'KNN_10_cls_model_rand.pkl',
    'NBmultinomial_10_cls_model_rand':'NBmultinomial_10_cls_model_rand.pkl',
    'RF_10_cls_model_rand':'RF_10_cls_model_rand.pkl',
    'GBM_10_cls_model_rand': 'GBM_10_cls_model_rand.pkl'
}

# Load models into memory
for name, file in file_dict.items():
    pickling_out = open(os.path.join(model_fldr, file), 'rb')
    name = pickle.load(pickling_out)
    pickling_out.close()
    print(type(name))


<class 'sklearn.model_selection._search.RandomizedSearchCV'>
<class 'sklearn.model_selection._search.RandomizedSearchCV'>
<class 'sklearn.model_selection._search.RandomizedSearchCV'>
<class 'sklearn.model_selection._search.RandomizedSearchCV'>
<class 'sklearn.model_selection._search.RandomizedSearchCV'>
<class 'sklearn.model_selection._search.RandomizedSearchCV'>
<class 'sklearn.model_selection._search.RandomizedSearchCV'>


In [40]:
model_list = [
    ('logreg3', logreg3_10_cls_model_rand),
    ('KNN', KNN_10_cls_model_rand),
    ('NBmultinomial', NBmultinomial_10_cls_model_rand),
    ('RF', RF_10_cls_model_rand),
    ('GBM', GBM_10_cls_model_rand)
]

In [41]:
# Max voting classifier

# Max voting classifier
max_voting_classifer = VotingClassifier(estimators=model_list,
                                    voting='hard',
                                    n_jobs=-1)

f1_train = cross_val_score(max_voting_classifer, 
            X_train, y_train, scoring='f1_micro', cv=5).mean()

max_voting_classifer.fit(X_train, y_train)
y_hat = max_voting_classifer.predict(X_test) 

f1_test = f1_score(y_test, y_hat, average='micro')
# f1_test_s = f1_score(y_test, y_hat, average='samples')

KeyboardInterrupt: 

In [None]:
f1_train, f1_test

In [None]:
# Average voting classifier
average_voting_classifer = VotingClassifier(estimators=model_list,
                                    voting='soft',
                                    n_jobs=-1)

f1_train = cross_val_score(max_voting_classifer, 
            X_train, y_train, scoring='f1_micro', cv=5).mean()

average_voting_classifer.fit(X_train, y_train)
y_hat = max_voting_classifer.predict(X_test) 

f1_test = f1_score(y_test, y_hat, average='micro')
# f1_test_s = f1_score(y_test, y_hat, average='samples')

In [None]:
f1_train, f1_test

In [None]:
# Stacked classifier
model = logreg_2_cls_model

stacked_classifier = StackingClassifier(classifiers=model_list, 
                                        meta_classifier=model, 
                                        use_probas=False)


f1_train = cross_val_score(max_voting_classifer, 
            X_train, y_train, scoring='f1_micro', cv=5).mean()

max_voting_classifer.fit(X_train, y_train)
y_hat = max_voting_classifer.predict(X_test) 

f1_test = f1_score(y_test, y_hat, average='micro')
# f1_test_s = f1_score(y_test, y_hat, average='samples')

In [None]:
f1_train, f1_test

In [None]:
# Convert list of tuples onto a dictionary
classifier_list = [x[1] for x in model_list]
classifier_names = [x[0] for x in model_list]
classifier_dict = dict(zip(classifier_names, classifier_list))

In [None]:
for name, classifier in classifier_dict.items():
    
    # Stacked classifier
    model = classifier

    stacked_classifier = StackingClassifier(classifiers=model_list, 
                                        meta_classifier=model, 
                                        use_probas=False)


    f1_train = cross_val_score(max_voting_classifer, 
            X_train, y_train, scoring='f1_micro', cv=5).mean()

    max_voting_classifer.fit(X_train, y_train)
    y_hat = max_voting_classifer.predict(X_test) 

    f1_test = f1_score(y_test, y_hat, average='micro')
    # f1_test_s = f1_score(y_test, y_hat, average='samples')
    
    print(name, f1_train, f1_test)