In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from datetime import datetime

Using TensorFlow backend.


In [2]:
%load_ext autoreload
%autoreload 2

## Data formatting for classification

In [3]:
# Unpickle data 
with open('/Users/greenapple/project3/data/processed/house_bal.pkl', 'rb') as f:
    house_bal = pickle.load(f)

In [4]:
house_bal.head()

Unnamed: 0,video_id_list,y,y_name,0,1,2,3,4,5,6,...,630,631,632,633,634,635,636,637,638,639
5,b'--ZhevVpy1s',375,toothbrush,117,35,163,90,198,103,63,...,0,1,202,9,116,0,247,72,44,166
20,b'-2hQKCE-oTI',53,footsteps,30,162,44,7,216,116,206,...,213,109,50,88,19,46,54,154,42,211
28,b'-3pPrlCm6gg',198,clarinet,179,190,122,19,0,114,255,...,58,15,207,0,108,43,97,57,42,0
63,b'-70wVF5u-gg',366,chopping_food,0,114,186,34,87,250,58,...,0,122,144,63,110,255,139,138,59,154
82,b'-ASYwidRD7M',43,snoring,53,100,144,84,223,68,95,...,56,78,153,65,208,207,200,255,255,66


In [5]:
house_bal.shape

(45717, 643)

In [6]:
len(house_bal.y_name.value_counts())

30

In [7]:
house_bal.y_name.value_counts()

speech            4042
music             3781
laughter          3772
snoring           3370
vacuum_cleaner    3054
typing            2644
dishes_pots       2560
frying_food       2102
blender           1884
toilet_flush      1882
door              1868
whoop             1736
footsteps         1492
baby_cry          1414
screeming         1116
whispering         972
clarinet           960
crying             918
microwave          894
television         866
hair_dryer         772
video_games        592
shaving            552
bathtab            472
water_tap          458
chopping_food      410
meow               388
dog                358
purr               304
toothbrush          84
Name: y_name, dtype: int64

In [8]:
# Assign features X and target y
X = house_bal[house_bal.columns[3:643]]
y = house_bal.y_name

In [9]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((36573, 640), (36573,), (9144, 640), (9144,))

## Build and evaluate models

### Logistic regression

In [60]:
# Train and fit logistic regression
from src.models import model_sel_grid

# Function arguments:
model = LogisticRegression(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)

param_grid = dict(
        model__C = C, 
        model__penalty = penalty)

# Call parameter selection function
logreg_2_cls_scores_params, logreg_2_cls_model = model_sel_grid.train_fit_time(model,
                                                                               param_grid,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)

# Pickle results
with open('/Users/greenapple/project3/models/logreg_2_cls_scores_params.pkl', 'wb') as f:
    pickle.dump(logreg_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/logreg_2_cls_model.pkl', 'wb') as f:
    pickle.dump(logreg_2_cls_model, f)
    



### K-Nearest Neighbors

In [62]:
# Train and fit KNN
from src.models import model_sel_grid

# Function arguments:
model = KNeighborsClassifier()
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
k_range = range(1, 31)

param_grid = dict(model__n_neighbors=k_range)

# Call parameter selection function
KNN_2_cls_scores_params, KNN_2_cls_models = model_sel_grid.train_fit_time(model,
                                                                               param_grid,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)

# Pickle results
with open('/Users/greenapple/project3/models/KNN_2_cls_scores_params.pkl', 'wb') as f:
    pickle.dump(KNN_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/KNN_2_cls_models.pkl', 'wb') as f:
    pickle.dump(KNN_2_cls_models, f)

### Naive Bayes MultiNomial

In [64]:
# Train and fit naive Bayes MultiNomial
from src.models import model_sel_grid

# Function arguments:
model = MultinomialNB()
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
alphas = [1, 10, 100]
# Selects the min alpha. Keep alpha = 1 to make sure the model can take data it has not seen before 
# from the test set.

param_grid = dict(model__alpha=alphas)

# Call parameter selection function
NBmultinomial_2_cls_scores_params, NBmultinomial_2_cls_models = model_sel_grid.train_fit_time(model,
                                                                               param_grid,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)

# Pickle results
with open('/Users/greenapple/project3/models/NBmultinomial_2_cls_scores_params.pkl', 'wb') as f:
    pickle.dump(NBmultinomial_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/NBmultinomial_2_cls_models.pkl', 'wb') as f:
    pickle.dump(NBmultinomial_2_cls_models, f)

### Support Vector Machines

In [66]:
# Train and fit SVC
from src.models import model_sel_grid

# Function arguments:
model = svm.SVC(probability=True)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]
gamma = [0.001, 0.01, 0.1, 1]
degree = [2, 3]

param_grid = dict(model__C=C,
                 model__kernel=kernel,
                 model__gamma=gamma,
                 model__degree=degree
                 )

# Call parameter selection function
SVC_2_cls_scores_params, SVC_2_cls_models = model_sel_grid.train_fit_time(model,
                                                                               param_grid,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)

# Pickle results
with open('/Users/greenapple/project3/models/SVC_2_cls_scores_params.pkl', 'wb') as f:
    pickle.dump(SVC_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/SVC_2_cls_models.pkl', 'wb') as f:
    pickle.dump(SVC_2_cls_models, f)

KeyboardInterrupt: 

In [None]:
SVC_2_cls_scores_params

In [71]:
# Unpickle data 
with open('/Users/greenapple/project3/models/SVC_2_classes.pkl', 'rb') as f:
    SVC_2_classes = pickle.load(f)
SVC_2_classes

{'train_score': 0.9261838440111421,
 'test_score': 0.9166666666666666,
 'params': {'model__C': 0.001,
  'model__degree': 3,
  'model__gamma': 0.001,
  'model__kernel': 'poly'},
 'estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
                      decision_function_shape='ovr', degree=3, gamma=0.001,
                      kernel='poly', max_iter=-1, probability=True,
                      random_state=None, shrinking=True, tol=0.001,
                      verbose=False))],
          verbose=False),
 'test_proba': array([[5.71174124e-02, 9.42882588e-01],
        [9.99982371e-01, 1.76286963e-05],
        [9.99997796e-01, 2.20390074e-06],
        [9.99902262e-01, 9.77382165e-05],


### Random Forest

In [10]:
#HAVE NOT RUN THIS SEARCH!!!!

# Train and fit Random Forest
from src.models import model_sel_grid

# Function arguments:
model = RandomForestClassifier(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
n_estimators = [50, 100, 200, 300]
criterion = ['gini', 'entropy']
max_depth = [5, 10, 50, 100, None]
min_samples_split = [2, 5, 10]
max_features = [5, 10, 25]
bootstrap = [True, False]

                
param_grid = dict(model__n_estimators=n_estimators,
                  model__criterion=criterion,
                  model__max_depth=max_depth,
                  model__min_samples_split=min_samples_split,
                  model__max_features=max_features,
                  model__bootstrap=bootstrap
                 )

# Call parameter selection function
RF_30_cls_scores_params, RF_30_cls_models = model_sel_grid.train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)

# Pickle results
with open('/Users/greenapple/project3/models/RF_30_cls_scores_params.pkl', 'wb') as f:
    pickle.dump(RF_30_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/RF_30_cls_models.pkl', 'wb') as f:
    pickle.dump(RF_30_cls_models, f)

KeyboardInterrupt: 

In [145]:
RF_2_cls_scores_params

{'best_train_score': 0.9192200557103064,
 'best_test_score': 0.9166666666666666,
 'time_sec': 2577.827588,
 'time_best_fit_sec': 0.48924120000000004,
 'best_params': {'model__bootstrap': True,
  'model__criterion': 'gini',
  'model__max_depth': 10,
  'model__max_features': 25,
  'model__min_samples_split': 10,
  'model__n_estimators': 50},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  RandomForestClassifier(bootstrap=True, class_weight=None,
                                         criterion='gini', max_depth=10,
                                         max_features=25, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
     

In [78]:
# Train and fit Random Forest over fewer parameters
from src.models import model_sel_grid

# Function arguments:
model = RandomForestClassifier(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
n_estimators = [50, 100, 300]
criterion = ['gini', 'entropy']
max_depth = [5, 50, None]
min_samples_split = [2, 10]
max_features = [25]
bootstrap = [True]

                
param_grid = dict(model__n_estimators=n_estimators,
                  model__criterion=criterion,
                  model__max_depth=max_depth,
                  model__min_samples_split=min_samples_split,
                  model__max_features=max_features,
                  model__bootstrap=bootstrap
                 )

# Call parameter selection function
RF_2_cls_scores_params, RF_2_cls_models = model_sel_grid.train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)

# Pickle results
with open('/Users/greenapple/project3/models/RF_2_cls_scores_fewer_params.pkl', 'wb') as f:
    pickle.dump(RF_2_cls_scores_fewer_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/RF_2_cls_models_fewer.pkl', 'wb') as f:
    pickle.dump(RF_2_cls_models_fewer, f)

NameError: name 'RF_2_cls_scores_fewer_params' is not defined

In [126]:
RF_2_cls_scores_params

{'best_train_score': 0.9164345403899722,
 'best_test_score': 0.9333333333333333,
 'time_sec': 173.056771,
 'time_best_fit_sec': 2.0784239999999996,
 'best_params': {'model__bootstrap': True,
  'model__criterion': 'gini',
  'model__max_depth': 5,
  'model__max_features': 25,
  'model__min_samples_split': 10,
  'model__n_estimators': 300},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  RandomForestClassifier(bootstrap=True, class_weight=None,
                                         criterion='gini', max_depth=5,
                                         max_features=25, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
        

### Gradient Boosting Classifier

In [143]:
# Train and fit Gradient Boosting
from src.models import model_sel_grid

# Function arguments:
model = GradientBoostingClassifier(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
n_estimators = [50, 100, 200, 300]
max_depth = [5, 10, 50, 100, None]
min_samples_split = [2, 5, 10]
max_features = [5, 10, 25]

                
param_grid = dict(model__n_estimators=n_estimators,
                  model__max_depth=max_depth,
                  model__min_samples_split=min_samples_split,
                  model__max_features=max_features,
                 )

# Call parameter selection function
GBM_2_cls_scores_params, GBM_2_cls_models = model_sel_grid.train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)

# Pickle results
with open('/Users/greenapple/project3/models/GBM_2_cls_scores_params.pkl', 'wb') as f:
    pickle.dump(GBM_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/GBM_2_cls_models.pkl', 'wb') as f:
    pickle.dump(GBM_2_cls_models, f)

In [144]:
GBM_2_cls_scores_params

{'best_train_score': 0.9240947075208914,
 'best_test_score': 0.9333333333333333,
 'time_sec': 494.899637,
 'time_best_fit_sec': 0.7078482000000001,
 'best_params': {'model__max_depth': 5,
  'model__max_features': 5,
  'model__min_samples_split': 10,
  'model__n_estimators': 300},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss='deviance',
                                             max_depth=5, max_features=5,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impu

In [None]:
n_estimators = [50, 100, 300]
criterion = ['gini', 'entropy']
max_depth = [5, 50, None]
min_samples_split = [2, 10]
max_features = [25]
bootstrap = [True]

In [84]:
# Train and fit Gradient Boosting over fewer parameters
from src.models import model_sel_grid

# Function arguments:
model = GradientBoostingClassifier(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
n_estimators = [50, 100, 300]
max_depth = [5, 50, None]
min_samples_split = [2, 10]
max_features = [25]

                
param_grid = dict(model__n_estimators=n_estimators,
                  model__max_depth=max_depth,
                  model__min_samples_split=min_samples_split,
                  model__max_features=max_features,
                 )

# Call parameter selection function
GBM_2_cls_scores_fewer_params, GBM_2_cls_model_fewer = model_sel_grid.train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)

# Pickle results
with open('/Users/greenapple/project3/models/GBM_2_cls_scores_fewer_params.pkl', 'wb') as f:
    pickle.dump(GBM_2_cls_scores_fewer_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/GBM_2_cls_model_fewer.pkl', 'wb') as f:
    pickle.dump(GBM_2_cls_model_fewer, f)

In [87]:
GBM_2_cls_scores_fewer_params

{'best_train_score': 0.9240947075208914,
 'best_test_score': 0.9277777777777778,
 'time_sec': 73.11709,
 'time_best_fit_sec': 1.6232022000000002,
 'best_params': {'model__max_depth': 5,
  'model__max_features': 25,
  'model__min_samples_split': 10,
  'model__n_estimators': 300},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss='deviance',
                                             max_depth=5, max_features=25,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impu

### Dummy classifier

In [89]:
# Train and fit Dummy classifier

# Function arguments:
model = DummyClassifier(random_state=3, strategy='stratified')
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params

param_grid = dict()

# Call parameter selection function
Dummy_2_cls_scores_params, Dummy_2_cls_model = model_sel_grid.train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)

# Pickle results
with open('/Users/greenapple/project3/models/Dummy_2_cls_scores_params.pkl', 'wb') as f:
    pickle.dump(Dummy_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/Dummy_2_cls_model.pkl', 'wb') as f:
    pickle.dump(Dummy_2_cls_model, f)

In [90]:
Dummy_2_cls_scores_params

{'best_train_score': 0.4742339832869081,
 'best_test_score': 0.4888888888888889,
 'time_sec': 0.146509,
 'time_best_fit_sec': 0.008307600000000002,
 'best_params': {},
 'best_estimator': Pipeline(memory=None,
          steps=[('transformer',
                  RandomOverSampler(random_state=3, ratio=None,
                                    return_indices=False,
                                    sampling_strategy='minority')),
                 ('model',
                  DummyClassifier(constant=None, random_state=3,
                                  strategy='stratified'))],
          verbose=False),
 'best_test_proba': array([[1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [0.

In [91]:
# Dummy classifier
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
f1_dummy = f1_score(y_test, dummy.predict(X_test), average='micro')
accuracy_dummy = accuracy_score(y_test, dummy.predict(X_test))

In [92]:
print('Dummy classifier F1 score: ', f1_dummy)
print('Dummy classifier accuracy score: ', accuracy_dummy)

Dummy classifier F1 score:  0.6888888888888889
Dummy classifier accuracy score:  0.7194444444444444


### Null accuracy

In [93]:
y_test.value_counts()

footsteps    291
purr          69
Name: y_name, dtype: int64

In [94]:
null_accuracy = y_test.value_counts().head(1) / len(y_test)
print('Null accuracy: ', null_accuracy)

Null accuracy:  footsteps    0.808333
Name: y_name, dtype: float64


## Visualize scores

### Model summary

In [131]:
model_score_table = pd.DataFrame(columns=['Models', 'F1_score_train'])
model_score_table

Unnamed: 0,Models,F1_score_train


In [132]:
model_score_table['Models'] = ['Logistis_Regression',
                              'KNN',
                              'Multinomial_NB',
                              'SVC_poly',
                              'Random_Forest',
                               'GBM',
                               'Dummy'
                              ]

In [139]:
model_score_table['F1_score_train'] = trainin

In [140]:
model_score_table

Unnamed: 0,Models,F1_score_train
0,Logistis_Regression,0.914345
1,KNN,0.9039
2,Multinomial_NB,0.870474
3,SVC_poly,0.926184
4,Random_Forest,0.916435
5,GBM,0.924095
6,Dummy,0.474234


In [141]:
# Pickle results
with open('/Users/greenapple/project3/reports/figures/model_score_table_MVP.pkl', 'wb') as f:
    pickle.dump(model_score_table, f)

### Confusion matrix

In [None]:
# Helper function for printing confusion matrices (see: https://gist.github.com/shaypal5/94c53d765083101efc0240d776a23823)

# prints confusion matrix as a heatmap which is nicer to visaulize

def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=18):
    """Prints a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.
    
    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix. 
        Similarly constructed ndarrays can also be used.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the ouputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for axes labels. Defaults to 14.
        
    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure
    """
    df_cm = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names, )
    fig = plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return fig

In [None]:
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm = print_confusion_matrix(conf_mat, ['Class 0', 'Class 1'])

### ROC

In [None]:
# Save the model for later
filename = /Users/greenapple/project3/models/logreg.sav'
joblib.dump(logreg, filename)