In [1]:
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import sklearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from mlxtend.classifier import StackingClassifier
from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from skmultilearn.ensemble import MajorityVotingClassifier

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project-local src.models helpers) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

## Data formatting for classification

In [3]:
# Read the pickled `house_bal` DataFrame back from disk
with open('/Users/greenapple/project3/data/processed/house_bal.pkl', mode='rb') as pickled_frame:
    house_bal = pickle.load(pickled_frame)

In [4]:
# Preview the first rows: video id, numeric label `y`, class name `y_name`, then the numbered feature columns
house_bal.head()

Unnamed: 0,video_id_list,y,y_name,0,1,2,3,4,5,6,...,630,631,632,633,634,635,636,637,638,639
5,b'--ZhevVpy1s',375,toothbrush,117,35,163,90,198,103,63,...,0,1,202,9,116,0,247,72,44,166
20,b'-2hQKCE-oTI',53,footsteps,30,162,44,7,216,116,206,...,213,109,50,88,19,46,54,154,42,211
28,b'-3pPrlCm6gg',198,clarinet,179,190,122,19,0,114,255,...,58,15,207,0,108,43,97,57,42,0
63,b'-70wVF5u-gg',366,chopping_food,0,114,186,34,87,250,58,...,0,122,144,63,110,255,139,138,59,154
82,b'-ASYwidRD7M',43,snoring,53,100,144,84,223,68,95,...,56,78,153,65,208,207,200,255,255,66


In [5]:
# (rows, columns) of the loaded frame
house_bal.shape

(45717, 643)

In [6]:
# Number of distinct class names in the target column
house_bal['y_name'].value_counts().shape[0]

30

In [7]:
# Rows per class, most frequent first — shows how uneven the class sizes are
house_bal.y_name.value_counts()

speech            4042
music             3781
laughter          3772
snoring           3370
vacuum_cleaner    3054
typing            2644
dishes_pots       2560
frying_food       2102
blender           1884
toilet_flush      1882
door              1868
whoop             1736
footsteps         1492
baby_cry          1414
screeming         1116
whispering         972
clarinet           960
crying             918
microwave          894
television         866
hair_dryer         772
video_games        592
shaving            552
bathtab            472
water_tap          458
chopping_food      410
meow               388
dog                358
purr               304
toothbrush          84
Name: y_name, dtype: int64

In [8]:
# Features: columns at positions 3..642 (everything after the three id/label columns).
# Target: the class name.
feature_columns = house_bal.columns[3:643]
X = house_bal[feature_columns]
y = house_bal['y_name']

In [9]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=3, test_size=0.2
)

# Shapes of the four partitions, in the order train-X, train-y, test-X, test-y
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((36573, 640), (36573,), (9144, 640), (9144,))

## Build and evaluate models

### Logistic regression

In [None]:
# Logistic regression, RandomizedSearchCV parameter set 1:
# L1/L2 penalties crossed with the liblinear and saga solvers
from src.models import model_sel_rand_search

# Arguments for the search helper: resampler, estimator and CV setting
CV = 5
transformer = RandomOverSampler(sampling_strategy='minority', random_state=3)
model = LogisticRegression(random_state=3)

# Candidate hyper-parameter values
solver = ['liblinear', 'saga']
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)

# Keys carry the "model__" prefix — presumably the estimator's step name inside
# the pipeline built by the helper (verify against src.models)
param_distributions = {
    'model__C': C,
    'model__penalty': penalty,
    'model__solver': solver,
}

# Run the parameter search; the helper's two return values are unpacked as
# (scores/params summary, model)
(logreg1_30_cls_scores_params,
 logreg1_30_cls_model) = model_sel_rand_search.train_fit_time(
    model, param_distributions, transformer,
    X_train, X_test, y_train, y_test, CV)

# Persist both results: the summary first, then the model
for pkl_path, payload in [
    ('/Users/greenapple/project3/models/logreg1_30_cls_scores_params_rand.pkl',
     logreg1_30_cls_scores_params),
    ('/Users/greenapple/project3/models/logreg1_30_cls_model_rand.pkl',
     logreg1_30_cls_model),
]:
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# Train and fit logistic regression with RandomizedSearchCV: 2nd set of parameters
# (elastic-net penalty; the saga solver and multinomial loss are fixed on the estimator)
from src.models import model_sel_rand_search

# Function arguments:
model = LogisticRegression(random_state=3, multi_class='multinomial', solver='saga')
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
penalty = ['elasticnet']
C = np.logspace(0, 4, 10)
# Elastic-net mixing values 0.0, 0.1, ..., 0.9.
# The original bound this list to `l1_1ratio`, so the `l1_ratio` lookup below raised NameError.
l1_ratio = list(np.arange(0, 1, 0.1))

param_distributions = dict(
        model__C = C,
        model__penalty = penalty,
        model__l1_ratio=l1_ratio)


# Call parameter selection function
logreg2_30_cls_scores_params, logreg2_30_cls_model = model_sel_rand_search.train_fit_time(model,
                                                                               param_distributions,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)
# Pickle results
with open('/Users/greenapple/project3/models/logreg2_30_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(logreg2_30_cls_scores_params, f)

# Pickle model
with open('/Users/greenapple/project3/models/logreg2_30_cls_model_rand.pkl', 'wb') as f:
    pickle.dump(logreg2_30_cls_model, f)

In [None]:
# Logistic regression, RandomizedSearchCV parameter set 3:
# multinomial loss, L2 penalty, sag / lbfgs / newton-cg solvers
from src.models import model_sel_rand_search

# Arguments for the search helper
CV = 5
transformer = RandomOverSampler(sampling_strategy='minority', random_state=3)
model = LogisticRegression(multi_class='multinomial', random_state=3)

# Candidate hyper-parameter values
solver = ['sag', 'lbfgs', 'newton-cg']
penalty = ['l2']
C = np.logspace(0, 4, 10)

param_distributions = {
    'model__C': C,
    'model__penalty': penalty,
    'model__solver': solver,
}

# Run the parameter search; results are unpacked as (scores/params summary, model)
(logreg3_30_cls_scores_params,
 logreg3_30_cls_model) = model_sel_rand_search.train_fit_time(
    model, param_distributions, transformer,
    X_train, X_test, y_train, y_test, CV)

# Persist both results: the summary first, then the model
for pkl_path, payload in [
    ('/Users/greenapple/project3/models/logreg3_30_cls_scores_params_rand.pkl',
     logreg3_30_cls_scores_params),
    ('/Users/greenapple/project3/models/logreg3_30_cls_model_rand.pkl',
     logreg3_30_cls_model),
]:
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# Unpickle results (scores/params of the 3rd logistic-regression search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/logreg3_30_cls_scores_params_rand.pkl', 'rb') as f:
    logreg3_30_cls_scores_params = pickle.load(f)
logreg3_30_cls_scores_params

In [None]:
# Unpickle model (3rd logistic-regression search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/logreg3_30_cls_model_rand.pkl', 'rb') as f:
    logreg3_30_cls_model = pickle.load(f)
logreg3_30_cls_model

In [None]:
# TODO: pick the best logistic-regression model among logreg1 / logreg2 / logreg3 (no selection is implemented yet)

### K-Nearest Neighbors

In [None]:
# K-nearest neighbours with RandomizedSearchCV over the neighbour count
from src.models import model_sel_rand_search

# Arguments for the search helper
CV = 5
transformer = RandomOverSampler(sampling_strategy='minority', random_state=3)
model = KNeighborsClassifier()

# Candidate neighbour counts: 1 through 9
k_range = list(range(1, 10))

param_grid = {'model__n_neighbors': k_range}

# Run the parameter search; results are unpacked as (scores/params summary, model)
(KNN_30_cls_scores_params,
 KNN_30_cls_model) = model_sel_rand_search.train_fit_time(
    model, param_grid, transformer,
    X_train, X_test, y_train, y_test, CV)

# Persist both results: the summary first, then the model
for pkl_path, payload in [
    ('/Users/greenapple/project3/models/KNN_30_cls_scores_params_rand.pkl',
     KNN_30_cls_scores_params),
    ('/Users/greenapple/project3/models/KNN_30_cls_model_rand.pkl',
     KNN_30_cls_model),
]:
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# Unpickle results (scores/params of the KNN search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/KNN_30_cls_scores_params_rand.pkl', 'rb') as f:
    KNN_30_cls_scores_params = pickle.load(f)
KNN_30_cls_scores_params

In [None]:
# Unpickle model (KNN search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/KNN_30_cls_model_rand.pkl', 'rb') as f:
    KNN_30_cls_model = pickle.load(f)
KNN_30_cls_model

### Naive Bayes MultiNomial

In [None]:
# Multinomial naive Bayes with RandomizedSearchCV over the smoothing parameter
from src.models import model_sel_rand_search

# Arguments for the search helper
CV = 5
transformer = RandomOverSampler(sampling_strategy='minority', random_state=3)
model = MultinomialNB()

# Candidate smoothing values.
# Author's note: the search selects the smallest alpha; alpha = 1 is kept in the
# list so the model can handle test-set data it has not seen during training.
alphas = [1, 10, 100]

param_grid = {'model__alpha': alphas}

# Run the parameter search; results are unpacked as (scores/params summary, model)
(NBmultinomial_30_cls_scores_params,
 NBmultinomial_30_cls_model) = model_sel_rand_search.train_fit_time(
    model, param_grid, transformer,
    X_train, X_test, y_train, y_test, CV)

# Persist both results: the summary first, then the model
for pkl_path, payload in [
    ('/Users/greenapple/project3/models/NBmultinomial_30_cls_scores_params_rand.pkl',
     NBmultinomial_30_cls_scores_params),
    ('/Users/greenapple/project3/models/NBmultinomial_30_cls_model_rand.pkl',
     NBmultinomial_30_cls_model),
]:
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# Unpickle results (scores/params of the multinomial naive Bayes search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/NBmultinomial_30_cls_scores_params_rand.pkl', 'rb') as f:
    NBmultinomial_30_cls_scores_params = pickle.load(f)
NBmultinomial_30_cls_scores_params

In [None]:
# Unpickle best model (multinomial naive Bayes search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/NBmultinomial_30_cls_model_rand.pkl', 'rb') as f:
    NBmultinomial_30_cls_model = pickle.load(f)
NBmultinomial_30_cls_model

### Support Vector Machines

In [None]:
# Train and fit SVC with RandomizedSearchCV
from src.models import model_sel_rand_search

# Function arguments:
# probability=True is what lets the soft-voting ensemble below average class probabilities.
# scikit-learn fits those probability estimates with an internal shuffled
# cross-validation, so the estimator is seeded like every other model here;
# without random_state the fitted probabilities differ from run to run.
model = SVC(probability=True, random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]
gamma = [0.01, 0.1, 1, 10]
degree = [2, 3, 4]  # only used by the 'poly' kernel

param_grid = dict(model__C=C,
                 model__kernel=kernel,
                 model__gamma=gamma,
                 model__degree=degree
                 )

# Call parameter selection function
SVC_30_cls_scores_params, SVC_30_cls_model = model_sel_rand_search.train_fit_time(model,
                                                                               param_grid,
                                                                               transformer,
                                                                               X_train,
                                                                               X_test,
                                                                               y_train,
                                                                               y_test,
                                                                               CV)

# Pickle results
with open('/Users/greenapple/project3/models/SVC_30_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(SVC_30_cls_scores_params, f)

# Pickle model
with open('/Users/greenapple/project3/models/SVC_30_cls_model_rand.pkl', 'wb') as f:
    pickle.dump(SVC_30_cls_model, f)

In [None]:
# Unpickle results (scores/params of the SVC search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/SVC_30_cls_scores_params_rand.pkl', 'rb') as f:
    SVC_30_cls_scores_params = pickle.load(f)
SVC_30_cls_scores_params

In [None]:
# Unpickle model (SVC search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/SVC_30_cls_model_rand.pkl', 'rb') as f:
    SVC_30_cls_model = pickle.load(f)
SVC_30_cls_model

### Random Forest

In [None]:
# Random forest with a randomized search over tree-count, depth and split settings
from src.models import model_sel_rand_search

# Arguments for the search helper
CV = 5
transformer = RandomOverSampler(sampling_strategy='minority', random_state=3)
model = RandomForestClassifier(random_state=3)

# Candidate hyper-parameter values
bootstrap = [True, False]
max_features = [5, 10, 25, 50, 100]
min_samples_split = [2, 5, 10, 50]
max_depth = [5, 10, 50, 100, None]
criterion = ['gini', 'entropy']
n_estimators = [10, 50, 100, 300, 500]

param_grid = {
    'model__n_estimators': n_estimators,
    'model__criterion': criterion,
    'model__max_depth': max_depth,
    'model__min_samples_split': min_samples_split,
    'model__max_features': max_features,
    'model__bootstrap': bootstrap,
}

# Run the parameter search; results are unpacked as (scores/params summary, model)
(RF_30_cls_scores_params,
 RF_30_cls_model) = model_sel_rand_search.train_fit_time(
    model, param_grid, transformer,
    X_train, X_test, y_train, y_test, CV)

# Persist both results: the summary first, then the model
for pkl_path, payload in [
    ('/Users/greenapple/project3/models/RF_30_cls_scores_params_rand.pkl',
     RF_30_cls_scores_params),
    ('/Users/greenapple/project3/models/RF_30_cls_model_rand.pkl',
     RF_30_cls_model),
]:
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# Unpickle results (scores/params of the random-forest search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/RF_30_cls_scores_params_rand.pkl', 'rb') as f:
    RF_30_cls_scores_params = pickle.load(f)
RF_30_cls_scores_params

In [None]:
# Unpickle model (random-forest search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/RF_30_cls_model_rand.pkl', 'rb') as f:
    RF_30_cls_model = pickle.load(f)
RF_30_cls_model

### Gradient Boosting Classifier

In [None]:
# Gradient boosting with a randomized search over tree-count, depth and split settings
from src.models import model_sel_rand_search

# Arguments for the search helper
CV = 5
transformer = RandomOverSampler(sampling_strategy='minority', random_state=3)
model = GradientBoostingClassifier(random_state=3)

# Candidate hyper-parameter values
max_features = [5, 10, 25, 50, 100]
min_samples_split = [2, 5, 10, 20, 50]
max_depth = [5, 10, 50, 100, None]
n_estimators = [50, 100, 300, 500]

param_grid = {
    'model__n_estimators': n_estimators,
    'model__max_depth': max_depth,
    'model__min_samples_split': min_samples_split,
    'model__max_features': max_features,
}

# Run the parameter search; results are unpacked as (scores/params summary, model)
(GBM_30_cls_scores_params,
 GBM_30_cls_model) = model_sel_rand_search.train_fit_time(
    model, param_grid, transformer,
    X_train, X_test, y_train, y_test, CV)

# Persist both results: the summary first, then the model
for pkl_path, payload in [
    ('/Users/greenapple/project3/models/GBM_30_cls_scores_params_rand.pkl',
     GBM_30_cls_scores_params),
    ('/Users/greenapple/project3/models/GBM_30_cls_model_rand.pkl',
     GBM_30_cls_model),
]:
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# Unpickle results (scores/params of the gradient-boosting search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/GBM_30_cls_scores_params_rand.pkl', 'rb') as f:
    GBM_30_cls_scores_params = pickle.load(f)
GBM_30_cls_scores_params

In [None]:
# Unpickle model (gradient-boosting search).
# The context manager closes the file handle, which the original left open.
with open('/Users/greenapple/project3/models/GBM_30_cls_model_rand.pkl', 'rb') as f:
    GBM_30_cls_model = pickle.load(f)
GBM_30_cls_model

### Dummy classifier

In [None]:
# Train and fit Dummy classifier (stratified random baseline) through the same search helper
from src.models import model_sel_rand_search

# Function arguments:
model = DummyClassifier(random_state=3, strategy='stratified')
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
# No hyper-parameters to tune for the baseline, so the search space is empty
param_grid = dict()

# Call parameter selection function
Dummy_30_cls_scores_params, Dummy_30_cls_model = model_sel_rand_search.train_fit_time(model,
                                                                        param_grid,
                                                                        transformer,
                                                                        X_train,
                                                                        X_test,
                                                                        y_train,
                                                                        y_test,
                                                                        CV)

# Pickle results
with open('/Users/greenapple/project3/models/Dummy_30_cls_scores_params_rand.pkl', 'wb') as f:
    pickle.dump(Dummy_30_cls_scores_params, f)

# Pickle model
# The original dumped the scores object here a second time, so the *_model_rand.pkl file never held the model.
with open('/Users/greenapple/project3/models/Dummy_30_cls_model_rand.pkl', 'wb') as f:
    pickle.dump(Dummy_30_cls_model, f)

In [None]:
# Display the first object returned by the search helper for the dummy baseline
Dummy_30_cls_scores_params

In [None]:
# Dummy classifier fitted directly on the training split (no resampling, no search).
# It is seeded and predicts once, so both metrics describe the same predictions.
# The original called predict() twice on an unseeded estimator; with a random
# strategy such as 'stratified', F1 and accuracy were then computed on two
# different random draws.
dummy = DummyClassifier(random_state=3)
dummy.fit(X_train, y_train)
y_hat_dummy = dummy.predict(X_test)
f1_dummy = f1_score(y_test, y_hat_dummy, average='micro')
accuracy_dummy = accuracy_score(y_test, y_hat_dummy)

In [None]:
# Report the baseline metrics, one per line
for label, value in (
    ('Dummy classifier F1 score: ', f1_dummy),
    ('Dummy classifier accuracy score: ', accuracy_dummy),
):
    print(label, value)

### Ensembling

In [None]:
%pylab inline
%config InlineBackend.figure_formats = ['retina']

In [None]:
# (name, model) pairs fed to the ensembles below.
# The KNN entry now uses the name bound by the KNN cells; the original
# `KNN_30_cls_models_rand` was never defined.
# NOTE(review): the original `logreg_30_cls_model` was never defined either, and no
# "best logreg" is selected above. `logreg3_30_cls_model` is assumed because it is
# the only logistic-regression model the notebook reloads from disk — confirm.
model_list = [
    ('logreg', logreg3_30_cls_model),
    ('KNN', KNN_30_cls_model),
    ('NBmultinomial', NBmultinomial_30_cls_model),
    ('SVC', SVC_30_cls_model),
    ('RF', RF_30_cls_model),
    ('GBM', GBM_30_cls_model)
]

In [None]:
# Max (hard / majority-vote) voting classifier.
# The original bound this estimator to `average_voting_classifer` and then used the
# undefined name `max_voting_classifer`; the names now agree.
max_voting_classifer = VotingClassifier(estimators=model_list,
                                        voting='hard',
                                        n_jobs=-1)

# Mean 5-fold cross-validated micro-F1 on the training split
f1_train = cross_val_score(max_voting_classifer,
            X_train, y_train, scoring='f1_micro', cv=5).mean()

# Refit on the whole training split, then score on the held-out test split
max_voting_classifer.fit(X_train, y_train)
y_hat = max_voting_classifer.predict(X_test)

f1_test = f1_score(y_test, y_hat, average='micro')

In [None]:
# Cross-validated training F1 next to held-out test F1, as set by the preceding cell
f1_train, f1_test

In [None]:
# Average (soft) voting classifier: averages the base models' predicted class probabilities.
# The original built this estimator and then scored `max_voting_classifer` instead,
# so the soft-voting ensemble was never evaluated.
average_voting_classifer = VotingClassifier(estimators=model_list,
                                    voting='soft',
                                    n_jobs=-1)

# Mean 5-fold cross-validated micro-F1 on the training split
f1_train = cross_val_score(average_voting_classifer,
            X_train, y_train, scoring='f1_micro', cv=5).mean()

# Refit on the whole training split, then score on the held-out test split
average_voting_classifer.fit(X_train, y_train)
y_hat = average_voting_classifer.predict(X_test)

f1_test = f1_score(y_test, y_hat, average='micro')

In [None]:
# Cross-validated training F1 next to held-out test F1, as set by the preceding cell
f1_train, f1_test

In [None]:
# Stacked classifier: the base models' predictions become the meta-classifier's inputs.
# NOTE(review): the original meta-classifier `logreg_2_cls_model` was never defined;
# `logreg3_30_cls_model` (the logistic-regression model reloaded above) is assumed — confirm.
model = logreg3_30_cls_model

# mlxtend's StackingClassifier expects a plain list of estimators, not the
# (name, estimator) tuples that VotingClassifier takes.
# NOTE(review): with use_probas=False the meta-features are predicted labels; the
# targets here are class-name strings, so the meta-classifier may need
# use_probas=True or encoded labels — verify.
stacked_classifier = StackingClassifier(classifiers=[clf for _, clf in model_list],
                                        meta_classifier=model,
                                        use_probas=False)

# Mean 5-fold cross-validated micro-F1 on the training split.
# The original scored `max_voting_classifer` here, not the stack.
f1_train = cross_val_score(stacked_classifier,
            X_train, y_train, scoring='f1_micro', cv=5).mean()

# Refit on the whole training split, then score on the held-out test split
stacked_classifier.fit(X_train, y_train)
y_hat = stacked_classifier.predict(X_test)

f1_test = f1_score(y_test, y_hat, average='micro')

In [None]:
# Cross-validated training F1 next to held-out test F1, as set by the preceding cell
f1_train, f1_test

In [None]:
# Split the (name, estimator) pairs into parallel lists and a name -> estimator dictionary
classifier_names = []
classifier_list = []
for pair in model_list:
    classifier_names.append(pair[0])
    classifier_list.append(pair[1])
classifier_dict = dict(zip(classifier_names, classifier_list))

In [None]:
# Try each base model in turn as the meta-classifier of a stack over all base models
for name, classifier in classifier_dict.items():

    # Stacked classifier
    model = classifier

    # StackingClassifier expects bare estimators, so pass the dictionary values
    # rather than the (name, estimator) tuples in model_list
    stacked_classifier = StackingClassifier(classifiers=list(classifier_dict.values()),
                                        meta_classifier=model,
                                        use_probas=False)

    # The original scored `max_voting_classifer` on every iteration, so every
    # line printed the same numbers; the stack itself is evaluated now
    f1_train = cross_val_score(stacked_classifier,
            X_train, y_train, scoring='f1_micro', cv=5).mean()

    stacked_classifier.fit(X_train, y_train)
    y_hat = stacked_classifier.predict(X_test)

    f1_test = f1_score(y_test, y_hat, average='micro')

    print(name, f1_train, f1_test)

### Null accuracy

In [None]:
# Rows per class in the test split, most frequent first
y_test.value_counts()

In [None]:
# Null accuracy: share of the test split taken by its most frequent class, i.e. the
# accuracy of always predicting that class. value_counts() sorts descending, so
# .iloc[0] is the largest count. Taking the scalar (the original kept a one-row
# Series) makes the printed line a plain number without the index/dtype footer.
null_accuracy = y_test.value_counts().iloc[0] / len(y_test)
print('Null accuracy: ', null_accuracy)

## Visualize scores

### Model summary

In [None]:
# Start an empty summary frame with one column for model names and one for training F1
score_columns = ['Models', 'F1_score_train']
model_score_table = pd.DataFrame(columns=score_columns)
model_score_table

In [None]:
# Row labels for the summary table, one per evaluated model
# NOTE(review): 'Logistis_Regression' looks like a misspelling of "Logistic"; left
# unchanged in case saved tables or figures already key on it — confirm.
model_names = [
    'Logistis_Regression',
    'KNN',
    'Multinomial_NB',
    'SVC_poly',
    'Random_Forest',
    'GBM',
    'Dummy',
]
model_score_table['Models'] = model_names

In [None]:
# NOTE(review): `trainin` is not defined anywhere in the visible notebook (it looks like a
# truncated name), so this cell raises NameError as written.
# TODO: assign the per-model training F1 scores, in the same order as the 'Models' column.
model_score_table['F1_score_train'] = trainin

In [None]:
# Display the assembled summary table
model_score_table

In [None]:
# Save the summary table next to the report figures
with open('/Users/greenapple/project3/reports/figures/model_score_table_MVP.pkl', mode='wb') as table_file:
    pickle.dump(model_score_table, table_file)

### Confusion matrix

In [None]:
# Helper function for printing confusion matrices (see: https://gist.github.com/shaypal5/94c53d765083101efc0240d776a23823)

# Prints the confusion matrix as a heatmap, which is easier to read than the raw array

def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=18):
    """Draw a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as an annotated heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix.
        Similarly constructed ndarrays can also be used.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the outputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for the tick labels on both axes. Defaults to 18.

    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure

    Raises
    ------
    ValueError
        If the matrix holds non-integer values. Also raised by pandas when the
        number of class names does not match the matrix shape.
    """
    df_cm = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names, )

    # Fail fast on float counts: cells are annotated with the integer format "d",
    # which rejects floats. Checking first avoids leaving an empty figure behind.
    if any(dtype.kind == 'f' for dtype in df_cm.dtypes):
        raise ValueError("Confusion matrix values must be integers.")

    # Imported locally because the notebook's import cell never imports seaborn,
    # and `plt` otherwise exists only after the `%pylab inline` cell has been run.
    import matplotlib.pyplot as plt
    import seaborn as sns

    fig = plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError as err:
        # Keep the underlying seaborn error attached for debugging
        raise ValueError("Confusion matrix values must be integers.") from err
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return fig

In [None]:
# Confusion matrix on the test split, with one row/column per class.
# The class list comes from the full target so the axis order is fixed and
# complete; the original passed two placeholder names for a 30-class problem,
# which cannot label the matrix.
# NOTE(review): the original referenced an undefined `y_pred`; `y_hat` (the most
# recent predictions on X_test from the ensembling cells) is assumed — confirm
# which model's predictions should be plotted.
class_labels = sorted(y.unique())
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_hat, labels=class_labels)
# Larger canvas and smaller tick font so all class labels stay legible
cm = print_confusion_matrix(conf_mat, class_labels, figsize=(24, 20), fontsize=10)

### ROC

In [None]:
# Save the model for later.
# The original line was missing the opening quote (a syntax error) and called
# `joblib`, which the notebook never imports; pickle is used as everywhere else here.
# NOTE(review): the original dumped an undefined `logreg`; `logreg3_30_cls_model`
# (the logistic-regression model reloaded earlier) is assumed — confirm.
filename = '/Users/greenapple/project3/models/logreg.sav'
with open(filename, 'wb') as f:
    pickle.dump(logreg3_30_cls_model, f)