In [260]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from datetime import datetime

## Data formatting for classification

In [122]:
# Unpickle data 
with open('/Users/greenapple/project3/data/processed/house_bal.pkl', 'rb') as f:
    house_bal = pickle.load(f)

In [123]:
house_bal.head()

Unnamed: 0,video_id_list,y,y_name,0,1,2,3,4,5,6,...,630,631,632,633,634,635,636,637,638,639
5,b'--ZhevVpy1s',375,toothbrush,117,35,163,90,198,103,63,...,0,1,202,9,116,0,247,72,44,166
20,b'-2hQKCE-oTI',53,footsteps,30,162,44,7,216,116,206,...,213,109,50,88,19,46,54,154,42,211
28,b'-3pPrlCm6gg',198,clarinet,179,190,122,19,0,114,255,...,58,15,207,0,108,43,97,57,42,0
63,b'-70wVF5u-gg',366,chopping_food,0,114,186,34,87,250,58,...,0,122,144,63,110,255,139,138,59,154
82,b'-ASYwidRD7M',43,snoring,53,100,144,84,223,68,95,...,56,78,153,65,208,207,200,255,255,66


In [124]:
house_bal.shape

(45717, 643)

In [125]:
len(house_bal.y_name.value_counts())

30

In [126]:
house_bal.y_name.value_counts()

speech            4042
music             3781
laughter          3772
snoring           3370
vacuum_cleaner    3054
typing            2644
dishes_pots       2560
frying_food       2102
blender           1884
toilet_flush      1882
door              1868
whoop             1736
footsteps         1492
baby_cry          1414
screeming         1116
whispering         972
clarinet           960
crying             918
microwave          894
television         866
hair_dryer         772
video_games        592
shaving            552
bathtab            472
water_tap          458
chopping_food      410
meow               388
dog                358
purr               304
toothbrush          84
Name: y_name, dtype: int64

In [127]:
five_class_list = [
    'speech',
    'footsteps',
    'toilet_flush',
    'bathtab',
    'purr'
]

In [128]:
two_class_list = [
    'footsteps',
    'purr'
]

In [129]:
house_5_classes = house_bal.loc[house_bal.y_name.isin(five_class_list)]

In [130]:
house_2_classes = house_bal.loc[house_bal.y_name.isin(two_class_list)]

In [131]:
house_5_classes.shape

(8192, 643)

In [132]:
house_5_classes.head()

Unnamed: 0,video_id_list,y,y_name,0,1,2,3,4,5,6,...,630,631,632,633,634,635,636,637,638,639
20,b'-2hQKCE-oTI',53,footsteps,30,162,44,7,216,116,206,...,213,109,50,88,19,46,54,154,42,211
138,b'-G_hnfp4a0M',53,footsteps,141,93,96,107,139,0,220,...,250,104,46,0,98,12,234,61,81,133
150,b'-IWlQN6cfe4',53,footsteps,78,146,188,30,200,44,120,...,130,96,33,255,83,228,123,120,58,236
281,b'-blH_CYo09w',53,footsteps,190,217,227,113,96,140,43,...,255,255,163,45,183,31,39,0,54,196
552,b'0DefiYiEF4E',0,speech,104,150,137,162,243,70,59,...,192,216,0,197,0,144,128,38,127,216


In [133]:
# Assign features X and target y
X = house_2_classes[house_2_classes.columns[3:643]]
y = house_2_classes.y_name

In [134]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

X_train.shape, X_val.shape, X_test.shape

In [136]:
# Look up attributes
RandomOverSampler('self').get_params().keys()

dict_keys(['random_state', 'ratio', 'return_indices', 'sampling_strategy'])

In [287]:
# MOVE TO SRC FOLDER

import numpy as np
import pandas as pd
import pickle
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.svm import SVC

from sklearn.metrics import f1_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, GridSearchCV
import sklearn
from datetime import datetime

def train_fit(model, 
              param_grid, 
              tranformer, 
              X_train, 
              y_train, 
              X_test, 
              y_test, 
              CV):
   
    '''Function searches for the best model paramenters through a cross-validation on the train set
    and returns results of test set fit.
   
    Args:
        model - supervised learning classifier
        param_grid (dict) - set of model specific parameters
        tranformer - data transformer
        X_train (DataFrame) - training data set
        y_train (DataFrame) - training target set
        X_test (DataFrame) - test data set
        y_test (DataFrame) - test traget set
        CV (int) - number of cross-validation folds
        
    Returns:
        dictionary with the following keys:
            train_score
            test_score
            params
            estimator
            test_proba
            y_hat
        model
    '''
    # Time the function: record start time
    start_time = datetime.now()
    
    pipe = Pipeline([('transformer', transformer), ('model', model)])
    
    # Parameter grid
    grid = GridSearchCV(pipe, param_grid, cv=CV, scoring='f1_micro', refit=True)
    grid.fit(X_train, y_train)
    
    # Time the function: record end time
    end_time = datetime.now()
    time = str(end_time - start_time)
    
    # Parameters and scores
    params_scores_pred = {
        'best_train_score':grid.best_score_,
        'best_test_score': grid.score(X_test, y_test),
        'time': time,
        'best_params':grid.best_params_,
        'best_estimator':grid.best_estimator_,
        'best_test_proba': grid.predict_proba(X_test),
        'best_y_hat': grid.predict(X_test),
        'all_scores': grid.cv_results_

    }

    return params_scores_pred, grid

In [291]:
np.logspace(0, 4, 10)

array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04])

## Build and evaluate models

### Logistic regression

In [293]:
# Train and fit logistic regression
# Function arguments:
model = LogisticRegression(random_state=3)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)

param_grid = dict(
        model__C = C, 
        model__penalty = penalty)

# Call fitting function
logreg_2_cls_scores_params, logreg_2_cls_models = train_fit(model, 
                                                            param_grid, 
                                                            tranformer, 
                                                            X_train, 
                                                            y_train, 
                                                            X_test, 
                                                            y_test, 
                                                            CV)

# Pickle results
with open('/Users/greenapple/project3/models/logreg_2_cls_scores_params.pkl', 'wb') as f:
    pickle.dump(logreg_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/logreg_2_cls_models.pkl', 'wb') as f:
    pickle.dump(logreg_2_cls_models, f)







### K-Nearest Neighbors

In [None]:
# Train and fit KNN

# Function arguments:
model = KNeighborsClassifier()
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
k_range = range(1, 31)

param_grid = dict(model__n_neighbors=k_range)

# Call fitting function
KNN_2_cls_scores_params, KNN_2_cls_models = train_fit(model, 
                                                            param_grid, 
                                                            tranformer, 
                                                            X_train, 
                                                            y_train, 
                                                            X_test, 
                                                            y_test, 
                                                            CV)

# Pickle results
with open('/Users/greenapple/project3/models/KNN_2_cls_scores_params.pkl', 'wb') as f:
    pickle.dump(KNN_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/KNN_2_cls_models.pkl', 'wb') as f:
    pickle.dump(KNN_2_cls_models, f)

### Naive Bayes MultiNomial ?? alpha

In [None]:
# Train and fit naive Bayes MultiNomial

# Function arguments:
model = MultinomialNB()
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
alphas = [0.001, 0.01, 0.1, 1, 10]
# selects the min alpha!!!

param_grid = dict(model__alpha=alphas)

# Call fitting function
NBmultinomial_2_cls_scores_params, NBmultinomial_2_cls_models = train_fit(model, 
                                                            param_grid, 
                                                            tranformer, 
                                                            X_train, 
                                                            y_train, 
                                                            X_test, 
                                                            y_test, 
                                                            CV)

# Pickle results
with open('/Users/greenapple/project3/models/NBmultinomial_2_cls_scores_params.pkl', 'wb') as f:
    pickle.dump(NBmultinomial_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/NBmultinomial_2_cls_models.pkl', 'wb') as f:
    pickle.dump(NBmultinomial_2_cls_models, f)

### Support Vector Machines

In [None]:
# Train and fit SVC

# Function arguments:
model = svm.SVC(probability=True)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]
gamma = [0.001, 0.01, 0.1, 1]
degree = [2, 3]

param_grid = dict(model__C=C,
                 model__kernel=kernel,
                 model__gamma=gamma,
                 model__degree=degree
                 )

# Call fitting function
SVC_2_cls_scores_params, SVC_2_cls_models = train_fit(model, 
                                                            param_grid, 
                                                            tranformer, 
                                                            X_train, 
                                                            y_train, 
                                                            X_test, 
                                                            y_test, 
                                                            CV)

# Pickle results
with open('/Users/greenapple/project3/models/SVC_2_cls_scores_params.pkl', 'wb') as f:
    pickle.dump(SVC_2_cls_scores_params, f)
    
# Pickle model
with open('/Users/greenapple/project3/models/SVC_2_cls_models.pkl', 'wb') as f:
    pickle.dump(SVC_2_cls_models, f)

### Decision  trees

### Dummy classifier ??????

In [None]:
# Train and fit Dummy classifier

# Function arguments:
model = DummyClassifier(random_state=3, strategy=?????????????)
transformer = RandomOverSampler(random_state=3, sampling_strategy='minority')
CV = 5

# Function arguments: classifier params
k_range = range(1, 31)

param_grid = dict(model__n_neighbors=k_range)

# Call fitting function
Dummy_2_classes = train_fit(model, param_grid, tranformer, X_train, y_train, X_test, y_test, CV)

# Pickle results
with open('/Users/greenapple/project3/models/Dummy_2_classes.pkl', 'wb') as f:
    pickle.dump(Dummy_2_classes, f)

In [201]:
# Dummy classifier
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
f1_dummy = f1_score(y_test, dummy.predict(X_test), average='micro')
accuracy_dummy = accuracy_score(y_test, dummy.predict(X_test))

<function sklearn.metrics.classification.accuracy_score(y_true, y_pred, normalize=True, sample_weight=None)>

In [208]:
print('Dummy classifier F1 score: ', f1_dummy)
print('Dummy classifier accuracy score: ', accuracy_dummy)

Dummy classifier F1 score:  0.7055555555555556
Dummy classifier accuracy score:  0.7138888888888889


### Null accuracy

In [194]:
y_test.value_counts()

footsteps    291
purr          69
Name: y_name, dtype: int64

In [209]:
null_accuracy = y_test.value_counts().head(1) / len(y_test)
print('Null accuracy: ', null_accuracy)

Null accuracy:  footsteps    0.808333
Name: y_name, dtype: float64


In [None]:
# Save the model for later
filename = /Users/greenapple/project3/models/logreg.sav'
joblib.dump(logreg, filename)