## In this notebook, I select my features, scale them, and resample the data so that the classes are balanced. 

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [2]:
# Read Data
with open('all_engineered_labeled_data.pickle', 'rb') as read_file:
    df = pickle.load(read_file)
    
df.head()

Unnamed: 0,centroid,chroma,chroma_cens,chroma_cqt,contrast,file_number,flatness,index,mean_centroid,mean_chroma,...,pctl_50_spec_contrast,pctl_75_spec_contrast,stdev_tonnetz,pctl_25_tonnetz,pctl_50_tonnetz,pctl_75_tonnetz,stdev_zero_cr,pctl_25_zero_cr,pctl_50_zero_cr,pctl_75_zero_cr
0,"[504.48257148, 486.40693278, 470.00318888, 458...","[0.54581436, 0.60204302, 0.75422708, 0.8372386...","[0.23143955, 0.23162181, 0.23182812, 0.2789998...","[0.55334989, 0.73690282, 0.70845463, 0.7077047...","[19.62451933, 11.26503332, 12.72140427, 56.165...",1,"[1.32692594e-06, 3.85900563e-07, 8.42322656e-0...",train1.wav,443.787989,0.685376,...,16.262727,20.029834,0.024888,0.005155,0.019143,0.041264,0.004743,0.039795,0.04248,0.04541
1,"[577.97243044, 524.01977987, 427.78777341, 420...","[0.6937604, 0.6619282, 0.67327483, 0.83409719,...","[0.21704897, 0.21644774, 0.21590343, 0.2980798...","[0.91323336, 0.72939455, 0.66281015, 1.0, 0.99...","[16.21541347, 10.36397665, 13.25033723, 54.129...",2,"[6.5010463e-05, 2.9992112e-05, 4.2579142e-08, ...",train2.wav,463.286988,0.70604,...,15.050572,19.15955,0.008353,-0.000481,0.003629,0.007551,0.004022,0.040771,0.041992,0.043945
2,"[559.55729635, 528.68537997, 493.25946834, 479...","[0.67377754, 0.67545543, 0.76603449, 0.7567035...","[0.36211961, 0.3571492, 0.35205534, 0.28488815...","[0.57887618, 0.8278652, 0.80876065, 0.44839986...","[19.49029831, 10.76560549, 12.28624703, 54.706...",3,"[5.3348005e-05, 1.6811937e-05, 3.5331706e-08, ...",train3.wav,468.395865,0.727035,...,15.721419,19.831365,0.009168,-0.003178,0.002421,0.009207,0.004212,0.041992,0.043457,0.04541
3,"[476.40339026, 466.14735071, 486.65554654, 493...","[0.71536, 0.68535892, 0.68436785, 0.73879001, ...","[0.38081637, 0.37959923, 0.3784084, 0.28371449...","[0.71287569, 0.91182481, 0.81314382, 0.8411274...","[21.99398376, 11.21640425, 13.21343611, 56.224...",4,"[3.93400853e-07, 1.46186906e-07, 3.61700003e-0...",train4.wav,456.602276,0.71394,...,15.965496,20.192855,0.008188,0.003535,0.005865,0.011568,0.005485,0.035645,0.041016,0.044678
4,"[549.73110503, 506.98113396, 473.99013424, 473...","[0.62032704, 0.61695566, 0.67535189, 0.6934418...","[0.29312185, 0.29195579, 0.2914121, 0.29096149...","[0.43504997, 0.84417333, 0.96267805, 0.2019479...","[21.30596447, 10.20325473, 12.75112812, 55.260...",5,"[3.01305136e-05, 8.84687415e-06, 1.99340775e-0...",train5.wav,471.288952,0.69525,...,15.684625,19.96512,0.008685,-0.001283,0.002361,0.012658,0.006112,0.040771,0.043945,0.047363


In [3]:
# remove unneccesary columns - these are arrays
X = df.drop(columns=['label','centroid','chroma', 'chroma_cens', 'chroma_cqt', 'contrast','file_number', 'flatness', 'index','mel', 'mfccs','rmse', 'rolloff', 'spec_bw', 'spec_contrast', 'tonnetz', 'zero_cr'])
y = df.label

### Hold out 20% of data as pure test set. Stratify to maintain % of each class in each split.

In [4]:
X_trainVal, X_test, y_trainVal, y_test = train_test_split(X, y, test_size=0.20, random_state=101,stratify=y)

### Set up validation set

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X_trainVal, y_trainVal, test_size=0.20, random_state=101,stratify=y_trainVal)

# Pickle test and trainVal sets

In [None]:
with open('X_test', 'wb') as to_write:
    pickle.dump(X_test, to_write) 
with open('y_test', 'wb') as to_write:
    pickle.dump(y_test, to_write) 
with open('X_trainVal', 'wb') as to_write:
    pickle.dump(X_trainVal, to_write) 
with open('y_trainVal', 'wb') as to_write:
    pickle.dump(y_trainVal, to_write) 

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X_trainVal, y_trainVal, test_size=0.20, random_state=101,stratify=y_trainVal)

# Selecting Features

In [3]:
extra_trees = ExtraTreesClassifier()
extra_trees.fit(X_train,y_train)
importances = extra_trees.feature_importances_
print(importances)

NameError: name 'X_train' is not defined

In [6]:
print(np.mean(importances))
print(np.max(importances))
print(np.min(importances))
print(np.argmin(importances))

0.014285714285714285
0.10734239105571977
0.002704274908297886
35


In [7]:
importances = dict(zip(X_train.columns,importances))
print(importances)

{'mean_centroid': 0.00956499966950861, 'mean_chroma': 0.011507438699748814, 'mean_chroma_cens': 0.014182609841153581, 'mean_chroma_cqt': 0.0317800246377064, 'mean_contrast': 0.017123813977686527, 'mean_flatness': 0.009089014187211902, 'mean_mel': 0.009940266853127391, 'mean_mfccs': 0.10734239105571977, 'mean_rmse': 0.01750655594395714, 'mean_rolloff': 0.009023033349186934, 'mean_spec_bw': 0.01025448584029864, 'mean_spec_contrast': 0.015208073920625117, 'mean_tonnetz': 0.009311237906190229, 'mean_zero_cr': 0.010652348188579621, 'stdev_centroid': 0.015621446356337785, 'pctl_25_centroid': 0.011553226897678858, 'pctl_50_centroid': 0.011631115775086206, 'pctl_75_centroid': 0.010586989657613604, 'stdev_chroma': 0.011715485436876864, 'pctl_25_chroma': 0.014344300482393916, 'pctl_50_chroma': 0.010436373417608499, 'pctl_75_chroma': 0.008988800138444799, 'stdev_chroma_cens': 0.01091426664353152, 'pctl_25_chroma_cens': 0.010906567282768023, 'pctl_50_chroma_cens': 0.010196581632246238, 'pctl_75_ch

In [8]:
for i in importances: 
    if importances[i] > 0.014:
        print(i)

mean_chroma_cens
mean_chroma_cqt
mean_contrast
mean_mfccs
mean_rmse
mean_spec_contrast
stdev_centroid
pctl_25_chroma
pctl_75_contrast
stdev_mfccs
pctl_25_mfccs
pctl_50_mfccs
pctl_75_mfccs
stdev_rmse
pctl_75_rmse
stdev_rolloff
pctl_25_spec_contrast


In [4]:
# Try with subset of features with greatest importance. 
X2 = df[['mean_chroma_cqt','mean_contrast','mean_mfccs','mean_spec_bw','mean_spec_contrast','stdev_centroid','pctl_75_contrast','stdev_mel','stdev_mfccs','pctl_25_mfccs','pctl_50_mfccs','pctl_75_mfccs','stdev_rmse','pctl_25_rmse','pctl_50_rmse','pctl_50_spec_bw','pctl_50_spec_contrast','pctl_75_spec_contrast','stdev_tonnetz','pctl_25_zero_cr']]
y2 = df.label

X2_trainVal, X2_test, y2_trainVal, y2_test = train_test_split(X2, y2, test_size=0.20, random_state=101,stratify=y2)
X2_train, X2_val, y2_train, y2_val = train_test_split(X2_trainVal, y2_trainVal, test_size=0.20, random_state=101,stratify=y2_trainVal)

# Try with a different subset of features
X3 = df[['mean_chroma_cens','mean_chroma_cqt','mean_contrast','mean_mfccs','mean_rmse','mean_spec_contrast','stdev_centroid','pctl_25_chroma','pctl_75_contrast','stdev_mfccs','pctl_25_mfccs','pctl_50_mfccs','pctl_75_mfccs','stdev_rmse','pctl_75_rmse','stdev_rolloff','pctl_25_spec_contrast']]
y3 = df.label

X3_trainVal, X3_test, y3_trainVal, y3_test = train_test_split(X3, y3, test_size=0.20, random_state=101,stratify=y3)
X3_train, X3_val, y3_train, y3_val = train_test_split(X3_trainVal, y3_trainVal, test_size=0.20, random_state=101,stratify=y3_trainVal)


In [55]:
logmodel2 = LogisticRegression()
logmodel2.fit(X2_train,y2_train)
log_predictions2 = logmodel2.predict(X2_val)
print(classification_report(y2_val,log_predictions2))
print(confusion_matrix(y2_val,log_predictions2))



              precision    recall  f1-score   support

           0       0.87      0.90      0.88      3676
           1       0.63      0.55      0.59      1124

   micro avg       0.82      0.82      0.82      4800
   macro avg       0.75      0.73      0.74      4800
weighted avg       0.81      0.82      0.82      4800

[[3309  367]
 [ 501  623]]


# Use pipeline models to identify which subset of features performs best

In [None]:
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(random_state=42))])

pipe_knn = Pipeline([('scl', StandardScaler()),
                    ('clf', KNeighborsClassifier(n_neighbors=5))])

pipe_rf = Pipeline([('scl', StandardScaler()),
                    ('clf', RandomForestClassifier(n_estimators=100))])

pipe_svm = Pipeline([('scl', StandardScaler()),
                    ('clf', SVC(kernel="linear"))])


pipelines = [pipe_lr, pipe_knn, pipe_rf, pipe_svm]

for pipe in pipelines:
    pipe.fit(X2_train, y2_train)
    prediction_name = (str(pipe) + "_predictions")
    prediction_name = pipe.predict(X2_val)
    print(prediction_name)
    print(classification_report(y2_val,prediction_name))
    print(confusion_matrix(y2_val,prediction_name))



[0 0 0 ... 0 0 0]
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      3676
           1       0.65      0.56      0.60      1124

   micro avg       0.82      0.82      0.82      4800
   macro avg       0.76      0.73      0.74      4800
weighted avg       0.82      0.82      0.82      4800

[[3332  344]
 [ 496  628]]
[0 1 1 ... 0 0 0]
              precision    recall  f1-score   support

           0       0.88      0.89      0.89      3676
           1       0.64      0.61      0.63      1124

   micro avg       0.83      0.83      0.83      4800
   macro avg       0.76      0.75      0.76      4800
weighted avg       0.83      0.83      0.83      4800

[[3288  388]
 [ 435  689]]
[0 1 1 ... 0 0 0]
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      3676
           1       0.71      0.58      0.64      1124

   micro avg       0.85      0.85      0.85      4800
   macro avg       0

In [5]:
#X3 

pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(random_state=42))])

pipe_knn = Pipeline([('scl', StandardScaler()),
                    ('clf', KNeighborsClassifier(n_neighbors=5))])

pipe_rf = Pipeline([('scl', StandardScaler()),
                    ('clf', RandomForestClassifier(n_estimators=100))])

pipe_svm = Pipeline([('scl', StandardScaler()),
                    ('clf', SVC(kernel="linear"))])


pipelines = [pipe_lr, pipe_knn, pipe_rf, pipe_svm]

for pipe in pipelines:
    pipe.fit(X3_train, y3_train)
    prediction_name = (str(pipe) + "_predictions")
    prediction_name = pipe.predict(X3_val)
    print(prediction_name)
    print(classification_report(y3_val,prediction_name))
    print(confusion_matrix(y3_val,prediction_name))



[0 0 1 ... 0 0 0]
              precision    recall  f1-score   support

           0       0.87      0.90      0.89      3676
           1       0.64      0.57      0.61      1124

   micro avg       0.82      0.82      0.82      4800
   macro avg       0.76      0.74      0.75      4800
weighted avg       0.82      0.82      0.82      4800

[[3313  363]
 [ 479  645]]
[0 0 1 ... 0 0 0]
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      3676
           1       0.64      0.60      0.62      1124

   micro avg       0.83      0.83      0.83      4800
   macro avg       0.76      0.75      0.75      4800
weighted avg       0.82      0.83      0.83      4800

[[3304  372]
 [ 452  672]]
[0 0 1 ... 0 0 0]
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      3676
           1       0.70      0.58      0.63      1124

   micro avg       0.84      0.84      0.84      4800
   macro avg       0

Note, performed worse. Go with X2 subset

# Resampling

In [20]:
# Over-sampling
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X_train,y_train)

In [21]:
Counter(y_resampled)

Counter({0: 14702, 1: 14702})

In [22]:
# Logistic Regression on over-sampled data
logmodel_over = LogisticRegression()
logmodel_over.fit(X_train,y_train)
log_predictions_over = logmodel_over.predict(X_val)



In [23]:
print(classification_report(y_val,log_predictions_over))
print(confusion_matrix(y_val,log_predictions_over))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      3676
           1       0.65      0.56      0.60      1124

   micro avg       0.82      0.82      0.82      4800
   macro avg       0.76      0.73      0.74      4800
weighted avg       0.82      0.82      0.82      4800

[[3333  343]
 [ 497  627]]
