In [1]:
%load_ext jupyternotify
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import time
import csv

import re
import string
import librosa
import librosa.display
import IPython.display as ipd

from model_wrapper import ModelWrapper

<IPython.core.display.Javascript object>



In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import grid_search
from sklearn.grid_search import ParameterGrid

In [3]:
SAMPLE_RATE = 22050

In [4]:
%%notify
df_train = pd.read_csv("train.csv", header=None)
train = df_train.values

<IPython.core.display.Javascript object>

In [5]:
N_train = train.shape[0]
NUM_SAMPLES = train.shape[1]-1

X_train = train[:,:-1]
y_train = train[:,-1]

In [8]:
def extract_design_matrix(ffs, Xs):
    '''
    arguments:
        ffs: a list of feature functions, functions have to return a list (sorry)
        Xs: a numpy array of the data (X only)
    returns:
        a design matrix
    '''
    flatten = lambda l: [item for sublist in l for item in sublist]
    
    design_matrix = np.array([flatten([f(x) for f in ffs]) for x in Xs])
    return design_matrix

In [9]:
f0 = lambda x : [np.mean(np.array(x))]
f1 = lambda x : np.mean(librosa.feature.mfcc(x).T, axis = 0)
f2 = lambda x : np.mean(librosa.feature.spectral_centroid(x).T, axis = 0)
f3 = lambda x : np.mean(librosa.feature.spectral_flatness(x).T, axis = 0)
f4 = lambda x : np.mean(librosa.feature.spectral_rolloff(x).T, axis = 0)

''' Added these features and camelot accuracy got worse; also took forever to compute 
sample_rate = SAMPLE_RATE
chroma = lambda X: np.mean(librosa.feature.chroma_stft(S=np.abs(librosa.stft(X)), sr=sample_rate).T,axis=0)
mel = lambda X : np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
contrast = lambda X: np.mean(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(X)), sr=sample_rate).T,axis=0)
tonnetz = lambda X: np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),sr=sample_rate).T,axis=0)
'''
ffs = [f0, f1, f2, f3, f4]# chroma, mel, contrast, tonnetz]

In [10]:
%%notify
X_design = extract_design_matrix(ffs, X_train)

<IPython.core.display.Javascript object>

In [12]:
# save extracted features to csv
pd.DataFrame(X_design).to_csv("X_design_train.csv")
pd.DataFrame(y_train).to_csv("y_train.csv")

In [14]:
%%notify
# Random Forest
# rf_parameters = {'n_estimators': [50, 500, 750, 1000], 'max_features': ['sqrt', 'log2', None], 
#                  'max_depth': [4, 25, 50, None], 'class_weight': ['balanced', None]}
rf_parameters = {'n_estimators': [100, 500], 'max_features': ['sqrt', None], 'max_depth': [25, None], 'class_weight': ['balanced', None]}
rf = ModelWrapper(X_design, y_train, RandomForestClassifier(), rf_parameters)
rf.build()

Training model...
Done training model:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
-----
Train error: 1.0
Validation error: 0.961441213653603


<IPython.core.display.Javascript object>

In [15]:
%%notify
# kNN
knn_parameters = {'n_neighbors': [2, 5, 10], 'weights': ['uniform','distance'], 'p':[1, 2]}
knn = ModelWrapper(X_design, y_train, KNeighborsClassifier(), knn_parameters)
knn.build()

Training model...
Done training model:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=1,
           weights='distance')
-----
Train error: 1.0
Validation error: 0.8135271807838179


<IPython.core.display.Javascript object>

In [16]:
%%notify
# Logistic Regression
lr_parameters = {'cv': [3, 5], 'fit_intercept': [True, False], 'penalty': ['l2'], 'class_weight': ['balanced', None]}
lr = ModelWrapper(X_design, y_train, LogisticRegressionCV(), lr_parameters)
lr.build()

Training model...
Done training model:
LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
-----
Train error: 0.8406072106261859
Validation error: 0.8350189633375474


<IPython.core.display.Javascript object>

In [17]:
%%notify
# NN
nn_parameters = {'activation': ['identity','logistic','tanh','relu'], 'alpha': [.0001, .01, .1, 1]}
nn = ModelWrapper(X_design, y_train, MLPClassifier(), nn_parameters)
nn.build()

Training model...
Done training model:
MLPClassifier(activation='logistic', alpha=0.1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
-----
Train error: 0.8387096774193549
Validation error: 0.8324905183312263


<IPython.core.display.Javascript object>

In [26]:
%%notify
# SVM
svm_parameters = {'C':[.25, .5, 1, 5], 'kernel':['linear', 'rbf']}
svm = ModelWrapper(X_design, y_train, SVC(), svm_parameters)
svm.build()

In [None]:
# X_design = pd.read_csv("X_design_train.csv").values
# y_train = pd.read_csv("y_train.csv")['0'].values

In [20]:
%%notify
# XG Boost
xg_parameters = {'n_estimators': [50, 200, 500], 'max_depth':[2, 3, 5]}
xg = ModelWrapper(X_design, y_train, GradientBoostingClassifier(), xg_parameters)
xg.build()

Training model...
Done training model:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
-----
Train error: 1.0
Validation error: 0.9551201011378002


<IPython.core.display.Javascript object>

In [27]:
%%notify
##########################
## Generate predictions ##
##########################

df_test = pd.read_csv("test.csv", header=None)
X_test = df_test.values[:,1:]

<IPython.core.display.Javascript object>

In [113]:
# keep track of data points that are 0's
missing_data = []
for index, row in df_test.iloc[:,1:].iterrows():
    if sum(row) == 0.0:
        missing_data.append(index)
print(missing_data)

[12, 13, 19, 24, 28, 33, 34, 37, 42, 49, 50, 56, 62, 63, 65, 75, 98, 108, 112, 116, 123, 127, 128, 136, 139, 142, 145, 151, 159, 163, 169, 173, 175, 176, 182, 185, 188, 195, 199, 202, 208, 213, 216, 225, 235, 238, 241, 254, 255, 260, 265, 266, 267, 279, 281, 282, 322, 331, 332, 335, 340, 341, 344, 352, 369, 377, 379, 387, 412, 421, 422, 428, 455, 459, 462, 467, 473, 474, 477, 485, 487, 490, 504, 537, 541, 543, 544, 553, 554, 567, 570, 591, 594, 599, 602, 603, 632, 643, 644, 646, 647, 659, 661, 682, 683, 692, 694, 698, 710, 717, 722, 733, 740, 745, 748, 750, 751, 769, 771, 775, 776, 780, 785, 797, 803, 813, 815, 816, 817, 825, 828, 872, 875, 888, 891, 894, 900, 906, 924, 942, 948, 962, 971, 975, 977, 981, 992]


In [28]:
X_test_design = extract_design_matrix(ffs, X_test)
pd.DataFrame(X_test_design).to_csv("X_design_test.csv")

In [142]:
Y_pred = xg.model.predict(X_test_design)

ids = df_test.values[:,0]
preds = np.hstack((ids[np.newaxis].T, Y_pred[np.newaxis].T)).astype(int)

header = ['Id', 'Prediction']
preds = np.vstack((header,preds))

In [144]:
# replace all the missing data points with class '0', the most frequent sound
for i in range(1, preds.shape[0]):
    if int(preds[i,0]) in missing_data:
        preds[i,1] = '0'

In [145]:
import csv

with open("preds_xg.csv", "w") as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(preds)

In [147]:
names = ["mean", "mfcc", "centroid", "flatness", "rolloff"]#, "chroma", "mel", "contrast", "tonnetz"]
print(sorted(zip(map(lambda x: round(x, 4), rf.model.feature_importances_), names), 
             reverse=True))