In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import time

import re
import string
import librosa
import librosa.display
import IPython.display as ipd

from model_wrapper import ModelWrapper

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn import grid_search
from sklearn.grid_search import ParameterGrid



In [3]:
SAMPLE_RATE = 22050

In [4]:
#%%notify
df_train = pd.read_csv("train.csv")
train = df_train.values

<IPython.core.display.Javascript object>

In [10]:
N_train = train.shape[0]
NUM_SAMPLES = train.shape[1]-1

X_train = train[:,:-1]
y_train = train[:,-1]

In [8]:
def extract_design_matrix(ffs, Xs):
    '''
    arguments:
        ffs: a list of feature functions, functions have to return a list (sorry)
        Xs: a numpy array of the data (X only)
    returns:
        a design matrix
    '''
    flatten = lambda l: [item for sublist in l for item in sublist]
    
    design_matrix = np.array([flatten([f(x) for f in ffs]) for x in Xs])
    return design_matrix

In [93]:
f0 = lambda x : [np.mean(np.array(x))]
f1 = lambda x : np.mean(librosa.feature.mfcc(x).T, axis = 0)
f2 = lambda x : np.mean(librosa.feature.spectral_centroid(x).T, axis = 0)
f3 = lambda x : np.mean(librosa.feature.spectral_flatness(x).T, axis = 0)
f4 = lambda x : np.mean(librosa.feature.spectral_rolloff(x).T, axis = 0)

''' Added these features and camelot accuracy got worse; also took forever to compute 
sample_rate = SAMPLE_RATE
chroma = lambda X: np.mean(librosa.feature.chroma_stft(S=np.abs(librosa.stft(X)), sr=sample_rate).T,axis=0)
mel = lambda X : np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
contrast = lambda X: np.mean(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(X)), sr=sample_rate).T,axis=0)
tonnetz = lambda X: np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),sr=sample_rate).T,axis=0)
'''
ffs = [f0, f1, f2, f3, f4]# chroma, mel, contrast, tonnetz]

In [94]:
%%notify
X_design = extract_design_matrix(ffs, X_train)



<IPython.core.display.Javascript object>

In [96]:
rf = ModelWrapper(X_design, y_train, RandomForestClassifier(n_estimators = 100))
rf.build()

Training model...
Done training model.
-----
Train error: 1.0
Validation error: 0.988614800759


In [25]:
%%notify

##########################
## Generate predictions ##
##########################

df_test = pd.read_csv("test.csv")

<IPython.core.display.Javascript object>

In [26]:
df_test.shape

(999, 88201)

In [29]:
X_test = df_test.values[:,1:]

In [98]:
%%notify
X_test_design = extract_design_matrix(ffs, X_test)



<IPython.core.display.Javascript object>

In [99]:
ids = df_test.values[:,0]

In [100]:
Y_pred = rf.model.predict(X_test_design)

In [101]:
preds = np.hstack((ids[np.newaxis].T, Y_pred[np.newaxis].T)).astype(int)
## TEMPORARY, FIX LATER
preds = np.vstack(([0,0], preds))

header = ['Id', 'Prediction']
preds = np.vstack((header,preds))

In [102]:
import csv

with open("pred1.csv", "w") as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(preds)

In [103]:
names = ["mean", "mfcc", "centroid", "flatness", "rolloff"]#, "chroma", "mel", "contrast", "tonnetz"]
print(sorted(zip(map(lambda x: round(x, 4), rf.model.feature_importances_), names), 
             reverse=True))

[(0.0054999999999999997, 'rolloff'), (0.0047000000000000002, 'tonnetz'), (0.0047000000000000002, 'chroma'), (0.0038, 'mfcc'), (0.0035999999999999999, 'centroid'), (0.0030000000000000001, 'contrast'), (0.0025999999999999999, 'mel'), (0.0023, 'flatness'), (0.0012999999999999999, 'mean')]
