In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import svm

from util_functions import process_files_to_mfccs

In [2]:
# Load data: build the training and test tables with the project helper,
# one call per dataset name.
df_train, df_test = (
    process_files_to_mfccs(dataset='training'),
    process_files_to_mfccs(dataset='test'),
)

In [3]:
def classification_svm(df_train,df_test,C,gamma):
    """Fit an SVC on ``df_train`` and return per-index-label truths and votes.

    Every column except the last one is used as a feature (the same column
    count, taken from ``df_train``, is applied to ``df_test``); the target is
    the ``'Label'`` column. Row-level predictions are averaged over rows that
    share an index label and thresholded at 0.5, giving one 0/1 prediction
    per label. The true labels are averaged over the same groups.

    NOTE(review): presumably each index label identifies one audio file and
    'Label' is a 0/1 column stored last -- confirm against
    ``process_files_to_mfccs``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Feature tables containing a 'Label' column.
    C, gamma : float
        Hyper-parameters forwarded to ``svm.SVC``.

    Returns
    -------
    tuple of pandas.Series
        ``(y_train_last, y_pred_train_last, y_test_last, y_pred_test_last)``,
        each indexed by the unique index labels of the matching frame.
    """

    def vote(row_predictions, row_index):
        # Mean prediction per index label, then majority threshold -> 0/1.
        per_row = pd.Series(row_predictions, index=row_index)
        group_mean = per_row.groupby(per_row.index).mean()
        return (group_mean >= 0.5) * 1

    model = svm.SVC(C=C, gamma=gamma)

    # Feature width comes from the training frame and is reused for the test
    # frame so both matrices have the same number of columns.
    n_features = df_train.shape[1] - 1
    train_features = df_train.iloc[:, 0:n_features]
    test_features = df_test.iloc[:, 0:n_features]
    train_labels = df_train['Label']

    model.fit(train_features, train_labels)
    row_pred_train = model.predict(train_features)
    row_pred_test = model.predict(test_features)

    y_pred_train_last = vote(row_pred_train, df_train.index)
    y_pred_test_last = vote(row_pred_test, df_test.index)

    # Collapse the true labels to one value per index label as well.
    y_train_last = train_labels.groupby(train_labels.index).mean()
    test_labels = df_test['Label']
    y_test_last = test_labels.groupby(test_labels.index).mean()

    return y_train_last, y_pred_train_last, y_test_last, y_pred_test_last

In [4]:
def cross_validate(df_train,folds):
    """Grid-search ``C`` and ``gamma`` for the SVM using K-fold cross-validation.

    The unique index labels of ``df_train`` are shuffled and split into
    ``folds`` folds, so all rows sharing an index label land in the same fold
    (presumably one label per audio file -- confirm against the loader).
    For every (C, gamma) pair on a 13 x 13 logarithmic grid the model is
    trained on the training part of each fold and scored on the held-out
    part; the pair with the highest mean held-out accuracy is returned.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training table accepted by ``classification_svm``.
    folds : int
        Number of cross-validation folds.

    Returns
    -------
    (best_C, best_gamma) : tuple of float
        Grid values with the best mean validation accuracy (first maximum
        wins on ties, per ``np.argmax``).

    Notes
    -----
    The shuffle uses NumPy's global random state; call ``np.random.seed``
    beforehand for a reproducible split.
    """
    C_range = np.logspace(-2, 10, 13)
    gamma_range = np.logspace(-9, 3, 13)
    results = np.empty((folds, len(C_range), len(gamma_range)))

    # np.unique returns a fresh array, so shuffling it leaves df_train intact.
    group_ids = np.unique(df_train.index.values)
    np.random.shuffle(group_ids)

    # The splitter must honour `folds`; a hard-coded split count would leave
    # rows of `results` unwritten (or overflow it) whenever folds != 3.
    kf = KFold(n_splits=folds)

    # enumerate() supplies the fold number, so every fold writes its own
    # slice of `results` instead of overwriting slice 0.
    for fold, (train_pos, val_pos) in enumerate(kf.split(group_ids)):
        fold_train = df_train.loc[group_ids[train_pos]]
        fold_val = df_train.loc[group_ids[val_pos]]
        for i, C in enumerate(C_range):
            for j, gamma in enumerate(gamma_range):
                _, _, y_val, y_pred_val = classification_svm(fold_train, fold_val, C, gamma)
                results[fold, i, j] = calculate_accuracies(y_true=y_val, y_pred=y_pred_val)

    average_acc = results.mean(axis=0)
    best_i, best_j = np.unravel_index(np.argmax(average_acc, axis=None), average_acc.shape)

    return C_range[best_i], gamma_range[best_j]

In [5]:
def calculate_accuracies(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true, y_pred : pandas.Series, numpy.ndarray or sequence
        Ground-truth and predicted labels of equal length.

    Returns
    -------
    float
        Accuracy in [0, 1]; ``nan`` when the inputs are empty.

    Raises
    ------
    ValueError
        If two Series are not identically labelled (raised by pandas), or if
        non-Series inputs have different shapes.
    """
    if isinstance(y_true, pd.Series) and isinstance(y_pred, pd.Series):
        # Series comparison insists on identical labels, which protects
        # against silently scoring misaligned groups.
        matches = (y_true == y_pred).values
    else:
        # Coerce to arrays so plain lists are compared element-wise rather
        # than as whole objects (list == list yields a single bool).
        true_values = np.asarray(y_true)
        pred_values = np.asarray(y_pred)
        if true_values.shape != pred_values.shape:
            raise ValueError(
                'y_true and y_pred must have the same shape, got %s and %s'
                % (true_values.shape, pred_values.shape)
            )
        matches = true_values == pred_values

    if matches.size == 0:
        # No samples: accuracy is undefined; avoid dividing by zero.
        return np.nan
    return np.sum(matches) / matches.size

In [6]:
def class_report(y_true, y_pred, accuracy, dataset):
    """Print the confusion matrix, classification report and accuracy.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, passed to ``confusion_matrix`` and
        ``classification_report``.
    accuracy : float
        Pre-computed accuracy; printed via ``str()``.
    dataset : str
        Name used as the prefix of every heading (e.g. 'Training').
    """
    heading = dataset + ' Set:\n'
    print(heading)

    matrix_title = dataset + ' Confusion Matrix:'
    print(matrix_title)
    print(confusion_matrix(y_true, y_pred))

    report_title = dataset + ' Classification report:'
    print(report_title)
    print(classification_report(y_true, y_pred))

    accuracy_line = dataset + ' Accuracy: ' + str(accuracy)
    print(accuracy_line)
    print('\n')

In [None]:
# Classification with a support vector machine using MFCC features.
# Search the C/gamma grid by cross-validation on the training set.
C,gamma = cross_validate(df_train,folds=3)

# Refit on the full training set with the selected parameters.
# The unpacked names must match the ones used below, and the first
# hyper-parameter is `C` (upper case) -- a lower-case `c` is undefined.
y_train, y_pred_train, y_test, y_pred_test = classification_svm(df_train,df_test,C,gamma)

# Calculate training and test accuracy for the model
train_acc = calculate_accuracies(y_true=y_train, y_pred=y_pred_train)
test_acc = calculate_accuracies(y_true=y_test, y_pred=y_pred_test)

# Get report for model
class_report(y_train, y_pred_train, train_acc, dataset='Training')
class_report(y_test, y_pred_test, test_acc, dataset='Test')



Training Set:

Training Confusion Matrix:
[[112   2]
 [  2  77]]
Training Classification report:
             precision    recall  f1-score   support

          0       0.98      0.98      0.98       114
          1       0.97      0.97      0.97        79

avg / total       0.98      0.98      0.98       193

Train Accuracy: 0.979274611399


Test Set:

Test Confusion Matrix:
[[48  2]
 [10 24]]
Test Classification report:
             precision    recall  f1-score   support

          0       0.83      0.96      0.89        50
          1       0.92      0.71      0.80        34

avg / total       0.87      0.86      0.85        84

Test Accuracy: 0.857142857143
