# Libraries

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Perceptron

from sklearn.linear_model import SGDClassifier

from sklearn.svm import SVC

from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

from scipy.stats import kurtosis
from scipy.stats import skew

In [2]:
CUSTOM_SEED = 12345
np.random.seed(CUSTOM_SEED)

# Data Processing

## Load data

In [3]:
#loading the data
feat = np.load("feat.npy")
path = np.load("path.npy")
train = pd.read_csv("train.csv")  #load as pandas dataframe
test = pd.read_csv("test.csv")

#extract the target values (y)
words = train['word']
words.describe()

count     94824
unique       35
top        zero
freq       3634
Name: word, dtype: object

## Check the data

In [6]:
print("Train Set")
print("Train Shape    :", train.shape)
print("Train Dimension:", train.ndim)
print(train.describe())
print()
print("Test Set")
print("Test Shape    :", test.shape)
print("Test Dimension:", test.ndim)
print(test.describe())

Train Set
Train Shape    : (94824, 2)
Train Dimension: 2
                                                path   word
count                                          94824  94824
unique                                         94824     35
top     1827be43f904639d2d190e4da9b8d08ce300316b.wav   zero
freq                                               1   3634

Test Set
Test Shape    : (11005, 1)
Test Dimension: 2
                                                path
count                                          11005
unique                                         11005
top     964bbb052da81db755a302d7555b0f11bf489972.wav
freq                                               1


In [7]:
print("Path Set")
print("Path Shape    :", path.shape)
print("Path Dimension:", path.ndim)
print("Feat Set")
print("Feat Shape    :", feat.shape)
print("Feat Dimension:", feat.ndim)
print("feat[0] size  :", feat[0].size)
print("feat[0] shape :", feat[0].shape)
print("Each element has {} colums representing MFCC coefficients".format(feat[0].shape[1]))
print("feat[0] ndim  :", feat[0].ndim)

Path Set
Path Shape    : (105835,)
Path Dimension: 1
Feat Set
Feat Shape    : (105835,)
Feat Dimension: 1
feat[0] size  : 1287
feat[0] shape : (99, 13)
Each element has 13 colums representing MFCC coefficients
feat[0] ndim  : 2


## Process the data

In [10]:
#first we have to find the right features that are used in the training set and test set
def find_indices(portion):
    """
    This function find the index number of every wav path 
    from the given source path data set
    
    input : data set which has paths
            
    return: index list of data
    """
    indexlist = []

    for index, portionpath in enumerate(portion['path']):   #enumerate trought the training set
        indexlist.append(np.where(path == portionpath))   #for every path in the train.csv, find the index in path.npy
        print(index) #(this is to keep track if you want to know how long its going to take)
    
    #to extract the features from the feat.npy: 
    #for some reason i only could get it to work with integers as index, so extract those from the trainindex list
    number_indexlist = []
    for index in indexlist:
        number_indexlist.append(index[0][0])
    
    return number_indexlist

def create_feature_array(indexlist):
    """
    This function extract feature values from the given 
    source feature data set according to given index numbers
    
    input : index list of .wav files
            data set of MFCC features
    return: array of features for the given indeces
    """
    #create empty array, dtype = object because the features have different shapes
    features = np.zeros((len(indexlist)), dtype = object)
    
    for i in range(len(indexlist)):
        features[i] = feat[indexlist[i]]
        
    return features


In [None]:
# creating a list with indices for the train and the test data sets
train_index = find_indices(train)
test_index =  find_indices(test)

# creating new numpy arrays with only train or test features
training_features = create_feature_array(train_index)
test_features =     create_feature_array(test_index)

# saving these numpy arrays for convenience for later use if it's needed
np.save("training_features.npy", training_features)
np.save("test_features.npy", test_features)

In [4]:
#load the right train and test features if needed to run the code again
training_features = np.load("training_features.npy")
test_features = np.load("test_features.npy")

In [5]:
training_features.shape, test_features.shape

((94824,), (11005,))

## Functions to reshape the features

In [6]:
def add_zeros(features):
    """make every feature the same shape by adding rows with zeros to smaller features"""
    fixed_shape = (99, 13)

    for i in range(features.size):
        
        if features[i].shape != fixed_shape: #if a feature has different shape than (99,13)

            rows = 99 - features[i].shape[0]  #calculate how many rows with zeros we need to add
            zeros = np.zeros(shape=(rows, 13)) #create zero array with the rows

            features[i] = np.vstack((features[i], zeros))  #add zero array to bottom of existing array
        

In [7]:
def make_3d(features):
    array = np.zeros(shape=(features.shape[0], 99, 13), dtype = float)

    for i in range(features.size):
        for j in range(99):
            for k in range(13):

                array[i, j, k] = features[i][j,k]
    return array


In [8]:
# Convert the data to the same shape
add_zeros(training_features)
add_zeros(test_features)


In [9]:
train_feat_3d = make_3d(training_features)
test_feat_3d = make_3d(test_features)

In [10]:
train_feat_2d = np.reshape(train_feat_3d, (94824,1287))
test_feat_2d = np.reshape(test_feat_3d, (11005,1287))

In [11]:
train_feat_2d.shape, test_feat_2d.shape

((94824, 1287), (11005, 1287))

# Algorithms Tried

## Perceptron without scaling

In [18]:
#PERCEPTRON trainfeat2d, no scaling
X_train, X_val, y_train, y_val = train_test_split(train_feat_2d, words, test_size=0.2, random_state=12345)

for passes in [5, 10, 15, 20]:
    model = Perceptron(random_state=12345, max_iter=passes)
    model.fit(X_train, y_train)
    acc = accuracy_score(y_val, model.predict(X_val))
    print("{}\t {}".format(passes, acc))

5	 0.19045610334827315
10	 0.17690482467703664
15	 0.1920906933825468
20	 0.1774848404956499


## Perceptron with scaling

In [19]:
#PERCEPTRON trainfeat2d, with scale
X_train, X_val, y_train, y_val = train_test_split(train_feat_2d, words, test_size=0.2, random_state=12345)

# Z-score the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

for passes in [5, 10, 15, 20]:
    model = Perceptron(random_state=12345, max_iter=passes)
    model.fit(X_train, y_train)
    acc = accuracy_score(y_val, model.predict(X_val))
    print("{}\t {}".format(passes, acc))

5	 0.2128130767202742
10	 0.20416556815185868
15	 0.2093857105193778
20	 0.2126021618771421


## Padding the Features

In [21]:
#function to work with the padded features
def concat_summaries_3d(features, summaries):
    """function with as input: 
        -features
        -list of summary functions to calculate for every 13 column over all the frames 
        (they will be concatenated next to each other) """
    
    #create empty numpy matrix, with amount of features as rows, and (13 * amount of summary functions) for columns
    new_feat = np.zeros((features.shape[0], 13*len(summaries)))  
    
    #fill the matrix
    for i in range(features.shape[0]):
        for j in range(13*len(summaries)):
            coef_summaries = np.concatenate( [function(features[i], axis = 0) for function in summaries], axis = 0 )
            new_feat[i,j] = coef_summaries[j]
            
    return new_feat

In [22]:
# summarise the features to new feat min, max , std and mean statistics
summaries = [np.mean, np.min, np.max, np.std]
summaries1 = [np.mean, np.min, np.max]
feat_meanminmaxstd3d = concat_summaries_3d(training_features, summaries)
feat_meanminmax3d = concat_summaries_3d(training_features, summaries1)

## Perceptron with summarised features and scaling 

In [23]:
#feat_meanminmaxstd3d with scaling and perceptron
X_train, X_val, y_train, y_val = train_test_split(feat_meanminmaxstd3d, words, test_size=0.2, random_state=12345)

# Z-score the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

for passes in [5, 10, 15, 20]:
    model = Perceptron(random_state=12345, max_iter=passes)
    model.fit(X_train, y_train)
    acc = accuracy_score(y_val, model.predict(X_val))
    print("{}\t {}".format(passes, acc))

5	 0.2854205114684946
10	 0.26227260743474823
15	 0.29912997627208016
20	 0.29106248352227787


## SGD Logistic Regression with summarised features and scaling

In [None]:
#feat_meanminmaxstd3d with scaling and SGD Logistic
X_train, X_val, y_train, y_val = train_test_split(feat_meanminmaxstd3d, words, test_size=0.2, random_state=12345)

# Z-score the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

for passes in [5, 10, 15, 20]:
    model = SGDClassifier(loss='log', random_state=12345, max_iter = passes)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print("passes:{}\t acc:{:.3}".format(passes, accuracy_score(y_val, y_pred)))

## SVC with min&max summary statistics

In [None]:
#this is with old summary engineered features, only meanminmax and no std
X_train, X_val, y_train, y_val = train_test_split(feat_meanminmax3d, words, test_size=0.2, random_state=12345)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

svm = SVC(C = 3.0, random_state = 12345)
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_val, y_val)))

## Ablation Analysis

In [26]:
import matplotlib.pyplot as plt

train_feat_3d = training_features
#features with all the summaries: feat_all
summaries = [np.mean, np.min, np.max, np.std, skew]
feat_all = concat_summaries_3d(train_feat_3d, summaries) 

#for ablation create per summary a feature, so that we can easily stack them 

#only mean
summaries = [np.mean]
feat_mean_3d = concat_summaries_3d(train_feat_3d, summaries)
#only min
summaries = [np.min]
feat_min_3d = concat_summaries_3d(train_feat_3d, summaries)
#only max
summaries = [np.max]
feat_max_3d = concat_summaries_3d(train_feat_3d, summaries)
#only std
summaries = [np.std]
feat_std_3d = concat_summaries_3d(train_feat_3d, summaries)
#only skew
summaries = [skew]
feat_skew_3d = concat_summaries_3d(train_feat_3d, summaries)

#create features stacked with all summaries except one of them
feat_all_min_mean = np.hstack((feat_min_3d, feat_max_3d, feat_std_3d, feat_skew_3d)) # all summaries except mean
feat_all_min_min = np.hstack((feat_mean_3d, feat_max_3d, feat_std_3d, feat_skew_3d)) # all summaries except min
feat_all_min_max = np.hstack((feat_mean_3d, feat_min_3d, feat_std_3d, feat_skew_3d)) # all summaries except max
feat_all_min_std = np.hstack((feat_mean_3d, feat_min_3d, feat_max_3d, feat_skew_3d)) # all summaries except std
feat_all_min_skew = np.hstack((feat_mean_3d, feat_min_3d, feat_max_3d, feat_std_3d)) # all summaries except skew

### Analysis
## choose 25 passes for max_iter, good tradeoff for speed and acc for comparing

# all the features:
X_train, X_val, y_train, y_val = train_test_split(feat_all, words, test_size=0.2, random_state=12345)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

model = SGDClassifier(loss='log', random_state=12345, max_iter = 25)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
acc_all = accuracy_score(y_val, y_pred)
print("All features acc:{:.3}\t".format( accuracy_score(y_val, y_pred)))

### ablation analysis, calculate the relative accuracy drop when ablating each summary feature

ablation_list = [feat_all_min_mean, feat_all_min_min, feat_all_min_max, feat_all_min_std, feat_all_min_skew]
ablation_names = ['mean', 'min', 'max', 'std', 'skew']

acc_drop = [] #create list we can fill and use for plot later

for i, ablation in enumerate(ablation_list):
    
    X_train, X_val, y_train, y_val = train_test_split(ablation, words, test_size=0.2, random_state=12345)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    model = SGDClassifier(loss='log', random_state=12345, max_iter = 25)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    rel_acc_drop = acc_all - accuracy_score(y_val, y_pred) #calculate relative acc drop
    acc_drop.append(rel_acc_drop) # for the plot 
    name = ablation_names[i]
    print("ablated feature: {}\t acc: {:.3}\t relative acc drop:{:.3}".format(name, accuracy_score(y_val, y_pred), rel_acc_drop))
    
###plot figure
x_pos = np.arange(len(ablation_names))
 
plt.bar(x_pos, acc_drop) #create bars
plt.xticks(x_pos, ablation_names) # create names under the bars

plt.title('Feature Ablation Analysis') 
plt.ylabel('Relative Accuracy Drop')

plt.show()
plt.savefig('ablation.png')

## Principal Component Analysis

In [12]:
#PCA
pca = PCA(n_components=200, svd_solver='randomized')
# Inputs
pca.fit(train_feat_2d)
X = pca.transform(train_feat_2d)
# Output
y = words

In [None]:
## KNN algortihm with 200 component (PCA)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=12345)

knn = KNeighborsClassifier(n_neighbors =1)
knn.fit(X_train, y_train)
pred = knn.predict(X_val)
print(confusion_matrix(y_val, pred))
print(classification_report(y_val,pred))
print(accuracy_score(y_val, pred))

# Training Model - Neural Network(MLP)




## Data preparation & train - test split

In [12]:
X= train_feat_2d
y = np.array(words, dtype='U10')

In [13]:
from sklearn.preprocessing import LabelEncoder
# Fit LabelEncoder with our list of classes
label_encoder = LabelEncoder()
#label_encoder.fit(y_train)
label_encoder.fit(y)

# Encode class values as integers
#y_train = label_encoder.transform(y_train)
y = label_encoder.transform(y)

In [14]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=12345)

In [15]:
# Convert integers to dummy variables (one hot encoded)
from keras.utils import np_utils
Y_train = np_utils.to_categorical(y_train)
Y_val = np_utils.to_categorical(y_val)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Model Building

In [38]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, AlphaDropout, Activation
from keras.optimizers import Adam

# Create a function to build model
def build_model(input_dim,
                   output_dim,
                   n_dense=6,
                   dense_neurons=512,
                   activation='relu',
                   dropout=AlphaDropout,
                   dropout_rate=0.2,
                   kernel_initializer='uniform',
                   optimizer='adam'):

        model = Sequential()
        model.add(Dense(dense_neurons, input_dim=input_dim,
                        kernel_initializer=kernel_initializer))
        model.add(Activation(activation))
        model.add(dropout(dropout_rate))
        for i in range(n_dense - 1):
            model.add(Dense(dense_neurons, kernel_initializer=kernel_initializer))
            model.add(Activation(activation))
            model.add(dropout(dropout_rate))

        model.add(Dense(output_dim))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])
        return model

In [39]:
# DEfine model hyperparameters
from keras.wrappers.scikit_learn import KerasClassifier
model_params = {
    'build_fn': build_model,
    'input_dim': X_train.shape[1],
    'dense_neurons': 1024,
    'output_dim': Y_train.shape[1],
    'epochs': 20,
    'batch_size': 512,
    'verbose': 1,
    #'validation_data': (X_val, Y_val),
    'shuffle': True
}
clf = KerasClassifier(**model_params)

## Model Train & Validation

In [40]:
hist = clf.fit(X_train, Y_train)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Validation Accuracy

In [41]:
score = clf.score(X_val, Y_val)
print(score)

0.8103205851950716


# Classify with test Data

In [42]:
# Check the test data shape
test_feat_2d.shape

(11005, 1287)

In [43]:
# Prediction for test data
X= test_feat_2d
y_pred = clf.predict(X)




In [44]:
# Decode the predictions from integer to labels back to prepare result.csv
labels = label_encoder.inverse_transform(y_pred)

In [45]:
# check the labels, # of labels
labels, labels.shape, len(np.unique(labels))

(array(['bed', 'cat', 'seven', ..., 'five', 'left', 'backward'],
       dtype='<U10'), (11005,), 35)

In [46]:
import pandas as pd
test = pd.read_csv("test.csv")
predictions =  pd.DataFrame(labels)

In [47]:
predictions.head()

Unnamed: 0,0
0,bed
1,cat
2,seven
3,no
4,off


In [49]:
result = test.join(predictions)
result.columns = ['path', 'word']
result

Unnamed: 0,path,word
0,4985e0c3784b700688c35818ba69f01b4fa3e8da.wav,bed
1,c3815898eb339919ab56249acae83cf566eb622d.wav,cat
2,32c4865f292674cc904c5af503bc669c2dbd8843.wav,seven
3,99600d05d1a861ef9771a7bd8eca0d5f444fce7a.wav,no
4,57ece62e925c94a7e6c54916caca1237467ad4d8.wav,off
5,fc11b1ea9dfde4ebc927ff092b48a8ac86fddae3.wav,stop
6,15af624e3fce684f2a05a76902652d7388f1d912.wav,five
7,42be94dcf7e9c40bd6ed5d45b9a41d6d87eb2890.wav,three
8,ba326dca3f3e3b273a6c50f083f9eece3febe413.wav,one
9,29d8a1c1f2d6c692aeb391c7efbc128cb20969ae.wav,two


In [50]:
# Export the results to the csv
result.to_csv("result.csv", sep=',', index= False, header=True)