# ECE4200 - Kaggle competition
### Jorge Calvar (jc2767)

This notebook has been executed with only 1000 training samples to be able to hand in a notebook that has run completely.

However, the actual notebook used for the competition was executed with more samples. The number of samples can be modified in the Parameters section below.

## Imports
We import necessary libraries that are used through the whole project.


In [1]:
import numpy as np
import pandas as pd
from scipy.io import wavfile
from pathlib import Path


from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


from scipy.fft import fft
from scipy.signal import spectrogram

## Parameters

In [2]:
TRY_TREE = True
TRY_LOGISTIC = True
TRY_NB = True   # Naive Bayes
TRY_SVM = True
TRY_BAGGING = True
TRY_ADABOOST = True
TRY_NN = True

N_SAMPLES = 1000

TRAIN_DIR = Path('kaggle/input/spoken-digit-pair-recognition/train/train_new')
TEST_DIR = Path('kaggle/input/spoken-digit-pair-recognition/test/test_new')
WORKING_DIR = Path('kaggle/working')

OUTPUT_MODEL = False
LOAD_SAVED_MODEL = True

## Exploring

In [3]:
train_labels = pd.read_csv('kaggle/input/spoken-digit-pair-recognition/train.csv', index_col='ID')
train_labels.head()

Unnamed: 0_level_0,Label
ID,Unnamed: 1_level_1
0,21
1,32
2,31
3,31
4,41


In [4]:
sample_sub = pd.read_csv('kaggle/input/spoken-digit-pair-recognition/sample_sub.csv', index_col='ID')
sample_sub.head()

Unnamed: 0_level_0,Label
ID,Unnamed: 1_level_1
0,21
1,21
2,21
3,21
4,21


In [5]:
max_files = []
for d in [TRAIN_DIR, TEST_DIR]:
    max_file = 0
    for f in d.iterdir():
        i = int(f.name[f.name.index('_')+1:-4])
        max_file = np.max([max_file, i])
    max_files.append(max_file)
print(f'Max train file: {max_files[0]}')
print(f'Max test file: {max_files[1]}')

Max train file: 89999
Max test file: 24749


## Functions

In [6]:
def get_audio(i, train=True):
    dir_to_look = TRAIN_DIR if train else TEST_DIR
    file = dir_to_look / f'train_{i}.wav'
    _, data = wavfile.read(file)
    return data


def convert_dataframe_to_numpy(df):
    for i in range(6000):
        df[f'x_{i}'] = df['data'].apply(lambda r: r[i])
    del df['data']
    array_final = df.to_numpy()
    return array_final 


def try_model(model, X_train, X_val, y_train, y_val):
    model.fit(X_train, y_train)
    acc_train = accuracy_score(y_train, model.predict(X_train))
    acc_test = accuracy_score(y_val, model.predict(X_val))
    return acc_train, acc_test


def try_models(models, X_train, X_val, y_train, y_val):
    df = pd.DataFrame()
    for model in models:
        acc_train, acc_test = try_model(model, X_train, X_val, y_train, y_val)
        df.loc[len(df), ['Model', 'Train accuracy', 'Validation accuracy']] = [model, acc_train, acc_test]
    return df

## Preprocessing

In [7]:
n_batch = np.arange(90000)
np.random.shuffle(n_batch)
n_batch = n_batch[:N_SAMPLES]

audios = [get_audio(i) for i in n_batch]
labels = [train_labels.at[i, 'Label'] for i in n_batch]

df_x = pd.DataFrame(index=n_batch, data={'data': audios, 'label': labels})

array_final = convert_dataframe_to_numpy(df_x)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(array_final[:, 1:], array_final[:, 0], test_size=0.1)

## Decision trees

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
def get_tree_models():
    models = set()
    for criterion in ['gini', 'entropy']:
        models.add(DecisionTreeClassifier(criterion=criterion, max_depth=None))
        for max_depth in range(1, 13):
            models.add(DecisionTreeClassifier(criterion=criterion, max_depth=max_depth))
    return models

In [11]:
if TRY_TREE:
    models_tree = get_tree_models()
    result_tree = try_models(models_tree, X_train, X_val, y_train, y_val)
    result_tree.to_csv(WORKING_DIR / 'tree_models.csv')
    result_tree.sort_values('Validation accuracy', ascending=False).head(10)

We obtained the best result with 11 max_depth. The validation accuracy does not even reach 50%. Although this makes it better than a random model, it is far from a good one. After a few variations of decision tree models, it was concluded that they were not the best approach and the time was used to focus on other models.

## Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
def get_logistic_models():
    models = set()
    N = np.array(range(0,15))
    lamb = 0.00001*(4**N)
    for l in lamb:
        models.add(LogisticRegression(C=1/l, solver='sag', max_iter=1000))
    return models

In [14]:
if TRY_LOGISTIC:
    models_logistic = get_logistic_models()
    result_logistic = try_models(models_logistic, X_train, X_val, y_train, y_val)
    result_logistic.to_csv(WORKING_DIR / 'logistic_models.csv')
    result_logistic.sort_values('Validation accuracy', ascending=False).head(10)



We obtain the very good training accuracies of 100% and relatively good validation accuracies close to 90%. This makes logistic regression a very good approach, specially considering the simplicity of the final model, which is just a set coefficients for the input variables.

## Naive Bayes


In [15]:
from sklearn.naive_bayes import GaussianNB

In [16]:
def get_naive_bayes_models():
    models = set()
    models.add(GaussianNB())
    return models

In [17]:
if TRY_NB:
    models_nb = get_naive_bayes_models()
    result_nb = try_models(models_nb, X_train, X_val, y_train, y_val)
    result_nb.to_csv(WORKING_DIR / 'naive_bayes_models.csv')
    result_nb.sort_values('Validation accuracy', ascending=False).head(10)

The performance of the Naive Bayes model is terrible. The training and validation accuracy are close to 20%, which is what we would expect of a random model. This is because Naive Bayes is an extremely simple model that allows for little room of flexibility of the predictive function. We also concentrate on other models after seeing this.

## SVM

In [18]:
from sklearn.svm import SVC

In [19]:
def get_svm_models():
    models = set()
    n = np.array(range(20))
    C = 2**n
    for c in C:
        for degree in range(1, 6):
            models.add(SVC(C=c, kernel='poly', degree=degree))
        models.add(SVC(C=c, kernel='rbf'))
    return models

In [20]:
if TRY_SVM:
    models_svm = get_svm_models()
    result_svm = try_models(models_svm, X_train, X_val, y_train, y_val)
    result_svm.to_csv(WORKING_DIR / 'svm_models.csv')
    result_svm.sort_values('Validation accuracy', ascending=False).head(10)

SVM has been the best performing model of this notebook. The downside is that takes very long to train, so few combinations can be tried to tweak the model. I reached a training and validation accuracy of 100% and 90.909% testing accuracy in the kaggle leaderboard.

The decrease in accuracy in the kaggle leaderboard is due to the fact that there are no label 43 samples in the training set, so it fails to predict them. However, several techniques have been applied to solve this problem in the Pytorch notebook.

## Bagging

In [21]:
from sklearn.ensemble import BaggingClassifier

In [22]:
def get_bagging_models():
    models = set()
    n_max = 12
    indices = 2**np.array(range(0,n_max))
    for i in indices:
        models.add(BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy'), n_estimators=i))
    return models


In [None]:
if TRY_BAGGING:
    models_bagging = get_bagging_models()
    result_bagging = try_models(models_bagging, X_train, X_val, y_train, y_val)
    result_bagging.to_csv(WORKING_DIR / 'bagging_models.csv')
    result_bagging.sort_values('Validation accuracy', ascending=False).head(10)

  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


Performance with bagging is not bad but it is not better than SVMs or Logistic Regression models. Near 80% validation accuracy. It has an incredible downside: which is that it takes very long to train.

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
def get_adaboost_models():
    models = set()
    indices = 2**np.array(range(0,n_max))
    maximum_depth = [10, 20, 50, 100]
    for i in indices:
        for md in maximum_depth:
            models.add(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy', maximum_depth=md), n_estimators=i))
        models.add(AdaBoostClassifier(base_estimator=RandomForestClassifier(criterion='entropy', maximum_depth=50, n_estimators=10), n_estimators=i))
    return models

In [None]:
if TRY_ADABOOST:
    models_adaboost = get_adaboost_models()
    result_adaboost = try_models(models_adaboost, X_train, X_val, y_train, y_val)
    result_adaboost.to_csv(WORKING_DIR / 'adaboost_models.csv')
    result_adaboost.sort_values('Validation accuracy', ascending=False).head(10)

Performance with Adaboost was similar to bagging in the validation test. Training accuracy, however, has improved. Still, SVMs remain the best model so far.

## Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# if TRY_NN:
model_nn = MLPClassifier(hidden_layer_sizes=(500, 500))
acc_train, acc_test = try_model(model_nn, X_train, X_val, y_train, y_val)
print(acc_train)
print(acc_test)

**Note:** Neural networks are developed more using pytorch in the final submission notebook.

## Output submission

In [None]:
if OUTPUT_MODEL:

    if LOAD_SAVED_MODEL:
        import pickle
        with open(WORKING_DIR / 'model_svm.bin', 'rb') as f:
            model = pickle.load(f)
    else:
        #DEFINE OR CREATE THE FINAL MODEL HERE
        model = SVC(C=256, kernel='poly', degree=2)
        model.fit(X_train, y_train)
        with open(WORKING_DIR / 'model_svm.bin', 'wb') as f:
            f.write(pickle.dumps(model))

    df_test = pd.DataFrame(columns=['data'])
    df_test.index.name = 'ID'

    for i in range(24750):
        df_test.loc[i, 'data'] = get_audio(i)

    df_test.head()
    X_test = convert_dataframe_to_numpy(df_test)

    y_test = model.predict(X_test)

    df = pd.DataFrame(index=df_test.index, columns=['Label'])
    df['Label'] = y_test
    df.to_csv(WORKING_DIR / 'submission.csv')


#### **Annex:** Frequency analysis of a random train set signal

In [None]:
i = np.random.randint(0, 90000)
file = TRAIN_DIR / f'train_{i}.wav'

fs, x = wavfile.read(file)
n_samples = len(x)
print(f'Sample frequency: {fs} / N Samples: {n_samples}')


# Spectre with fft

X = (fft(x)/n_samples)[:n_samples//2]
f_x = np.linspace(0, 1/2, n_samples//2)*fs

plt.figure()
plt.plot(f_x, np.abs(X))
plt.ylabel('Amplitude')
plt.title(f'Spectre of {file.name}')

# Spectrogram
    
f_x, t_x, Sxx = spectrogram(x, fs=fs)

plt.figure()
plt.pcolormesh(t_x, f_x, Sxx)
plt.xlabel('Time')
plt.ylabel('Frequency')


print(f'Spectrogram shape: {Sxx.shape}')

#### **Annex:** Exploring how fft works

In [None]:
freq = 10

fs = 100
t = np.arange(0, 1, 1/fs)
x = np.sin(2*np.pi*freq*t)

# n samples
n_samples = len(x)
print(f'N samples: {len(x)}')

# Draw

plt.figure()
plt.plot(t, x)
plt.title(f'Sine {freq}Hz')

# Spectre

X = (fft(x)/n_samples)[:n_samples//2]
x_f = np.linspace(0, 1/2, n_samples//2)*fs

plt.figure()
plt.plot(x_f, np.abs(X))
plt.title('Spectre')
