- [Imports for Data Preparation](#Data-preparation)
- [Modeling](#Deep-Learning-Models)

#### General Imports

In [2]:
import os 
import sys

import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import IPython.display as ipd

from scipy.io import wavfile as wav
from sklearn import metrics 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
import tensorflow as tf
from sklearn.linear_model import LogisticRegression


%matplotlib inline
%config InlineBackend.figure_format='retina'

# Dataset root, resolved against the current working directory. The trailing
# slash is kept so folder and file names can be appended by plain concatenation.
PATH = os.getcwd() + '/speech_commands_v0.01/'

In [3]:
def get_directory_contents(path):
    """Return the names of all entries found directly inside ``path``.

    Thin wrapper around ``os.listdir``; the order of the returned names is
    whatever the operating system reports.
    """
    entries = os.listdir(path)
    return entries

def open_file(filename):
    """Read a text file and return its lines without line terminators.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of str
        One entry per line of the file.
    """
    # The context manager guarantees the handle is closed even if reading fails;
    # the previous version left the file open until garbage collection.
    with open(filename) as handle:
        return handle.read().splitlines()

def compile_dataset(folders):
    """Build the recordings DataFrame for every folder under ``PATH``.

    Each folder is listed with ``get_directory_contents`` and every entry is
    recorded as ``'<folder>/<entry>'``; the combined list is then handed to
    ``create_df``.
    """
    recordings = [
        folder + '/' + entry
        for folder in folders
        for entry in get_directory_contents(path=PATH + folder)
    ]
    return create_df(words=recordings)

def create_df(words):
    """Turn ``'<word>/<speaker>_...'`` paths into a three-column DataFrame.

    Columns produced:
      * ``recordings`` - the path exactly as given
      * ``word``       - the part before the first ``/``
      * ``speaker_id`` - the part of the second path component before its first ``_``
    """
    frame = pd.DataFrame({'recordings': words})

    # Split the path once and reuse the pieces for both derived columns.
    path_parts = frame['recordings'].str.split('/')
    frame['word'] = path_parts.str[0]
    frame['speaker_id'] = path_parts.str[1].str.split('_').str[0]

    return frame

def summary(data):
    """Return a one-row DataFrame of headline counts for ``data``.

    The row holds the number of rows in ``data`` plus the number of distinct
    values in its ``speaker_id`` and ``word`` columns.
    """
    return pd.DataFrame({
        'total_recordings': [data.shape[0]],
        # dropna=False counts a missing value as a distinct value, matching len(unique()).
        'total_speakers': [data['speaker_id'].nunique(dropna=False)],
        'total_words': [data['word'].nunique(dropna=False)],
    })

def word_distribution(data=None):
    """Plot a bar chart of how many recordings exist for each word.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a ``word`` column. When omitted, the function falls back to
        a module-level variable named ``data`` (the previous behaviour).

    Raises
    ------
    NameError
        If no frame is passed and no global ``data`` exists.
    """
    if data is None:
        # Backward-compatible fallback: earlier versions read the global implicitly.
        try:
            data = globals()['data']
        except KeyError:
            raise NameError(
                "name 'data' is not defined; pass the recordings DataFrame explicitly"
            ) from None

    word_count = data['word'].value_counts()
    word_count.plot(kind='bar', figsize=(8,4), alpha=0.5)
    plt.show()
    
def extract_features(file_name):
    """Summarise one audio file as a vector of 40 time-averaged MFCCs.

    The file is loaded with librosa's default loading settings, 40 MFCCs are
    computed, and each coefficient is averaged across the other axis of the
    MFCC matrix.
    """
    signal, rate = librosa.load(file_name)
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)

    # Transpose so the coefficients become columns, then average over rows.
    return np.mean(coefficients.T, axis=0)

def dataset_prep(df):
    """Extract MFCC features for every recording listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``recordings`` column (path relative to the
        ``speech_commands_v0.01/`` folder) and a ``word`` column (class label).

    Returns
    -------
    pandas.DataFrame
        One row per recording with columns ``mfc_0`` .. ``mfc_39`` followed by
        ``class_label``, indexed 0..n-1.
    """
    columns = ['mfc_{}'.format(i) for i in range(40)]

    # Collect rows in plain lists and build the frame once at the end.
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies all previous rows on every iteration.
    feature_rows = []
    class_labels = []
    for recording, class_label in zip(df['recordings'], df['word']):
        file_name = 'speech_commands_v0.01/' + recording
        feature_rows.append(extract_features(file_name).reshape(1, -1))
        class_labels.append(class_label)

    # vstack keeps the dtype returned by extract_features; an explicit empty
    # matrix keeps the column layout when df has no rows.
    if feature_rows:
        features = np.vstack(feature_rows)
    else:
        features = np.empty((0, len(columns)))

    master_df = pd.DataFrame(features, columns=columns)
    master_df['class_label'] = class_labels

    return master_df

### Data preparation
#### Only run if the dataset is not already stored in the CSV files

In [None]:
# Entries in the dataset root that are excluded from the folder list
# (presumably everything that is not a word folder - verify against the dataset).
remove_words = [
    '.DS_Store', 'validation_list.txt', 'LICENSE',
    '_background_noise_', 'README.md', 'testing_list.txt'
]

# List the dataset root once and keep only the entries not excluded above.
names = [
    entry
    for entry in get_directory_contents(path=PATH)
    if entry not in remove_words
]

In [None]:
# Validation and test splits are given explicitly by the dataset's list files.
val_list = open_file(filename=PATH+'validation_list.txt')
val_df = create_df(words=val_list)
test_list = open_file(filename=PATH+'testing_list.txt')
test_df = create_df(words=test_list)

# Every recording found on disk, across all word folders.
total_df = compile_dataset(folders=names)

# Training set = everything that is in neither the validation nor the test list.
# One combined mask replaces the two chained filters, and .copy() makes
# training_df an independent frame so later column assignments do not raise
# SettingWithCopyWarning.
held_out = set(val_list) | set(test_list)
training_df = total_df[~total_df['recordings'].isin(held_out)].copy()

In [None]:
# Extract MFCC features for the three splits, in training, test, validation order.
featuresdf, testdf, valdf = (
    dataset_prep(df=split) for split in (training_df, test_df, val_df)
)

# Persist the feature tables so the extraction does not have to be repeated.
featuresdf.to_csv("training_set.csv")
testdf.to_csv("test_set.csv")
valdf.to_csv("validation_set.csv")

### To read from CSV

In [4]:
# Reload the cached feature tables; the first CSV column is the saved index.
featuresdf, testdf, valdf = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("training_set.csv", "test_set.csv", "validation_set.csv")
)

### Modeling preparation

In [5]:
# Convert features and corresponding classification labels into numpy arrays
X = np.array(featuresdf.drop(labels='class_label', axis=1).values)
y = np.array(featuresdf.class_label.tolist())
# Encode the classification labels
le = LabelEncoder()
yy = tf.keras.utils.to_categorical(le.fit_transform(y))

x_val = np.array(valdf.drop(labels='class_label', axis=1).values)
y_val = np.array(valdf.class_label.tolist())
y_val = tf.keras.utils.to_categorical(le.transform(y_val))

x_test = np.array(testdf.drop(labels='class_label', axis=1).values)
y_test = np.array(testdf.class_label.tolist())
y_test = tf.keras.utils.to_categorical(le.transform(y_test))

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
# Fit the scaler on the training features only, then apply the same
# transformation to the validation and test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)
# `X_val_sacled` was a typo; the misspelled name is kept as an alias because
# later cells still refer to it.
X_val_scaled = X_val_sacled = scaler.transform(x_val)
X_test_scaled = scaler.transform(x_test)

### Normal Logistic Regression model

In [None]:
# Baseline classifier: multinomial logistic regression using the Newton-CG solver.
# NOTE(review): recent scikit-learn releases reportedly deprecate the
# `multi_class` argument - confirm against the installed version.
clf = LogisticRegression(multi_class='multinomial', solver='newton-cg')

In [None]:
def to_category(columns, dataframe):
    """Return a copy of ``dataframe`` with ``columns`` cast to ``category`` dtype.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to convert.
    dataframe : pandas.DataFrame
        Source frame; it is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame in which every listed column has the ``category`` dtype.
    """
    # astype with a mapping converts all listed columns in one call and returns
    # a new frame, so the caller's frame is not mutated and slices do not
    # trigger SettingWithCopyWarning.
    return dataframe.astype({column: 'category' for column in columns})
# Columns to store as pandas categoricals in every split.
columns = ['word']

# Convert the three splits in training, test, validation order.
training_df, test_df, val_df = (
    to_category(columns=columns, dataframe=split)
    for split in (training_df, test_df, val_df)
)

In [None]:
# Take the labels from the same frames the feature matrices were built from.
# This keeps features and labels aligned row for row, and the cell also works
# when the features were loaded from the CSV files and the optional
# data-preparation cells (which define training_df / val_df / test_df) were skipped.
clf.fit(X, featuresdf['class_label'])

print("Training Accuracy: {0:.2%}".format(clf.score(X, featuresdf['class_label'])))
print("Validation Accuracy: {0:.2%}".format(clf.score(x_val, valdf['class_label'])))
print("Testing Accuracy: {0:.2%}".format(clf.score(x_test, testdf['class_label'])))

## Deep Learning Models

In [26]:
def build_model_graph(input_shape=(13,), num_classes=30, learning_rate=0.0013):
    """Build and compile the fully connected classifier.

    The network widens from 100 to 5500 units (with increasing dropout) and
    then narrows back to 100 units before the softmax output layer.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of one input sample, excluding the batch dimension. The default
        is kept for backward compatibility; pass the real feature width
        (e.g. ``(X.shape[1],)``) when it differs.
    num_classes : int, optional
        Number of units in the softmax output layer.
    learning_rate : float, optional
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with categorical cross-entropy loss and an accuracy metric.
    """
    # (units, dropout rate applied after the layer; None means no dropout)
    hidden_spec = [
        (100, .1),
        (300, .15),
        (800, .2),
        (2000, .3),
        (5500, .4),
        (1000, None),
        (500, None),
        (250, None),
        (100, None),
    ]

    # Only the Input layer declares a shape. The `input_shape` arguments that
    # used to be passed to the Dropout layers were redundant, since every later
    # layer infers its input shape from the layer before it.
    layers = [tf.keras.layers.Input(shape=input_shape)]
    for units, dropout_rate in hidden_spec:
        layers.append(tf.keras.layers.Dense(units, activation='relu'))
        if dropout_rate is not None:
            layers.append(tf.keras.layers.Dropout(dropout_rate))
    layers.append(tf.keras.layers.Dense(num_classes, activation='softmax'))

    model = tf.keras.Sequential(layers)

    # Compile the model
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=learning_rate
    )
    model.compile(loss='categorical_crossentropy',
                  metrics=['accuracy'], optimizer=optimizer)
    return model

def plot_metric(history, metric):
    """Plot the per-epoch training and validation curves of ``metric``.

    ``history`` must expose a ``history`` mapping that contains both ``metric``
    and ``'val_' + metric``; epochs are numbered starting from 1.
    """
    val_key = 'val_' + metric
    train_label = "train_" + metric

    train_curve = history.history[metric]
    val_curve = history.history[val_key]
    epoch_axis = range(1, len(train_curve) + 1)

    # Training curve first, validation curve second, matching the legend order.
    for curve in (train_curve, val_curve):
        plt.plot(epoch_axis, curve)

    plt.title('Training and validation ' + metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([train_label, val_key])
    plt.show()

In [24]:
# Early stopping on the validation loss; all other settings are left at their defaults.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss')
# Learning-rate sweep: starts at 1e-3 and grows by a factor of 10 every 30 epochs.
lr = tf.keras.callbacks.LearningRateScheduler(
    schedule=lambda epoch: 1e-3 * 10 ** (epoch / 30)
)

- Build out different sections within the notebook
    - Give a summary for each section
    

In [28]:
# Size the input layer from the data. The builder's default input shape of
# (13,) does not match the 40 MFCC columns in the feature matrix.
model = build_model_graph(input_shape=(X_train_scaled.shape[1],))

model_history = model.fit(x=X_train_scaled, 
                          y=yy, 
                          epochs=100, 
                          validation_data=(X_val_sacled, y_val),
                          # callbacks=[lr],
                          verbose=1)

Epoch 1/100
 285/1597 [====>.........................] - ETA: 3s - loss: 3.1756 - accuracy: 0.0830

KeyboardInterrupt: ignored

In [None]:
# The stray leading quote made this cell a SyntaxError (unterminated string).
plot_metric(history=model_history, metric='accuracy')

In [None]:
# `wide_nn` is not defined anywhere in the notebook; the trained network is `model`.
# It was fitted on standardized features, so evaluate on the standardized test set.
score = model.evaluate(X_test_scaled, y_test, verbose=0)

In [None]:
# Display the value returned by the evaluate() call in the previous cell.
score

- Increase epochs until the loss stabilizes
- Try different activations and optimizers
- Add dropout
- Have variability in diverging and converging with hidden layer units

- Unsupervised Learning
Concept of what works best for the problem we're solving

autoregression, vector autoregression

decision tree regression, support vector regression, autocorrelation