In [1]:
from scipy import signal
import scipy.io.wavfile
import numpy as np
from sklearn.preprocessing import LabelEncoder
import random
import os

In [2]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

def pad_audio(samples, L=16000):
    if len(samples) >= L: return samples
    else: return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=20):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]

In [3]:
audio_location = os.listdir('/home/husein/Desktop/convolutional-neural-network/audio')
audio_location

['seven', 'one', 'five', 'nine', 'down', 'four', 'eight']

In [4]:
X, Y = [], []
new_sample_rate = 8000
for i in audio_location:
    audios = os.listdir('/home/husein/Desktop/convolutional-neural-network/audio/%s'%(i))
    for k in audios:
        sample_rate, samples = scipy.io.wavfile.read(os.path.join('/home/husein/Desktop/convolutional-neural-network/audio', i, k))
        samples = pad_audio(samples)
        if len(samples) > 16000:
            n_samples = chop_audio(samples)
        else: 
            n_samples = [samples]
        for samples in n_samples:
            resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
            _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
            Y.append(i)
            X.append(scipy.misc.imresize(specgram,[45, 40]))

In [5]:
labels = np.unique(Y)
Y = LabelEncoder().fit_transform(Y)
c = list(zip(X, Y))
random.shuffle(c)
X, Y = zip(*c)
X, Y = np.array(X), np.array(Y)
onehot = np.zeros((X.shape[0],labels.shape[0]))
onehot[np.arange(Y.shape[0]), Y] = 1.0
filter_size = 3
stride = 2
epoch = 20

In [13]:
def cross_entropy(Y_hat, Y, epsilon=1e-12):
    Y_hat = np.clip(Y_hat, epsilon, 1. - epsilon)
    N = Y_hat.shape[0]
    return -np.sum(np.sum(Y * np.log(Y_hat+1e-9))) / N

def softmax(x):
    exp_scores = np.exp(x - np.max(x))
    return exp_scores / (np.sum(exp_scores, axis=1, keepdims=True) + 1e-8)

def relu(X, grad=False):
    if grad:
        return np.where(X >= 0, 1, 0)
    else:
        return np.where(X >= 0, X, 0)

def padding(x, filter_size, pad='SAME'):
    if pad == 'SAME':
        pad_h_min = int(np.floor((filter_size - 1)/2))
        pad_h_max = int(np.ceil((filter_size - 1)/2))
        pad_h = (pad_h_min, pad_h_max)
        return np.pad(x, ((0, 0), pad_h, (0, 0)), mode='constant')
    else:
        return x
    
def get_shape(x):
    output_height = int((x.shape[1] - filter_size) / stride + 1)
    return int(output_height)

def conv(x, w, out):
    for k in range(x.shape[0]):
        for z in range(w.shape[2]):
            h_range = int((x.shape[1] - filter_size) / stride + 1)
            for _h in range(h_range):
                out[k, _h, z] = np.sum(x[k, _h * stride:_h * stride + filter_size, :] * w[:, :, z])
    return out

def deconv_w(x, w, dv):
    for k in range(x.shape[0]):
        for z in range(w.shape[2]):
            h_range = int((x.shape[1] - filter_size) / stride + 1)
            for _h in range(h_range):
                w[:, :, z] = np.sum(x[k, _h * stride:_h * stride + filter_size, :] * dv[k, _h * stride:_h * stride + filter_size, :])
    return w

def deconv_x(x, w, dv):
    for k in range(x.shape[0]):
        for z in range(x.shape[2]):
            h_range = int((dv.shape[1] - filter_size) / stride + 1)
            for _h in range(h_range):
                x[k, _h, z] = np.sum(dv[k, _h * stride:_h * stride + filter_size, :] * w[:, z, :])
    return x

In [14]:
X.shape

(70, 45, 40)

In [31]:
starting_dimension = X[0].shape[1]
kernel_1 = np.random.randn(filter_size, starting_dimension, 16) / np.sqrt(starting_dimension)
kernel_2 = np.random.randn(filter_size, 16, 32) / np.sqrt(16)
kernel_3 = np.random.randn(filter_size, 32, 64) / np.sqrt(32)
h_pulled = int(np.ceil(X[0].shape[0] / 2**3))
w_1 = np.random.randn(h_pulled * 64, 128) / np.sqrt(h_pulled * 64)
w_2 = np.random.randn(128, labels.shape[0]) / np.sqrt(128)

LEARNING_RATE = 1e-6
EPOCH = 10

In [None]:
for i in range(EPOCH):
    padded_x = padding(X, filter_size)
    h = get_shape(padded_x)
    out_conv1 = np.zeros((X.shape[0], h, kernel_1.shape[2]))
    out_conv1 = conv(padded_x, kernel_1, out_conv1)
    z1 = relu(out_conv1,False)
    padded_z1 = padding(z1, filter_size)
    h = get_shape(padded_z1)
    out_conv2 = np.zeros((X.shape[0], h, kernel_2.shape[2]))
    out_conv2 = conv(padded_z1, kernel_2, out_conv2)
    z2 = relu(out_conv2,False)
    padded_z2 = padding(z2, filter_size)
    h = get_shape(padded_z2)
    out_conv3 = np.zeros((X.shape[0], h, kernel_3.shape[2]))
    out_conv3 = conv(padded_z2, kernel_3, out_conv3)
    z3 = relu(out_conv3,False)
    z3_reshape = z3.reshape((-1, h_pulled * 64))
    fully1 = np.dot(z3_reshape, w_1)
    z4 = relu(fully1,False)
    logits = np.dot(z4, w_2)
    probs = softmax(logits)
    accuracy = np.mean(np.argmax(logits,axis=1) == Y)
    loss = cross_entropy(probs, onehot)
    delta = probs
    delta[range(Y.shape[0]), Y] -= 1
    dw_2 = np.dot(z4.T, delta)
    dz4 = np.dot(delta,w_2.T)
    dfully1 = relu(fully1,True) * dz4
    dw_1 = np.dot(z3_reshape.T, dfully1)
    dz3_reshape = np.dot(dfully1,w_1.T)
    dz3 = dz3_reshape.reshape((-1, h_pulled, 64))
    dout_conv3 = relu(out_conv3, True) * dz3
    dkernel_3 = np.zeros(kernel_3.shape)
    dpadded_z2 = np.zeros(padded_z2.shape)
    dkernel_3 = deconv_w(out_conv3, dkernel_3, dout_conv3)
    dpadded_z2 = deconv_x(dpadded_z2, kernel_3, dout_conv3)
    dout_conv2 = relu(out_conv2, True) * dpadded_z2[:,:out_conv2.shape[1],:]
    dkernel_2 = np.zeros(kernel_2.shape)
    dpadded_z1 = np.zeros(padded_z1.shape)
    dkernel_1 = deconv_w(out_conv2, dkernel_2, dout_conv2)
    dpadded_z1 = deconv_x(dpadded_z1, kernel_2, dout_conv2)
    dout_conv1 = relu(out_conv1, True) * dpadded_z1[:,:out_conv1.shape[1],:]
    dkernel_1 = np.zeros(kernel_1.shape)
    dkernel_1 = deconv_w(out_conv1, dkernel_1, dout_conv1)
    kernel_1 += -LEARNING_RATE * dkernel_1
    kernel_2 += -LEARNING_RATE * dkernel_2
    kernel_3 += -LEARNING_RATE * dkernel_3
    w_2 += -LEARNING_RATE * dw_2
    w_1 += -LEARNING_RATE * dw_1
    print('epoch %d, cost %f, accuracy %f'%(i, loss, accuracy))

epoch 0, cost 19.430993, accuracy 0.142857
epoch 1, cost 19.311505, accuracy 0.142857
epoch 2, cost 19.215790, accuracy 0.200000
epoch 3, cost 20.722266, accuracy 0.142857
epoch 4, cost 19.864939, accuracy 0.171429
epoch 5, cost 19.237746, accuracy 0.185714
epoch 6, cost 20.508387, accuracy 0.185714
