In [1]:
from scipy import signal
import scipy.io.wavfile
import numpy as np
from sklearn.preprocessing import LabelEncoder
import random
import os

In [2]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

def pad_audio(samples, L=16000):
    if len(samples) >= L: return samples
    else: return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=20):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]

In [3]:
audio_location = os.listdir('/home/husein/Desktop/convolutional-neural-network/audio')
audio_location

['seven', 'one', 'five', 'nine', 'down', 'four', 'eight']

In [4]:
X, Y = [], []
new_sample_rate = 8000
for i in audio_location:
    audios = os.listdir('/home/husein/Desktop/convolutional-neural-network/audio/%s'%(i))
    for k in audios:
        sample_rate, samples = scipy.io.wavfile.read(os.path.join('/home/husein/Desktop/convolutional-neural-network/audio', i, k))
        samples = pad_audio(samples)
        if len(samples) > 16000:
            n_samples = chop_audio(samples)
        else: 
            n_samples = [samples]
        for samples in n_samples:
            resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
            _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
            Y.append(i)
            X.append(scipy.misc.imresize(specgram,[45, 40]))

In [5]:
labels = np.unique(Y)
Y = LabelEncoder().fit_transform(Y)
c = list(zip(X, Y))
random.shuffle(c)
X, Y = zip(*c)
X, Y = np.array(X), np.array(Y)
onehot = np.zeros((X.shape[0],labels.shape[0]))
onehot[np.arange(Y.shape[0]), Y] = 1.0

In [6]:
def get_padding(filter_shape, pad='same'):
    if pad == 'valid':
        return (0, 0)
    if pad == 'same':
        filter_height = filter_shape
        pad_h_min = int(np.floor((filter_height - 1)/2))
        pad_h_max = int(np.ceil((filter_height - 1)/2))
        return (pad_h_min, pad_h_max)
    
def images_to_column_indices(images_shape, filter_shape, padding, stride=1):
    batch_size, height, channels = images_shape
    filter_height = filter_shape
    pad_h = padding
    out_height = int((height + np.sum(pad_h) - filter_height) / stride + 1)
    i0 = np.arange(filter_height)
    i0 = np.tile(i0, channels)
    i1 = stride * np.arange(out_height)
    i = i0.reshape(-1, 1) + i1.reshape(1, -1)
    k = np.repeat(np.arange(channels), filter_height).reshape(-1, 1)
    return (i, k)
    
def image_to_column(images, filter_shape, stride, pad='same'):
    filter_height = filter_shape
    pad_h = get_padding(filter_shape, pad)
    images_padded = np.pad(images, ((0, 0), pad_h, (0, 0)), mode='constant')
    i, k = images_to_column_indices(images.shape, filter_shape, (pad_h), stride)
    cols = images_padded[:, i, k]
    channels = images.shape[2]
    return cols.reshape(filter_height * channels, -1)

def get_shape(x, filter_shape, stride, pad):
    _, height, _ = x.shape
    pad_h = get_padding(filter_shape, pad)
    output_height = (height + np.sum(pad_h) - filter_shape) / stride + 1
    return int(output_height)

def column_to_image(cols, images_shape, filter_shape, stride, pad='same'):
    batch_size, height, channels = images_shape
    pad_h = get_padding(filter_shape, pad)
    height_padded = height + np.sum(pad_h)
    images_padded = np.empty((batch_size, height_padded, channels))
    i, k = images_to_column_indices(images_shape, filter_shape, (pad_h), stride)
    cols = cols.reshape(channels * np.prod(filter_shape), -1, batch_size)
    cols = cols.transpose(2,0,1)
    np.add.at(images_padded, (slice(None), i, k), cols)
    return images_padded[:, pad_h[0]:height+pad_h[0], :]

def conv_forward(X, W, stride=1, pad='same'):
    filter_shape = (W.shape[0])
    n_filter = W.shape[2]
    X_col = image_to_column(X, filter_shape, stride=stride, pad=pad)
    W_col = W.reshape((n_filter, -1))
    output = W_col.dot(X_col)
    out=get_shape(X, filter_shape, stride, pad)
    return output.reshape((X.shape[0],out,n_filter)), (filter_shape, n_filter, X_col, W_col,stride, pad)

def conv_backward(X, W, dout, cached):
    filter_shape, n_filter, X_col, W_col,stride, pad = cached
    db = np.sum(dout, axis=(0, 1)).reshape(n_filter)
    dout_reshaped = dout.transpose(1, 2, 0).reshape(n_filter, -1)
    dW = dout_reshaped.dot(X_col.T)
    dW = dW.reshape(W.shape)
    W_reshape = W.reshape(n_filter, -1)
    dX_col = W_reshape.T.dot(dout_reshaped)
    dX = column_to_image(dX_col, X.shape, filter_shape, stride, pad)
    return dX, dW, db

def cross_entropy(Y_hat, Y, epsilon=1e-12):
    Y_hat = np.clip(Y_hat, epsilon, 1. - epsilon)
    N = Y_hat.shape[0]
    return -np.sum(np.sum(Y * np.log(Y_hat+1e-9))) / N

def softmax(x):
    exp_scores = np.exp(x - np.max(x))
    return exp_scores / (np.sum(exp_scores, axis=1, keepdims=True) + 1e-8)

def relu(X, grad=False):
    if grad:
        return np.where(X >= 0, 1, 0)
    else:
        return np.where(X >= 0, X, 0)

In [11]:
filter_size = 3
stride = 2
epoch = 30
learning_rate = 1e-7
starting_dimension = X[0].shape[1]
kernel_1 = np.random.randn(filter_size, starting_dimension, 16) / np.sqrt(starting_dimension)
bias_1 = np.zeros(16)
kernel_2 = np.random.randn(filter_size, 16, 32) / np.sqrt(16)
bias_2 = np.zeros(32)
kernel_3 = np.random.randn(filter_size, 32, 64) / np.sqrt(32)
bias_3 = np.zeros(64)
h = int(np.ceil(X[0].shape[0] / 2**3))
w_1 = np.random.randn(h * 64, 128) / np.sqrt(h * 64)
b_1 = np.zeros(128)
w_2 = np.random.randn(128, labels.shape[0]) / np.sqrt(128)
b_2 = np.zeros(labels.shape[0])

In [12]:
for i in range(epoch):
    conv1, cached1 = conv_forward(X, kernel_1, 2)
    conv1 = conv1 + bias_1
    z1 = relu(conv1,False)
    conv2, cached2 = conv_forward(z1, kernel_2, 2)
    z2 = relu(conv2,False)
    conv3, cached3 = conv_forward(z2, kernel_3, 2)
    conv3 = conv3 + bias_3
    z3 = relu(conv3,False)
    z3_reshape = z3.reshape((-1, h * 64))
    fully1 = np.dot(z3_reshape, w_1) + b_1
    z4 = relu(fully1,False)
    logits = np.dot(z4, w_2) + b_2
    probs = softmax(logits)
    accuracy = np.mean(np.argmax(probs,axis=1) == Y)
    loss = cross_entropy(probs, onehot)
    delta = probs
    delta[range(Y.shape[0]), Y] -= 1
    dw_2 = np.dot(z4.T, delta)
    db_2 = np.sum(delta,axis=0)
    dz4 = np.dot(delta,w_2.T)
    dfully1 = relu(fully1, True) * dz4
    dw_1 = np.dot(z3_reshape.T, dfully1)
    db_1 = np.sum(dfully1,axis=0)
    dz3_reshape = np.dot(dfully1,w_1.T)
    dz3 = dz3_reshape.reshape((-1, h, 64))
    dconv3 = relu(conv3, True) * dz3
    dz2, dkernel_3, dbias_3 = conv_backward(z2, kernel_3, dconv3, cached3)
    dconv2 = relu(conv2, True) * dz2
    dz1, dkernel_2, dbias_2 = conv_backward(z1, kernel_2, dconv2, cached2)
    dconv1 = relu(conv1, True) * dz1
    _, dkernel_1, dbias_1 = conv_backward(X, kernel_1, dconv1, cached1)
    kernel_1 -= learning_rate * dkernel_1
    bias_1 -= learning_rate * dbias_1
    kernel_2 -= learning_rate * dkernel_2
    bias_2 -= learning_rate * dbias_2
    kernel_3 -= learning_rate * dkernel_3
    bias_3 -= learning_rate * dbias_3
    w_1 -= learning_rate * dw_1
    b_1 -= learning_rate * db_1
    w_2 -= learning_rate * dw_2
    b_2 -= learning_rate * db_2
    print('epoch %d, loss %f, accuracy %f'%(i+1, loss, accuracy))

epoch 1, loss 20.722266, accuracy 0.200000
epoch 2, loss 1.945910, accuracy 0.142857
epoch 3, loss 1.945910, accuracy 0.142857
epoch 4, loss 1.945910, accuracy 0.142857
epoch 5, loss 1.945910, accuracy 0.142857
epoch 6, loss 1.945910, accuracy 0.142857
epoch 7, loss 1.945910, accuracy 0.142857
epoch 8, loss 1.945910, accuracy 0.142857
epoch 9, loss 1.945910, accuracy 0.142857
epoch 10, loss 1.945910, accuracy 0.142857
epoch 11, loss 1.945910, accuracy 0.142857
epoch 12, loss 1.945910, accuracy 0.142857
epoch 13, loss 1.945910, accuracy 0.142857
epoch 14, loss 1.945910, accuracy 0.142857
epoch 15, loss 1.945910, accuracy 0.142857
epoch 16, loss 1.945910, accuracy 0.142857
epoch 17, loss 1.945910, accuracy 0.142857
epoch 18, loss 1.945910, accuracy 0.142857
epoch 19, loss 1.945910, accuracy 0.142857
epoch 20, loss 1.945910, accuracy 0.142857
epoch 21, loss 1.945910, accuracy 0.142857
epoch 22, loss 1.945910, accuracy 0.142857
epoch 23, loss 1.945910, accuracy 0.142857
epoch 24, loss 1.94