In [2]:
import wave
import glob
import random

# for data, model, training
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from scipy import signal

import librosa
import librosa.display

# for visuals and statistics
import matplotlib.pyplot as plt
import seaborn as sns

# Set the seed value for experiment reproducibility.
seed = 42
random.seed(42)
tf.random.set_random_seed(seed)
np.random.seed(seed)

In [4]:
def get_and_shuffle_filenames(dir_name):
    filenames = glob.glob(str(data_dir) + "/*")
    random.shuffle(filenames)
    return filenames

data_dir = "./recordings"
filenames = get_and_shuffle_filenames(data_dir)

print(filenames[:5])

['./recordings/2_jackson_13.wav', './recordings/6_george_34.wav', './recordings/7_george_5.wav', './recordings/1_yweweler_21.wav', './recordings/2_george_42.wav']


In [5]:
# https://www.tensorflow.org/tutorials/audio/simple_audio

def decode_audio(file_path):
    # read file to get buffer                                                                                               
    ifile = wave.open(file_path)
    samples = ifile.getnframes()
    audio = ifile.readframes(samples)

    # convert buffer to float32 using NumPy                                                                                 
    audio_as_np_int16 = np.frombuffer(audio, dtype=np.int16)
    audio_as_np_float32 = audio_as_np_int16.astype(np.float32)
    
    # get largest absolute value
    max_val = np.max(
        np.absolute(
            [np.max(audio_as_np_float32), np.min(audio_as_np_float32)]))
    audio_normalized = audio_as_np_float32 / max_val

    return audio_normalized

def get_label(file_path):
    # label is in the filename
    parts = file_path.split("/")
    label = int(parts[2].split("_")[0])

    return label


In [52]:
# to remove outliers 
X_unfiltered = [(file_path, decode_audio(file_path)) for file_path in filenames]
X_lengths = [audio.shape[0] for _, audio in X_unfiltered]

max_length = int(np.mean(X_lengths) + 2 * np.std(X_lengths))
print(np.mean(X_lengths))
print(np.std(X_lengths))
print(max_length)
max_length = 5888

3499.4746666666665
1180.9471707171701
5861


In [53]:
def spect(signal):
    spectogram = np.array([])
    for i in range(23):
        window_fft = np.fft.rfft(signal[i * 256: (i + 1) * 256])[:-1]
        window_fft = np.abs(window_fft)
        spectogram = np.append(spectogram, window_fft, axis=0)
    spectogram = np.array(spectogram)
    spectogram = librosa.amplitude_to_db(spectogram, ref=np.max)
    return spectogram

In [54]:
# padding function from
# https://towardsdatascience.com/audio-deep-learning-made-simple-sound-classification-step-by-step-cebc936bbe5

X_full = [] # padded X values
y_full = []

numbers = [0] * 10

for file_path, audio in X_unfiltered:
    x_val = audio
    y_val = get_label(file_path)
    
    if (y_val > 7): continue
    
    signal_length = audio.shape[0]
    if signal_length > max_length:
        numbers[y_val] += 1
    else:
        pad_len = max_length - signal_length
        
        x_val = np.pad(
            x_val, (0, pad_len), 
            'constant', constant_values=(0, 0))
        
        spect_x = spect(x_val)
#         spect_x = spect_x.flatten()
        
        X_full.append(spect_x)
        y_full.append(y_val)

X_full = np.array(X_full)
y_full = np.array(y_full)

num_samples, sample_w = X_full.shape
print(num_samples)
print(sample_w)
print(y_full[:10])

2329
2944
[2 6 7 1 2 6 6 4 3 2]


In [55]:
# quantities = {"y": list(range(10)), "quantities": numbers}
df = pd.DataFrame.from_dict({"quantities": numbers})
print(df)
print(sum(numbers))

   quantities
0          10
1           8
2           6
3           5
4           3
5           6
6          22
7          11
8           0
9           0
71


In [56]:
mean = np.mean(X_full)
std = np.std(X_full)
X_full = X_full - mean
X_full = X_full / std

In [57]:
# rows = 3
# cols = 3
# n = rows * cols
# fig, axes = plt.subplots(rows, cols, figsize=(12, 14))

# for i, (audio, label) in enumerate(list(zip(X_full, y_full))[:n]):
#     r = i // cols
#     c = i % cols
#     ax = axes[r][c]
    
#     print(audio)
    
#     librosa.display.specshow(audio, y_axis='mel', fmax=8000, x_axis='time', ax=axes[r][c]);
# #     plt.title('Mel Spectrogram');
# #     plt.colorbar(format='%+2.0f dB');
    
# #     ax.plot(audio)
# #     ax.set_yticks(np.arange(-1,1.5,0.5))
#     ax.set_title(label)

# plt.show()

In [58]:
tenth = int(num_samples * 0.1)
eightyth = tenth * 8

X_train = X_full[:eightyth]
y_train = y_full[:eightyth]

X_val = X_full[eightyth: eightyth + tenth]
y_val = y_full[eightyth: eightyth + tenth]

X_test = X_full[eightyth + tenth:]
y_test = y_full[eightyth + tenth:]

print('Training set size', len(X_train))
print('Validation set size', len(X_val))
print('Test set size', len(X_test))

Training set size 1856
Validation set size 232
Test set size 241


In [59]:
flattened_data = {}
i = 0
for x, y in zip(X_test, y_test):
    flattened_data[i] = np.concatenate(([y], x.flatten()))
    i += 1
    
df = pd.DataFrame.from_dict(flattened_data, orient='index')
print(df.head())
df.to_csv("my_model_test.csv")

   0         1         2         3         4         5         6         7     \
0   4.0 -0.517063 -0.145395  0.345380  0.014779 -0.157025 -0.103307 -0.191867   
1   2.0  0.787242  0.590388  0.147761  0.202116  0.831150  0.706009  0.825241   
2   7.0  0.246638 -0.295067  0.013261  0.180662  0.257957  0.020789  0.279846   
3   1.0  0.172252  0.707328  1.043602  1.292135  1.502515  2.028182  2.307660   
4   4.0  1.042133  1.023080  0.802510  0.967628  1.978160  1.311670  1.222563   

       8         9     ...      2935      2936      2937      2938      2939  \
0 -0.457938 -0.199374  ... -0.993144 -0.993144 -0.993144 -0.993144 -0.993144   
1  1.031756  0.763968  ... -0.993144 -0.993144 -0.993144 -0.993144 -0.993144   
2  0.320586  0.111707  ... -0.993144 -0.993144 -0.993144 -0.993144 -0.993144   
3  1.662410  1.580116  ... -0.993144 -0.993144 -0.993144 -0.993144 -0.993144   
4  1.444402  2.176260  ... -0.993144 -0.993144 -0.993144 -0.993144 -0.993144   

       2940      2941      2942 