In [2]:
import wave
import glob
import random

# for data, model, training
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from scipy import signal

import librosa
import librosa.display

# for visuals and statistics
import matplotlib.pyplot as plt
import seaborn as sns

# Set the seed value for experiment reproducibility.
seed = 42
random.seed(42)
tf.random.set_random_seed(seed)
np.random.seed(seed)

In [5]:
def get_and_shuffle_filenames(dir_name):
    filenames = glob.glob(str(data_dir) + "/*")
    random.shuffle(filenames)
    return filenames

data_dir = "./recordings"
filenames = get_and_shuffle_filenames(data_dir)

print(filenames[:5])

['./recordings/2_jackson_13.wav', './recordings/6_george_34.wav', './recordings/7_george_5.wav', './recordings/1_yweweler_21.wav', './recordings/2_george_42.wav']


In [6]:
# https://www.tensorflow.org/tutorials/audio/simple_audio

def decode_audio(file_path):
    # read file to get buffer                                                                                               
    ifile = wave.open(file_path)
    samples = ifile.getnframes()
    audio = ifile.readframes(samples)

    # convert buffer to float32 using NumPy                                                                                 
    audio_as_np_int16 = np.frombuffer(audio, dtype=np.int16)
    audio_as_np_float32 = audio_as_np_int16.astype(np.float32)
    
    # get largest absolute value
    max_val = np.max(
        np.absolute(
            [np.max(audio_as_np_float32), np.min(audio_as_np_float32)]))
    audio_normalized = audio_as_np_float32 / max_val

    return audio_normalized

def get_label(file_path):
    # label is in the filename
    parts = file_path.split("/")
    label = int(parts[2].split("_")[0])

    return label


In [7]:
# to remove outliers 
X_unfiltered = [(file_path, decode_audio(file_path)) for file_path in filenames]
X_lengths = [audio.shape[0] for _, audio in X_unfiltered]

max_length = int(np.mean(X_lengths) + 2 * np.std(X_lengths))
print(np.mean(X_lengths))
print(np.std(X_lengths))
print(max_length)

3499.4746666666665
1180.9471707171701
5861


In [24]:
def spect(signal):
    spectogram = np.array([[]])
    for i in range(23):
        window_fft = np.fft.rfft(signal[i * 256: (i + 1) * 256])[:-1]
        window_fft = np.abs(window_fft)
        spectogram = np.append(spectogram, window_fft)
        break
    spectogram = np.array(spectogram)
    spectogram = librosa.amplitude_to_db(spectogram, ref=np.max)
    return spectogram

In [25]:
# padding function from
# https://towardsdatascience.com/audio-deep-learning-made-simple-sound-classification-step-by-step-cebc936bbe5


_, x_val = X_unfiltered[0]

# pad_len = max_length - len(x_val)

# x_val = np.pad(
#     x_val, (0, pad_len), 
#     'constant', constant_values=(0, 0))

spec_x = spect(x_val)

print(spec_x[:129])



0.10676692
[-35.41301618 -42.16741971 -31.63574968 -21.76387435 -21.79612863
 -26.29768879 -23.37199405 -18.71216037 -15.35405817  -9.6237314
  -6.62602895  -8.2958434   -3.00586492  -8.10320372  -7.37317991
  -8.87296058 -20.49259586 -14.66618063 -24.08589822 -14.6035751
 -13.08654499  -8.38684294 -14.73949728 -14.26050398 -14.72832629
 -26.29526913 -27.21366248 -16.69194615 -23.87418149 -27.95202755
 -23.96824344 -44.77013932 -29.54058586 -39.68835259 -39.90999706
 -38.70985044 -33.39488708 -35.72605273 -34.69945898 -35.7012483
 -50.92025742 -38.74234902 -37.28215563 -34.8208483  -37.2785184
 -38.63429601 -35.69362118 -37.07800429 -32.11945928 -32.94708966
 -27.1484454  -31.50115825 -33.01352157 -26.37374978 -39.11115952
 -14.12997115 -12.11358638 -19.75855015 -10.7945241  -25.56779663
 -14.86125714 -19.43645126 -26.87327767 -24.09609453 -23.11180057
 -21.62845507 -36.50595466 -21.17892605 -30.00400271 -28.79202965
 -35.71601624 -26.24820308 -28.26067449 -24.07737321 -17.9656935
 -21

In [25]:
np.fft.rfft([0, 1, 2, 3, 4, 5, 6, 7])

array([28.+0.j        , -4.+9.65685425j, -4.+4.j        , -4.+1.65685425j,
       -4.+0.j        ])

In [26]:
abs(np.fft.rfft([0, 1, 2, 3, 4, 5, 6, 7]))

array([28.        , 10.45250372,  5.65685425,  4.3295688 ,  4.        ])