## Introduction

In [1]:
# We prepare a dataset of speech samples from different speakers, with the speaker as label.
# We add background noiseto these samples to augment our data
# We take the FFT of these samples
# We train a 1D convnet to predict the correct speaker given a noisy FFt speech sample

In [2]:
# Note

# Tenso

## Set up

In [3]:
import os
import shutil
import numpy as np

import tensorflow as tf
from tensorflow import keras

from pathlib import Path
from IPython.display import display, Audio

In [4]:
# Get the data from https://www.kaggle.com/kongaevans/speaker-recognition-dataset/download
# and save it to the 'Downloads' folder in your HOME directory

DATASET_ROOT = os.path.join( "./audio_train/16000_pcm_speeches")

# The folders in which we will put the audio samples and the noise samples
AUDIO_SUBFOLDER = "audio"
NOISE_SUBFOLDER = "noise"

DATASET_AUDIO_PATH = os.path.join(DATASET_ROOT, AUDIO_SUBFOLDER)
DATASET_NOISE_PATH = os.path.join(DATASET_ROOT, NOISE_SUBFOLDER)

In [5]:
DATASET_ROOT

'./audio_train/16000_pcm_speeches'

## DOWNLOAD DATA MANUALLY FROM KAGGLE

In [6]:
# Percentage of samples to use for validation
valid_split = 0.1

# Seed to use when shuffling the dataset and the noise
shuffle_seed = 43

In [7]:
# The sampling rate to use
# This is the one used in all of  the audio samples
# We will resample all of the noise to this sampling rate
# This will also be the output size of the audio wave samples
# Since all samples are of 1 Second long

# The factor to multiply the noise with according to :
# noisy_sample = sample + noise * prop * scale
# Where prop = sample_amplitude / noise_amplitude
SAMPLING_RATE = 16000


scale = 0.5

batch_size = 128
epochs = 100

In [8]:
AUDIO_SUBFOLDER = "audio"
NOISE_SUBFOLDER = "noise"
DATASET_AUDIO_PATH = os.path.join(DATASET_ROOT, AUDIO_SUBFOLDER)
DATASET_NOISE_PATH = os.path.join(DATASET_ROOT, NOISE_SUBFOLDER)

# Percentage of samples to use for validation
VALID_SPLIT = 0.1

# Seed to use when shuffling the dataset and the noise
SHUFFLE_SEED = 43

# (since all samples are of 1 second long)
SAMPLING_RATE = 16000

# The factor to multiply the noise with according to:
#   noisy_sample = sample + noise * prop * scale
#      where prop = sample_amplitude / noise_amplitude
SCALE = 0.5

BATCH_SIZE = 128
EPOCHS = 100



## Data Preparation

In [9]:
# The dataset is composed of 7 folders, divided into 2 groups\

# Speech samples, with 5 folders for 5 different speakers, Each folder contains 1500 audio files, 
     # each 1 second long and sampled at 16000 Hz.
    
# Background noise samples, with 2 folders and a total of 6 files.
  # These files are longer than 1 second and originally not sampled at 16000 Hz , but we will resample them to 16000 Hz.
  # We will use those 6 files to create 354 1-second-long noise samples to be used for training.

# Lets sort these 2 categories into 2 folders:

# An audio folcer which will contain all the per-speaker speech sample folders
# A noise folder which contain all the noise samples.

os.listdir()

['.ipynb_checkpoints',
 'audio_train',
 'ckpt',
 'image_classification_cat_dog.ipynb',
 'kagglecatsanddogs_3367a',
 'kagglecatsanddogs_3367a.zip',
 'model.png',
 'MSR-LA - 3467.docx',
 'Multi_gpu_and_distributed_training_using_Mnsist_Data.ipynb',
 'Speaker_Recognition.ipynb',
 'SPEAKER_RECOGNITION_KAGGLE.ipynb',
 'Using_callbacks_to_ensure_fault_tolerance.ipynb']

In [10]:
print(DATASET_ROOT, " -->  is ROOT DIrectory")
print(DATASET_AUDIO_PATH , "  --> is Audio data set path ")
print(DATASET_NOISE_PATH , "  --> is Noise Data set Path")

./audio_train/16000_pcm_speeches  -->  is ROOT DIrectory
./audio_train/16000_pcm_speeches\audio   --> is Audio data set path 
./audio_train/16000_pcm_speeches\noise   --> is Noise Data set Path


In [11]:
# If folder "audio" does not exist, create it, otherwise do nothing
if os.path.exists(DATASET_AUDIO_PATH) is False:
    os.makedirs(DATASET_AUDIO_PATH)

# If folder 'noiose', does not exist, create it, otherwise do nothing
if os.path.exists(DATASET_NOISE_PATH) is False:
    os.makedirs(DATASET_NOISE_PATH)
    
for folder in os.listdir(DATASET_ROOT):
    if os.path.isdir(os.path.join(DATASET_ROOT, folder)):
        if folder in [AUDIO_SUBFOLDER, NOISE_SUBFOLDER]:
            # if folder is audio or noise do nothing
            continue
        elif folder in ["other", "_background_noise_"]:
            # If folder is one of the folders that contains noise samples,
            # move it to the 'noise' folder
            shutil.move(os.path.join(DATASET_ROOT, folder),
                       os.path.join(DATASET_NOISE_PATH, folder))
        else:
            # otherwise, ot should be a speaker folder, then move it to 
            # audio folder
            shutil.move(
                os.move(os.path.join(DATASET_ROOT, folder),
                os.path.join(DATASET_AUDIO_PATH, folder)))

In [12]:
os.listdir()

['.ipynb_checkpoints',
 'audio_train',
 'ckpt',
 'image_classification_cat_dog.ipynb',
 'kagglecatsanddogs_3367a',
 'kagglecatsanddogs_3367a.zip',
 'model.png',
 'MSR-LA - 3467.docx',
 'Multi_gpu_and_distributed_training_using_Mnsist_Data.ipynb',
 'Speaker_Recognition.ipynb',
 'SPEAKER_RECOGNITION_KAGGLE.ipynb',
 'Using_callbacks_to_ensure_fault_tolerance.ipynb']

## Noise Preparation

In [13]:
# Noise Preparation 

# In this section :
     # We load all noise samples (which should have beed resamples to 16000)
    # We split those noise samples to chunks of 16000 samples which correspond to 1 
    # second duration each
    

In [14]:
os.listdir()

['.ipynb_checkpoints',
 'audio_train',
 'ckpt',
 'image_classification_cat_dog.ipynb',
 'kagglecatsanddogs_3367a',
 'kagglecatsanddogs_3367a.zip',
 'model.png',
 'MSR-LA - 3467.docx',
 'Multi_gpu_and_distributed_training_using_Mnsist_Data.ipynb',
 'Speaker_Recognition.ipynb',
 'SPEAKER_RECOGNITION_KAGGLE.ipynb',
 'Using_callbacks_to_ensure_fault_tolerance.ipynb']

In [15]:
# Get the list of all noise files
noise_paths = []
for subdir in os.listdir(DATASET_NOISE_PATH):
    subdir_path = Path(DATASET_NOISE_PATH)/ subdir
    print(subdir_path, "--")
    
    if os.path.isdir(subdir_path):
        noise_paths += [os.path.join(subdir_path, filepath)
                       for filepath in os.listdir(subdir_path)
                       if filepath.endswith(".wav")] # list comprehensive
print(
    "Found {} files belonging to {} directories".format(len(noise_paths), len(os.listdir(DATASET_NOISE_PATH))))


audio_train\16000_pcm_speeches\noise\other --
audio_train\16000_pcm_speeches\noise\_background_noise_ --
Found 6 files belonging to 2 directories


## Do this in linux 
**My Pc is in Window so i do directly in linux**

In [16]:
#    command = (
    #    "for dir in `ls -1 " + DATASET_NOISE_PATH + "`; do "
    #    "for file in `ls -1 " + DATASET_NOISE_PATH + "/$dir/*.wav`; do "
    #    "sample_rate=`ffprobe -hide_banner -loglevel panic -show_streams "
    #    "$file | grep sample_rate | cut -f2 -d=`; "
    #    "if [ $sample_rate -ne 16000 ]; then "
    #    "ffmpeg -hide_banner -loglevel panic -y "
    #    "-i $file -ar 16000 temp.wav; "
    #    "mv temp.wav $file; "
    #    "fi; done; done"
#    )
#    
#    os.system(command)

In [17]:
# Do this im Linux
# Need to install program in Linux 

# sudo apt install ffmpeg

# My linux Code / shell script program


#    for dir in `ls -1 ./16000_pcm_speeches/noise/ `; do 
   #     for file in `ls -1 ./16000_pcm_speeches/noise/$dir/*.wav`; do
   #     sample_rate=`ffprobe -hide_banner -loglevel panic -show_streams $file | grep sample_rate | cut -f2 -d=`;
   #     if [ $sample_rate -ne 16000 ] ; then
   #     ffmpeg -hide_banner -loglevel panic -y -i $file -ar 16000 temp.wav;
   #     mv temp.wav $file;
   #     fi;done; done;#

In [18]:
print(DATASET_NOISE_PATH)

./audio_train/16000_pcm_speeches\noise


In [19]:
# Split Noise into chunks of 1600 each
def load_noise_sample(path):
    sample, sampling_rate = tf.audio.decode_wav(tf.io.read_file(path),
                                               desired_channels = 1)
    if sampling_rate == sampling_rate:
        # Number of slices of 16000 each that can be generated from the noise sample
        slices = int(sample.shape[0] / sampling_rate)
        sample = tf.split(sample[:slices * sampling_rate], slices)
        return sample
    else:
        print("Sampling rate for {} is incorrect. Ignoring it ".format(path))
        return None

In [20]:
noises = []
for path in noise_paths:
    sample = load_noise_sample(path)
    if sample:
        noises.extend(sample)
noises = tf.stack(noises)

print("{} noise files were split into {} noise where each is {}\
        sec. long".format(len(noise_paths), noises.shape[0], noises.shape[1]// SAMPLING_RATE))

6 noise files were split into 354 noise where each is 1        sec. long


## Dataset generator

In [21]:
def paths_and_labels_to_dataset(audio_paths, labels):
    """
    Constructs a dataset of audios and labels.
    """
    path_ds = tf.data.Dataset.from_tensor_slices(audio_paths)
    audio_ds = path_ds.map(lambda x : path_to_audio(x))
    label_ds = tf.data.Dataset.from_tensor_slices(labels)
    
    return tf.data.Dataset.zip((audio_ds, label_ds))

In [22]:
def path_to_audio(path):
    """
    Reads and decodes an audio file.
    """
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio,  1, SAMPLING_RATE)
    
    return audio

In [23]:
def add_noise(audio, noises = None, scale = 0.5):
    
    if noises is not None:
        # Create a random tensor of the same size as audio ranging from 
        # o to the number of noise strem samples that we have.
        
        tf_rnd = tf.random.uniform(
        (tf.shape(audio)[0], ), 0, noises.shape[0], dtype=tf.int32)
        noise = tf.gather(noises, tf_rnd, axis = 0)
        
        # Get the amplitude proportion between the audio and the noise
        prop = tf.math.reduce_max(audio, axis = 1)/ tf.math.reduce_max(noise, axis=1)
        prop = tf.repeat(tf.expand_dims(prop, axis=1), tf.shape(audio)[1], axis = 1)
        
        # Adding the rescaled noise to audio 
        audio = audio + noise * prop * scale
        
    return audio

In [25]:
def audio_to_fft(audio):
    # Since tf.signal.fft applies FFT on  the intermost dimension,
    # We need to squuze the dimensions and then expand them again
    # after FFT 
    audio = tf.squeeze(audio, axis = -1)
    fft = tf.signal.fft(
    tf.cast(tf.complex(real = audio, image = tf.zeros_like(audio)), tf.complex64))
    fft = tf.expand_dims(fft, axis = -1)
    
    # Return the absolute value of the first half of the FFT
    # which represents the positive frequencies
    
    return tf.math.abs(fft[:, :(audio.shape[1]//2), :])

# Get the list of audio file paths along with their corresponding labels

class_names = os.listdir(DATASET_AUDIO_PATH)

# Get the list of audio file paths along with their corresponding labels

class_names = os.listdir(DATASET_AUDIO_PATH)
print("Our class names: {}".format(class_names,))

audio_paths = []
labels = []

for label, name in enumerate(class_names):
    print("Processing speaker {}".format(class_names,))
    dir_path = Path(DATASET_AUDIO_PATH) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in os.listdir(dir_path) if filepath.endswith(".wav")]
    audio_paths += speaker_sample_paths
    labels += [label] * len(speaker_sample_paths)
    
    print("Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names)))
    
    # Shuffle 
    rng = np.random.RandomState(shuffle_seed)
    rng.shuffle(audio_paths)
    
    rng = np.random.RandomState(shuffle_seed)
    rng.shuffle(labels)
    
    # Split into training and validation
    num_val_samples = int(valid_split * len(audio_paths))
    print("Using {} files for training.".format(len(audio_paths) - num_val_samples))
    train_audio_paths = audio_paths[:-num_val_samples]
    
    train_labels = labels[:-num_val_samples]
    
    print("Using {} files for validation.".format(num_val_samples))
    valid_audio_paths = audio_paths[-num_val_samples:]
    valid_labels = labels[-num_val_samples:]
    
    # Create 2 datasets, one for training and the other for validation 
    train_ds = paths_and_labels_to_dataset(train_audio_paths, train_labels)
    train_ds = train_ds.shuffle(buffer_size = batch_size * 8, seed = shuffle_seed).batch(batch_size)
    
    
    valid_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
    valid_ds = valid_ds.shuffle(buffer_size = 32 *8 , seed = shuffle_seed).batch(32)
    
    ## Add noise to the training set
    train_ds = train_ds.map(lambda x,y : (add_noise(x, noises, scale = scale), y),
                           num_parallel_calls = tf.data.experimental.AUTOTUNE,)
    
    # Transform audio wave to the frequency domain using ' audio_to_fft'
    
    train_ds = train_ds.map(lambda x,y : (audio_to_fft(x),y), num_parallel_calls = tf.data.experimental.AUTOTUNE)
    train_ds = train_ds.prefetch(tf.data.experimental.AUTOTUNE)
    
    valid_ds = valid_ds.map(
    lambda x,y : (audio_to_fft(x), y), num_parallel_cells = tf.data.experimental.AUTOTUNE)
    valid_ds = valid_ds.prefetch(tf.data.experimental.AUTOTUNE)

Our class names: ['Benjamin_Netanyau', 'Jens_Stoltenberg', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela']
Processing speaker ['Benjamin_Netanyau', 'Jens_Stoltenberg', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela']
Found 1500 files belonging to 5 classes.
Using 1350 files for training.
Using 150 files for validation.


TypeError: in user code:

    <ipython-input-25-9f5836676306>:70 None  *
        lambda x,y : (audio_to_fft(x),y)
    <ipython-input-24-3b2b4b3ec837>:6 audio_to_fft  *
        fft = tf.signal.fft(
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper  **
        return target(*args, **kwargs)

    TypeError: complex() got an unexpected keyword argument 'image'


In [None]:
shuffle_seed

In [None]:
def paths_and_labels_to_dataset(audio_paths, labels):
    """Constructs a dataset of audios and labels."""
    path_ds = tf.data.Dataset.from_tensor_slices(audio_paths)
    audio_ds = path_ds.map(lambda x: path_to_audio(x))
    label_ds = tf.data.Dataset.from_tensor_slices(labels)
    return tf.data.Dataset.zip((audio_ds, label_ds))


def path_to_audio(path):
    """Reads and decodes an audio file."""
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
    return audio


def add_noise(audio, noises=None, scale=0.5):
    if noises is not None:
        # Create a random tensor of the same size as audio ranging from
        # 0 to the number of noise stream samples that we have.
        tf_rnd = tf.random.uniform(
            (tf.shape(audio)[0],), 0, noises.shape[0], dtype=tf.int32
        )
        noise = tf.gather(noises, tf_rnd, axis=0)

        # Get the amplitude proportion between the audio and the noise
        prop = tf.math.reduce_max(audio, axis=1) / tf.math.reduce_max(noise, axis=1)
        prop = tf.repeat(tf.expand_dims(prop, axis=1), tf.shape(audio)[1], axis=1)

        # Adding the rescaled noise to audio
        audio = audio + noise * prop * scale

    return audio


def audio_to_fft(audio):
    # Since tf.signal.fft applies FFT on the innermost dimension,
    # we need to squeeze the dimensions and then expand them again
    # after FFT
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)

    # Return the absolute value of the first half of the FFT
    # which represents the positive frequencies
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])


# Get the list of audio file paths along with their corresponding labels

class_names = os.listdir(DATASET_AUDIO_PATH)
print("Our class names: {}".format(class_names,))

audio_paths = []
labels = []
for label, name in enumerate(class_names):
    print("Processing speaker {}".format(name,))
    dir_path = Path(DATASET_AUDIO_PATH) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in os.listdir(dir_path)
        if filepath.endswith(".wav")
    ]
    audio_paths += speaker_sample_paths
    labels += [label] * len(speaker_sample_paths)

print(
    "Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names))
)

# Shuffle
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(audio_paths)
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(labels)

# Split into training and validation
num_val_samples = int(VALID_SPLIT * len(audio_paths))
print("Using {} files for training.".format(len(audio_paths) - num_val_samples))
train_audio_paths = audio_paths[:-num_val_samples]
train_labels = labels[:-num_val_samples]

print("Using {} files for validation.".format(num_val_samples))
valid_audio_paths = audio_paths[-num_val_samples:]
valid_labels = labels[-num_val_samples:]

# Create 2 datasets, one for training and the other for validation
train_ds = paths_and_labels_to_dataset(train_audio_paths, train_labels)
train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(
    BATCH_SIZE
)

valid_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
valid_ds = valid_ds.shuffle(buffer_size=32 * 8, seed=SHUFFLE_SEED).batch(32)


# Add noise to the training set
train_ds = train_ds.map(
    lambda x, y: (add_noise(x, noises, scale=SCALE), y),
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
)

# Transform audio wave to the frequency domain using `audio_to_fft`
train_ds = train_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.experimental.AUTOTUNE
)
train_ds = train_ds.prefetch(tf.data.experimental.AUTOTUNE)

valid_ds = valid_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.experimental.AUTOTUNE
)
valid_ds = valid_ds.prefetch(tf.data.experimental.AUTOTUNE)

## Model Definition

In [None]:
def residual_blocK(x, filters, conv_num = 3, activation = "relu")