### Train a model to recognize speakers

Download the training data from: https://www.kaggle.com/datasets/kongaevans/speaker-recognition-dataset

Examples: https://keras.io/examples/audio/speaker_recognition_using_cnn/
https://www.kaggle.com/code/masoudmzb/gradient-tape-tutorial-audio-proccesing-example/notebook

In [1]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import shutil
import numpy as np

import tensorflow as tf
import keras
import os
from pathlib import Path
from IPython.display import display, Audio

In [2]:
# Folder holding the downloaded dataset, relative to the working directory.
DATASET_ROOT = "data"

In [3]:
# --- Dataset layout ---
# Names of the two sub-folders of the dataset root, and their full paths.
AUDIO_SUBFOLDER = "audio"
NOISE_SUBFOLDER = "noise"
DATASET_AUDIO_PATH = os.path.join(DATASET_ROOT, AUDIO_SUBFOLDER)
DATASET_NOISE_PATH = os.path.join(DATASET_ROOT, NOISE_SUBFOLDER)

# --- Audio ---
# Target sampling rate in Hz. Noise files are cut into clips of this many
# samples, i.e. one second each.
SAMPLING_RATE = 16_000

# Noise mixing factor, applied as:
#   noisy_sample = sample + noise * prop * scale
#      where prop = sample_amplitude / noise_amplitude
SCALE = 0.5

# --- Splitting / shuffling ---
# Fraction (not percent) of the samples held out for validation.
VALID_SPLIT = 0.1
# Seed used when shuffling the dataset and the noise.
SHUFFLE_SEED = 43

# --- Training ---
BATCH_SIZE = 128
EPOCHS = 1

In [4]:
# Sort the extracted dataset into `audio/` (speaker folders) and `noise/`
# (background-noise folders).
#
# Create both targets up front. Without them `shutil.move` cannot simply rename
# the source folder and falls back to copying every file and deleting the
# original, which is much slower for the large speaker folders.
os.makedirs(DATASET_AUDIO_PATH, exist_ok=True)
os.makedirs(DATASET_NOISE_PATH, exist_ok=True)

for folder in os.listdir(DATASET_ROOT):
    source_path = os.path.join(DATASET_ROOT, folder)
    if not os.path.isdir(source_path):
        # Loose files in the dataset root are left where they are.
        continue
    if folder in [AUDIO_SUBFOLDER, NOISE_SUBFOLDER]:
        # If folder is `audio` or `noise`, do nothing
        continue
    if folder in ["other", "_background_noise_"]:
        # If folder is one of the folders that contains noise samples,
        # move it to the `noise` folder
        target_root = DATASET_NOISE_PATH
    else:
        # Otherwise, it should be a speaker folder, then move it to
        # `audio` folder
        target_root = DATASET_AUDIO_PATH
    shutil.move(source_path, os.path.join(target_root, folder))

In [17]:
# Collect every `.wav` file that sits directly inside a sub-directory of the
# noise folder.
noise_paths = []
noise_dirs = []  # sub-directories that were actually scanned
for subdir in os.listdir(DATASET_NOISE_PATH):
    subdir_path = Path(DATASET_NOISE_PATH) / subdir
    if not subdir_path.is_dir():
        # Loose files next to the noise sub-directories are not scanned.
        continue
    noise_dirs.append(subdir_path)
    noise_paths += [
        os.path.join(subdir_path, filename)
        for filename in os.listdir(subdir_path)
        if filename.endswith(".wav")
    ]
if not noise_paths:
    raise RuntimeError(f"Could not find any files at {DATASET_NOISE_PATH}")
# Report the number of scanned directories rather than the number of entries
# in the noise folder, which would also count stray files.
print(
    "Found {} files belonging to {} directories".format(
        len(noise_paths), len(noise_dirs)
    )
)

Found 7 files belonging to 2 directories


In [15]:
import subprocess

# Resample one noise recording to SAMPLING_RATE with ffmpeg (which must be on
# PATH). `load_noise_sample` ignores files recorded at any other rate.
input_file = os.path.join(DATASET_NOISE_PATH, "other", "pink_noise.wav")
output_file = os.path.join(DATASET_NOISE_PATH, "other", "pink_noise_resampled.wav")

# Pass the arguments as a list and run without a shell, so paths containing
# spaces or shell metacharacters reach ffmpeg unchanged on every platform.
# `-loglevel error` (instead of `panic`) keeps the output quiet but still
# reports why a conversion failed.
command = [
    "ffmpeg",
    "-hide_banner",
    "-loglevel", "error",
    "-y",
    "-i", input_file,
    "-ar", str(SAMPLING_RATE),
    output_file,
]

result = subprocess.run(command, capture_output=True, text=True)

# Print the results
print("Standard Output:\n", result.stdout)
print("Error Output:\n", result.stderr)

# Fail loudly: a silent ffmpeg error would otherwise only surface later as
# missing noise samples.
if result.returncode != 0:
    raise RuntimeError(
        "ffmpeg exited with code {} while resampling {}".format(
            result.returncode, input_file
        )
    )

# `noise_paths` was collected before this file existed on a fresh run, so
# register the resampled file here to make it visible to the loading step.
if output_file not in noise_paths:
    noise_paths.append(output_file)



Standard Output:
 
Error Output:
 


In [18]:
def load_noise_sample(path):
    """Decode a noise WAV file and cut it into one-second clips.

    Args:
        path: Path of the WAV file to read.

    Returns:
        A list of tensors, each holding `SAMPLING_RATE` consecutive samples of
        the single-channel waveform. Returns `None` (after printing a message)
        when the file's sampling rate differs from `SAMPLING_RATE` or the
        recording is too short to yield a single clip. Trailing samples that
        do not fill a whole clip are dropped.
    """
    sample, sampling_rate = tf.audio.decode_wav(
        tf.io.read_file(path), desired_channels=1
    )
    print("sampling rate", sampling_rate)
    if sampling_rate != SAMPLING_RATE:
        print("Sampling rate for {} is incorrect. Ignoring it".format(path))
        return None

    # Number of slices of SAMPLING_RATE samples each that fit in the recording
    slices = sample.shape[0] // SAMPLING_RATE
    if slices == 0:
        # `tf.split` cannot be asked for zero pieces, so skip short recordings.
        print("{} is shorter than one second. Ignoring it".format(path))
        return None
    return tf.split(sample[: slices * SAMPLING_RATE], slices)

# Cut every usable noise file into one-second clips and stack them into a
# single tensor.
noise_clips = []
for path in noise_paths:
    sample = load_noise_sample(path)
    if sample:
        noise_clips.extend(sample)

# `tf.stack` fails with an opaque error on an empty list; explain the real
# cause instead (every file was skipped by `load_noise_sample`).
if not noise_clips:
    raise RuntimeError(
        "None of the {} noise files could be used; they must be sampled at "
        "{} Hz".format(len(noise_paths), SAMPLING_RATE)
    )
noises = tf.stack(noise_clips)

print("Number of noise samples:", len(noises))


# print(
#     "{} noise files were split into {} noise samples where each is {} sec. long".format(
#         len(noise_paths), noises.shape[0], noises.shape[1] // SAMPLING_RATE
#     )
# )

sampling rate tf.Tensor(22050, shape=(), dtype=int32)
Sampling rate for data\noise\other\exercise_bike.wav is incorrect. Ignoring it
sampling rate tf.Tensor(22050, shape=(), dtype=int32)
Sampling rate for data\noise\other\pink_noise.wav is incorrect. Ignoring it
sampling rate tf.Tensor(16000, shape=(), dtype=int32)
sampling rate tf.Tensor(44100, shape=(), dtype=int32)
Sampling rate for data\noise\_background_noise_\10convert.com_Audience-Claps_daSG5fwdA7o.wav is incorrect. Ignoring it
sampling rate tf.Tensor(22050, shape=(), dtype=int32)
Sampling rate for data\noise\_background_noise_\doing_the_dishes.wav is incorrect. Ignoring it
sampling rate tf.Tensor(22050, shape=(), dtype=int32)
Sampling rate for data\noise\_background_noise_\dude_miaowing.wav is incorrect. Ignoring it
sampling rate tf.Tensor(22050, shape=(), dtype=int32)
Sampling rate for data\noise\_background_noise_\running_tap.wav is incorrect. Ignoring it
Number of noise samples: 60
