<a href="https://colab.research.google.com/github/kregier/AudioLanguageClassifer/blob/main/PreprocessAudio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [96]:
# Set up the environment
!pip install soundfile

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import IPython.display as ipd
import librosa
import librosa.display
import soundfile as sf

import os
import random
import re

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

from keras.layers import Dense
from keras.models import Sequential

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

print("All set up!")

All set up!


In [97]:
# Set up the data import using Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [98]:
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

# Change working directory
%cd /content/gdrive/My Drive/Kaggle
!ls

/content/gdrive/My Drive/Kaggle
data  kaggle.json  reading-passage.txt	recordings  speakers_all.csv


In [99]:
meta = pd.read_csv('speakers_all.csv')
# Prepare the data based on previous exploration
# Drop 3 end columns with NaN values
meta.drop(['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1, inplace=True)

# Set speakerid as index
meta.set_index('speakerid', inplace=True)
meta.sort_index(inplace=True)

# Replace missing values and typos
meta.loc[meta.country.isnull(), 'country'] = 'laos'
type_idx = meta[meta.sex =='famale'].index
meta.loc[type_idx, 'sex'] = 'female'

# Delete records with missing audio files
missingIdx = meta[meta['file_missing?']==True].index
meta.drop(missingIdx, inplace=True )

# Delete records with no birthplace - synthesized files
meta.dropna(subset=['birthplace'], inplace=True)

# Delete files not present in audiofiles database
nica_index = meta[meta.filename == 'nicaragua'].index
sinhalese_index = meta[meta.filename=='sinhalese1'].index
meta.drop(nica_index, inplace=True, axis=0)
meta.drop(sinhalese_index, inplace=True, axis=0)

meta.head()

Unnamed: 0_level_0,age,age_onset,birthplace,filename,native_language,sex,country,file_missing?
speakerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,27.0,9.0,"virginia, south africa",afrikaans1,afrikaans,female,south africa,False
2,40.0,5.0,"pretoria, south africa",afrikaans2,afrikaans,male,south africa,False
3,25.0,15.0,"diekabo, ivory coast",agni1,agni,male,ivory coast,False
4,19.0,6.0,"prishtina, kosovo",albanian1,albanian,male,kosovo,False
5,33.0,15.0,"tirana, albania",albanian2,albanian,male,albania,False


In [100]:
# Set constants
SAMP_RATE = 16000
BATCH_SIZE = 6

In [101]:
# Split data into training and testing sets for gender analysis
data = meta[['sex', 'filename']]
x_train_names, x_test_names, y_train, y_test = train_test_split(data['filename'], 
                                                                data['sex'], 
                                                                test_size=0.25, 
                                                                random_state=38, 
                                                                stratify=data['sex'])
print(x_train_names.shape)

(1600,)


In [102]:
print(x_test_names.shape)

(534,)


Since I can't hold all of the sound files and their augmentations (segmetation, noise addition and VGGish embedding) in memory, I going to write each segement and noise to file, so they can be loaded one by one. I can't figure out how to make this work with tf.Data, since all of the examples use data that is available through TF.

In [103]:
# Scale audio to fall between [-1, 1]
def normalize(audio):
  norm = audio/max(audio)
  return norm

In [104]:
def segment_10s(audio, sr):
  """ Load an audio file and divide into 10 second segments.
  Arguments: audio - the audio file; sr = sampling rate of the file
  Returns: a dictionary of the audio segments. Key is the index of segment, value is the segment.
  """
  seg_files ={}
  n_seg = int((len(audio)/sr)/10)
  for i in range(n_seg):
    segment = audio[10*i*sr:(i+1)*10*sr]
    seg_files[i] = segment
  return seg_files

1. Read each audio file in x_train or x_test
2. Segment the audio file into 10s segments
3. Save each segment to file by appending segment index to filename.

In [105]:
def segment_data(x_names, y_names, split='train', clf='gender'):
  seg_names = []

  for i in range(len(x_names)): #df = x_train_names, x_test_names
    filename = x_names.iloc[i]
    filepath = 'recordings/recordings/' + filename + '.mp3'
    audio, sr = librosa.load(filepath, sr=16000)
    audio = normalize(audio)

    # Add gender label to filename, for later processing
    sex = y_names.iloc[i]
    if sex == "female":
      filename = '{}.F'.format(filename)
    else: filename = '{}.M'.format(filename)

    # Segment audio file
    seg_files = segment_10s(audio, SAMP_RATE)
    for key, val in seg_files.items():
      new_name = '{}.{}'.format(filename, key)
      sf.write('data/{}/{}/{}o.wav'.format(clf, split, new_name), val, SAMP_RATE)
      seg_names.append(new_name)
    # end filename
  return seg_names

In [106]:
x_train_seg = segment_data(x_train_names[:10], y_train, split='train', clf='gender')
print(len(x_train_seg))

19


In [107]:
x_test_seg = segment_data(x_test_names[:10], y_test, split='test', clf='gender')
print(len(x_test_seg))

24


Add noise
1. Read each audio file (10s segments)
2. Add random noise
3. Save noisy segement to file by appending noise to filename

In [108]:
def add_noise(audio):
    '''
    Add random noise to an audio file.
    Arguments: audio - the audio file
    Returns: the noisy audio file
    ''' 
    # Load random number generator
    rng = np.random.default_rng()
    # Generate random noise
    noise = rng.standard_normal(len(audio))
    # Add noise to file
    noisy_seg = audio + 0.005*noise

    return noisy_seg

In [109]:
def noisy_data(x_names, split='train', clf='gender'):
  for i in range(len(x_names)): #list of seg_names
    filename = x_names[i]
    filepath = 'data/{}/{}/{}o.wav'.format(clf, split, filename)
    audio, sr = librosa.load(filepath, sr=16000)
 #   audio = normalize(audio) #Already done when originally segmented
    # Add noise
    noisy = add_noise(audio)
    # Write noise to file
    sf.write('data/{}/{}/{}n.wav'.format(clf, split, filename), noisy, SAMP_RATE)
    print("Noise added to {}".format(x_names[i]))

In [110]:
# Generate noisy samples
noisy_data(x_train_seg[:15], split='train', clf='gender')

Noise added to english188.M.0
Noise added to english188.M.1
Noise added to english413.F.0
Noise added to english413.F.1
Noise added to italian28.M.0
Noise added to italian28.M.1
Noise added to xiang3.M.0
Noise added to xiang3.M.1
Noise added to xiang3.M.2
Noise added to english529.F.0
Noise added to french54.F.0
Noise added to french54.F.1
Noise added to english263.M.0
Noise added to english263.M.1
Noise added to swedish1.F.0


In [111]:
# No need to add noise to test files!
#noisy_data(x_test_seg[:10], split='test', clf='gender')

In [112]:
!ls data/gender/train/

english188.M.0n.wav  english413.F.0n.wav  french54.F.1o.wav   swedish1.F.1o.wav
english188.M.0o.wav  english413.F.0o.wav  italian28.M.0n.wav  xiang3.M.0n.wav
english188.M.1n.wav  english413.F.1n.wav  italian28.M.0o.wav  xiang3.M.0o.wav
english188.M.1o.wav  english413.F.1o.wav  italian28.M.1n.wav  xiang3.M.1n.wav
english263.M.0n.wav  english529.F.0n.wav  italian28.M.1o.wav  xiang3.M.1o.wav
english263.M.0o.wav  english529.F.0o.wav  serbian7.F.0o.wav   xiang3.M.2n.wav
english263.M.1n.wav  french54.F.0n.wav	  serbian7.F.1o.wav   xiang3.M.2o.wav
english263.M.1o.wav  french54.F.0o.wav	  swedish1.F.0n.wav
english272.M.0o.wav  french54.F.1n.wav	  swedish1.F.0o.wav


In [113]:
!ls data/gender/test/

arabic48.M.0o.wav    english579.M.1o.wav  romanian5.F.1o.wav
arabic48.M.1o.wav    estonian1.M.0o.wav   vietnamese15.M.0o.wav
arabic48.M.2o.wav    estonian1.M.1o.wav   vietnamese15.M.1o.wav
english122.F.0o.wav  filipino1.M.0o.wav   vietnamese15.M.2o.wav
english122.F.1o.wav  japanese5.F.0o.wav   vietnamese15.M.3o.wav
english143.M.0o.wav  japanese5.F.1o.wav   wolof6.M.0o.wav
english143.M.1o.wav  japanese5.F.2o.wav   wolof6.M.1o.wav
english579.M.0o.wav  romanian5.F.0o.wav   wolof6.M.2o.wav


Convert to VGGish embedding
1. Load VGGish model
1. Read each audio file (10s segments with and without noise)
1. Run through VGGish model
1. Save embedding by appending _embed to filename

In [121]:
# Using a SavedModel from the TFHub in Keras
# https://www.tensorflow.org/hub/tf2_saved_model
# VGGish model, from https://tfhub.dev/google/vggish/1

# Link to the model on TFHub
hub_url = 'https://tfhub.dev/google/vggish/1'

# Load the model as a Keras model
vggish_model = hub.KerasLayer(hub_url)
vggish_model.trainable = False

# Load the model as a tf.function
vggish_fn = hub.load(hub_url)

In [115]:
x_train_filenames = os.listdir('./data/gender/train')
print(x_train_filenames)

x_train_filepaths = ['./data/gender/train/{}'.format(i) for i in x_train_filenames]
print(len(x_train_filepaths))

['english188.M.0o.wav', 'english188.M.1o.wav', 'english413.F.0o.wav', 'english413.F.1o.wav', 'italian28.M.0o.wav', 'italian28.M.1o.wav', 'xiang3.M.0o.wav', 'xiang3.M.1o.wav', 'xiang3.M.2o.wav', 'english529.F.0o.wav', 'french54.F.0o.wav', 'french54.F.1o.wav', 'english263.M.0o.wav', 'english263.M.1o.wav', 'swedish1.F.0o.wav', 'swedish1.F.1o.wav', 'english272.M.0o.wav', 'serbian7.F.0o.wav', 'serbian7.F.1o.wav', 'english188.M.0n.wav', 'english188.M.1n.wav', 'english413.F.0n.wav', 'english413.F.1n.wav', 'italian28.M.0n.wav', 'italian28.M.1n.wav', 'xiang3.M.0n.wav', 'xiang3.M.1n.wav', 'xiang3.M.2n.wav', 'english529.F.0n.wav', 'french54.F.0n.wav', 'french54.F.1n.wav', 'english263.M.0n.wav', 'english263.M.1n.wav', 'swedish1.F.0n.wav']
34


In [116]:
x_test_filenames = os.listdir('./data/gender/test')
print(x_test_filenames)

x_test_filepaths = ['./data/gender/test/{}'.format(i) for i in x_test_filenames]
print(len(x_train_filepaths))

['wolof6.M.0o.wav', 'wolof6.M.1o.wav', 'wolof6.M.2o.wav', 'estonian1.M.0o.wav', 'estonian1.M.1o.wav', 'romanian5.F.0o.wav', 'romanian5.F.1o.wav', 'english122.F.0o.wav', 'english122.F.1o.wav', 'vietnamese15.M.0o.wav', 'vietnamese15.M.1o.wav', 'vietnamese15.M.2o.wav', 'vietnamese15.M.3o.wav', 'arabic48.M.0o.wav', 'arabic48.M.1o.wav', 'arabic48.M.2o.wav', 'english579.M.0o.wav', 'english579.M.1o.wav', 'english143.M.0o.wav', 'english143.M.1o.wav', 'filipino1.M.0o.wav', 'japanese5.F.0o.wav', 'japanese5.F.1o.wav', 'japanese5.F.2o.wav']
34


# Define transformation function and dataset generator

Adapted from https://biswajitsahoo1111.github.io/post/efficiently-reading-multiple-files-in-tensorflow-2/

In [122]:
def vggish_transform(audio):
  return vggish_fn(audio)

In [127]:
# Write a generator to read data in chunks and process it
# Generator yields both data and labels
# Takes a list of filenames as first argument, batch_size as second argument

#https://biswajitsahoo1111.github.io/post/efficiently-reading-multiple-files-in-tensorflow-2/

def tf_data_generator(file_list, batch_size=32):
  i = 0
  while True: #infinite loop
    if i*batch_size >= len(file_list):
      i=0
      np.random.shuffle(file_list)
    else:
      file_chunk = file_list[i*batch_size:(i+1)*batch_size]
      data = []
      labels = []
      label_classes = tf.constant(['M', 'F'])
      for file in file_chunk:
        # Read data
        audio, sr = librosa.load(file, sr=16000)
        # Apply transformations
        embed = vggish_model(audio)
        data.append(embed)
        #data.append(audio)
        # Extract labels from filename
        bytes_string = file
        string_name = str(bytes_string, 'utf-8')
        split_str = string_name.split('.')
        pattern = tf.constant(split_str[2])
        for j in range(len(label_classes)):
          if re.match(pattern.numpy(), label_classes[j].numpy()):
            labels.append(j)

      data = np.asarray(data)
      labels = np.asarray(labels)

      # To be able t prefecth the data you can use the mpa function, 
      # but this doesn't work for VGGish, sincc VGGish only processes one file at a time
      #first_dim = data.shape[0]
      ## Create tensorflow dataset to use 'map' function for parallelization
      #data_ds = tf.data.Dataset.from_tensor_slices(data)
      #data_ds = data_ds.batch(batch_size = first_dim).map(vggish_transform,
                                                         # num_parallel_calls = tf.data.experimental.AUTOTUNE)
      # Convert dataset to generator and subsequently to np array
      #data_ds = tfds.as_numpy(data_ds)
      #data = np.array([data for data in data_ds]).reshape(first_dim, 10, 128)

      yield data, labels
      i += 1

In [128]:
dataset_check = tf.data.Dataset.from_generator(tf_data_generator, 
                                         args = [x_train_filepaths[:12], BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) )

In [129]:
for data, labels in dataset_check.take(2):
  print(data.shape)
  print(labels)

(6, 10, 128)
tf.Tensor([0. 0. 1. 1. 0. 0.], shape=(6,), dtype=float32)
(6, 10, 128)
tf.Tensor([0. 0. 0. 1. 1. 1.], shape=(6,), dtype=float32)


# Create pipeline and model

In [130]:
x_train, x_val = train_test_split(x_train_filepaths, test_size=.25, random_state=38)

In [131]:
# Print sizes of data splits
print("Number of training samples: ", len(x_train))
print("Number of training samples: ", len(x_val))
print("Number of training samples: ", len(x_test_seg))

Number of training samples:  25
Number of training samples:  9
Number of training samples:  24


In [132]:
print(BATCH_SIZE)

6


In [134]:
train_dataset = tf.data.Dataset.from_generator(tf_data_generator, 
                                         args = [x_train, BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) ) 
validation_dataset = tf.data.Dataset.from_generator(tf_data_generator, 
                                         args = [x_val, BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) )
test_dataset = tf.data.Dataset.from_generator(tf_data_generator, 
                                         args = [x_test_filepaths, BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) ) 

In [142]:
# Check structure of datasets, with the goal of extracting the labels

# Look at each type of element component
test_dataset.element_spec

(TensorSpec(shape=(None, 10, 128), dtype=tf.float32, name=None),
 TensorSpec(shape=(None,), dtype=tf.float32, name=None))

In [147]:
y_test_labels = []
for data, labels in test_dataset.take(eval_steps):
  y_test_labels.append(labels.numpy())

print(type(y_test_labels))
print(len(y_test_labels))
print(y_test_labels)

<class 'list'>
4


In [None]:
#I don't think this will work, since I can't get VGGish to fuction on multiple files at a time
# Prefetch datasets = prepare next batch with CPU while GPU trains on previous batch
#train_dataset = train_dataset.prefetch(buffer_size = tf.data.experimental.AUTOTUNE)
#validation_dataset = validation_dataset.prefetch(buffer_size = tf.data.experimental.AUTOTUNE)

# Build and compile the model
Copy from other notebooks

In [135]:
genderClf = tf.keras.models.Sequential([tf.keras.layers.Dense(128, activation = 'relu'),
                              tf.keras.layers.Dense(64, activation = 'relu'),
                              tf.keras.layers.Dense(1, activation='sigmoid'),
                              tf.keras.layers.AveragePooling1D(pool_size=10, strides=None, padding="valid", data_format="channels_last")
                              ])
genderClf.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])

In [136]:
# Add early stopping to train classifier model; default is 10 epochs
from tensorflow.keras.callbacks import EarlyStopping
early_stopping_monitor = EarlyStopping(patience=2)

*Important* before fitting model, specify number of epochs and steps to fit, since generators are infinite loops.


In [137]:
# Calculate how many dataset batches to generate, since generator is infinite
steps_per_epoch = np.int(np.ceil(len(x_train)/BATCH_SIZE))
val_steps = np.int(np.ceil(len(x_val)/BATCH_SIZE))
eval_steps = np.int(np.ceil(len(x_test_filepaths)/BATCH_SIZE))

print("steps_per_epoch = ", steps_per_epoch)
print("validation_steps = ", val_steps)
print("steps = ", eval_steps)

steps_per_epoch =  5
validation_steps =  2
steps =  4


In [138]:
#model.fit(train_dataset, validation_data = validation_dataset, steps_per_epoch = steps_per_epoch,
#         validation_steps = val_steps, epochs = 5)

history = genderClf.fit(train_dataset,
                        steps_per_epoch=steps_per_epoch,
                        epochs=20,
                        validation_data=validation_dataset,
                        validation_steps = val_steps,
                        callbacks=[early_stopping_monitor], 
                        batch_size=BATCH_SIZE)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


In [141]:
# Evaluate model?
test_loss, test_acc = genderClf.evaluate(test_dataset, steps=eval_steps)
#print(test_loss)
#print(test_acc)



In [149]:
# Making predictions
y_pred = genderClf.predict(test_dataset, steps=eval_steps)

In [153]:
print(y_pred.shape)
y_pred = y_pred [:, 0, 0]
print(y_pred.shape)
print(y_pred)

(24,)


IndexError: ignored

In [155]:
y_pred_int  = []
for i in y_pred:
  if i < 0.5:
    y_pred_int.append(0)
  else: y_pred_int.append(1)

print(y_pred_int)

[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]


https://www.tensorflow.org/api_docs/python/tf/data/Dataset

To create a dataset of all files matching a pattern, use tf.data.Dataset.list_files:



In [None]:
#dataset = tf.data.Dataset.list_files("/path/*.txt")  # doctest: +SKIP

batch
View source

batch(
    batch_size, drop_remainder=False
)

Combines consecutive elements of this dataset into batches.

dataset = tf.data.Dataset.range(8)
dataset = dataset.batch(3)
list(dataset.as_numpy_iterator())


dataset = tf.data.Dataset.range(8)
dataset = dataset.batch(3, drop_remainder=True)
list(dataset.as_numpy_iterator())


The components of the resulting element will have an additional outer dimension, which will be batch_size (or N % batch_size for the last element if batch_size does not divide the number of input elements N evenly and drop_remainder is False). If your program depends on the batches having the same outer dimension, you should set the

https://www.tensorflow.org/tutorials/audio/simple_audio

Simple audio recognition: Recognizing keywords