<a href="https://colab.research.google.com/github/kregier/AudioLanguageClassifer/blob/main/GenderClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Identify the gender of the speaker from an audio file.

Split data into train and test sets
For **all** audio files, segment into 10s segments.
For **training** data, copy segments and add random noise.

Load the VGGish model.

Create dataset generators to process the files in batches. The data generator runs the segments through the VGGish model and extract the feature embeddings, which are used as input to the classifier model.




In [1]:
# Set up the environment
!pip install pyAudioAnalysis
!pip install hmmlearn
!pip install eyed3
!pip install pydub
!pip install soundfile

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import IPython.display as ipd
import librosa
#import librosa.display

from pyAudioAnalysis import audioSegmentation as aS

import os
import random

import tensorflow as tf
import tensorflow_hub as hub

from keras.layers import Dense
from keras.models import Sequential

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

print("All set up!")

Collecting pyAudioAnalysis
[?25l  Downloading https://files.pythonhosted.org/packages/71/42/09adc0229b78dc514004ecf83508afa36a998502a36a4ebdacc14ae55fcf/pyAudioAnalysis-0.3.6.tar.gz (52.4MB)
[K     |████████████████████████████████| 52.4MB 81kB/s 
[?25hBuilding wheels for collected packages: pyAudioAnalysis
  Building wheel for pyAudioAnalysis (setup.py) ... [?25l[?25hdone
  Created wheel for pyAudioAnalysis: filename=pyAudioAnalysis-0.3.6-cp36-none-any.whl size=52589856 sha256=8a3f048223e3c58af726ac6780b727c0231ce66748d9bb13ce5d4e6cb2091d49
  Stored in directory: /root/.cache/pip/wheels/fd/74/c2/361da76b03ed9d45c1b606d8fd25ac53ab965f754061fc4805
Successfully built pyAudioAnalysis
Installing collected packages: pyAudioAnalysis
Successfully installed pyAudioAnalysis-0.3.6
Collecting hmmlearn
[?25l  Downloading https://files.pythonhosted.org/packages/b3/49/9e9a89cee24b26ef6afec5abbd5eb9cf14632855f32b999389873ecb1b4e/hmmlearn-0.2.4-cp36-cp36m-manylinux1_x86_64.whl (361kB)
[K     |█

In [2]:
# Set up the data import using Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

# Change working directory
%cd /content/gdrive/My Drive/Kaggle
!ls

/content/gdrive/My Drive/Kaggle
augment  kaggle.json	reading-passage.txt  speakers_all.csv
data	 processed.csv	recordings


In [80]:
# Import custom functions that I wrote
import augment
from augment import Augment

from imp import reload
reload(augment)
reload(augment.Augment)

Module imported
Augment scripts reloaded


<module 'augment.Augment' from '/content/gdrive/My Drive/Kaggle/augment/Augment.py'>

In [125]:
# Set constants
SAMP_RATE = 16000  #Defined in augment package
BATCH_SIZE = 32  #Defined in augment package
CLF = 'gender'

In [62]:
meta = pd.read_csv('processed.csv', index_col='speakerid')
meta.head()

Unnamed: 0_level_0,age,age_onset,birthplace,filename,native_language,sex,country,file_missing?
speakerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,27.0,9.0,"virginia, south africa",afrikaans1,afrikaans,female,south africa,False
2,40.0,5.0,"pretoria, south africa",afrikaans2,afrikaans,male,south africa,False
3,25.0,15.0,"diekabo, ivory coast",agni1,agni,male,ivory coast,False
4,19.0,6.0,"prishtina, kosovo",albanian1,albanian,male,kosovo,False
5,33.0,15.0,"tirana, albania",albanian2,albanian,male,albania,False


In [63]:
meta.shape

(2134, 8)

In [64]:
meta.isnull().sum()

age                0
age_onset          0
birthplace         0
filename           0
native_language    0
sex                0
country            0
file_missing?      0
dtype: int64

# Data processing
## Split into training and testing sets

In [65]:
# Split data into training and testing sets for gender analysis
data = meta[['filename','sex']]
x_train_names, x_test_names, y_train, y_test = train_test_split(
    data['filename'], data['sex'], test_size=0.25, random_state=38, 
    stratify=data['sex'])

In [66]:
print("Number of training files: ", x_train_names.shape)
print("Number of testing files: ", x_test_names.shape)

Number of training files:  (1600,)
Number of testing files:  (534,)


## Segment the audio files into 10s segments
This takes a bit of time, but should only need to be done once.

In [85]:
# Check if training data has been segmented. If not, segment each audio file.
for i in range(len(x_train_names)):
  # get a filename
  filename = x_train_names.iloc[i]
  # Check to see if the filename has already been segmented
  if any(file.startswith(filename) for file in os.listdir('data/gender/train')):
    pass
  else: 
    augment.Augment.segment_audio(x_train_names.iloc[i], y_train.iloc[i], split='train', clf=CLF)
    print('{} segmented'.format(filename))

In [88]:
# Check if testing data has been segmented. If not, segment each audio file.
for i in range(len(x_test_names)):
  filename = x_test_names.iloc[i]
  if any(file.startswith(filename) for file in os.listdir('data/gender/test')):
    pass
  else: 
    augment.Augment.segment_audio(x_test_names.iloc[i], y_test.iloc[i], split='test', clf=CLF)
    print('{} segmented'.format(filename))


In [89]:
# Generate a list training filenames + segment index to input to add_noise() function
x_train_seg = [x.split('o.wav')[0] for x in os.listdir('data/gender/train') if x.endswith('o.wav')]
print(len(x_train_seg))

['english188.M.0', 'english413.F.1', 'english413.F.0', 'english188.M.1', 'italian28.M.1', 'italian28.M.0', 'xiang3.M.0', 'xiang3.M.2', 'xiang3.M.1', 'english529.F.0', 'french54.F.1', 'french54.F.0', 'english263.M.0', 'english263.M.1', 'swedish1.F.1', 'swedish1.F.0', 'english272.M.0', 'serbian7.F.0', 'serbian7.F.1', 'russian18.M.0', 'russian18.M.1', 'english257.M.0', 'english257.M.1', 'english41.M.0', 'english41.M.1']
3707


In [90]:
# Generate a list testing filenames + segment index
x_test_seg = [x.split('o.wav')[0] for x in os.listdir('data/gender/test') if x.endswith('o.wav')]
print(len(x_test_seg))

['wolof6.M.2', 'estonian1.M.1', 'estonian1.M.0', 'wolof6.M.1', 'wolof6.M.0', 'romanian5.F.0', 'romanian5.F.1', 'english122.F.0', 'english122.F.1', 'vietnamese15.M.1', 'vietnamese15.M.2', 'vietnamese15.M.0', 'vietnamese15.M.3', 'arabic48.M.0', 'arabic48.M.2', 'arabic48.M.1', 'english579.M.0', 'english579.M.1', 'english143.M.1', 'english143.M.0', 'filipino1.M.0', 'japanese5.F.2', 'japanese5.F.0', 'japanese5.F.1', 'english249.F.0']
1206


## Add noise to segments in training set
Not necesary for testing set

In [94]:
# Check if training data has been augmented with noise. If not, add noise to each segment.
for i in range(len(x_train_seg)):
  filename = x_train_seg[i]
  if any((file.startswith(filename)& file.endswith('n.wav')) for file in os.listdir('data/gender/train')):
    pass
  else: 
    augment.Augment.noisy_data(x_train_seg[i], split='train', clf=CLF)
    print('{} augmented'.format(filename))

In [95]:
# Verify there are equal numbers for original segments and noisy segments.
x_train_noise = [x.split('n.wav')[0] for x in os.listdir('data/gender/train') if x.endswith('n.wav')]
print(len(x_train_seg) == len(x_train_noise))

True


## Format input lists for generator

In [97]:
x_train_filenames = os.listdir('data/gender/train')
#print(x_train_filenames[:5])

x_train_filepaths = ['data/gender/train/{}'.format(i) for i in x_train_filenames]
#print(x_train_filepaths[:5])
print(len(x_train_filepaths))

['english188.M.0o.wav', 'english413.F.1o.wav', 'english413.F.0o.wav', 'english188.M.1o.wav', 'italian28.M.1o.wav']
['data/gender/train/english188.M.0o.wav', 'data/gender/train/english413.F.1o.wav', 'data/gender/train/english413.F.0o.wav', 'data/gender/train/english188.M.1o.wav', 'data/gender/train/italian28.M.1o.wav']
7414


In [99]:
x_test_filenames = os.listdir('data/gender/test')
#print(x_test_filenames)

x_test_filepaths = ['data/gender/test/{}'.format(i) for i in x_test_filenames]
#print(x_test_filepaths[:5])
print(len(x_test_filepaths))

1206


## Load VGGish model
Generate a dataset to check the funtionality of the generator before applying to the larger dataset

In [100]:
# Using a SavedModel from the TFHub in Keras
# https://www.tensorflow.org/hub/tf2_saved_model
# VGGish model, from https://tfhub.dev/google/vggish/1

# Link to the model on TFHub
hub_url = 'https://tfhub.dev/google/vggish/1'

# Load the model as a Keras model
vggish_model = hub.KerasLayer(hub_url)
vggish_model.trainable = False

In [153]:
# Run one file through the model to get output shape
import librosa
audio, sr = librosa.load(x_train_filepaths[0], SAMP_RATE)
sample = vggish_model(audio)
print(sample.shape)

(10, 128)


In [135]:
reload(augment.TFGenerator)

<module 'augment.TFGenerator' from '/content/gdrive/My Drive/Kaggle/augment/TFGenerator.py'>

In [136]:
from augment.TFGenerator import tf_data_generator

dataset_check = tf.data.Dataset.from_generator(tf_data_generator,
                                         args = [x_train_filepaths[:2*BATCH_SIZE], BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) )

In [137]:
for data, labels in dataset_check.take(2):
  print(data.shape)
  print(labels)

(32, 10, 128)
tf.Tensor([], shape=(0,), dtype=float32)
(32, 10, 128)
tf.Tensor([], shape=(0,), dtype=float32)


## Generate training, validation and testing datasets

In [138]:
x_train, x_val = train_test_split(x_train_filepaths, test_size=.25, random_state=38)

In [139]:
# Print sizes of data splits
print("Number of training samples: ", len(x_train))
print("Number of training samples: ", len(x_val))
print("Number of training samples: ", len(x_test_seg))

Number of training samples:  5560
Number of training samples:  1854
Number of training samples:  1206


In [140]:
train_dataset = tf.data.Dataset.from_generator(tf_data_generator, 
                                         args = [x_train, BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) ) 
validation_dataset = tf.data.Dataset.from_generator(tf_data_generator, 
                                         args = [x_val, BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) )
test_dataset = tf.data.Dataset.from_generator(tf_data_generator, 
                                         args = [x_test_filepaths, BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) ) 

# Build and compile the classifier model

In [159]:
genderClf = tf.keras.models.Sequential([tf.keras.layers.Dense(128, activation = 'relu'), #, input_shape=(10, 128)),
                              tf.keras.layers.Dense(64, activation = 'relu'),
                              tf.keras.layers.Dense(1, activation='sigmoid'),
                              tf.keras.layers.AveragePooling1D(pool_size=10, strides=None, padding="valid", data_format="channels_last")
                              ])
genderClf.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])

In [160]:
# Add early stopping to train classifier model; default is 10 epochs
from tensorflow.keras.callbacks import EarlyStopping
early_stopping_monitor = EarlyStopping(patience=2)

**Important:**

Before fitting model, specify the number of epochs and stept to fit, to avoid infinite looping of the generators.

In [161]:
# Calculate how many dataset batches to generate, since generator is infinite
steps_per_epoch = np.int(np.ceil(len(x_train)/BATCH_SIZE))
val_steps = np.int(np.ceil(len(x_val)/BATCH_SIZE))
eval_steps = np.int(np.ceil(len(x_test_filepaths)/BATCH_SIZE))

print("steps_per_epoch = ", steps_per_epoch)
print("validation_steps = ", val_steps)
print("evaluation_steps = ", eval_steps)

steps_per_epoch =  174
validation_steps =  58
evaluation_steps =  38


In [162]:
# Fit the classifier
history = genderClf.fit(train_dataset,
                        steps_per_epoch=steps_per_epoch,
                        epochs=5,
                        validation_data=validation_dataset,
                        validation_steps = val_steps,
                        callbacks=[early_stopping_monitor]) #, batch_size=BATCH_SIZE)

Epoch 1/5


InvalidArgumentError: ignored

In [163]:
genderClf.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 10, 128)           16512     
_________________________________________________________________
dense_10 (Dense)             (None, 10, 64)            8256      
_________________________________________________________________
dense_11 (Dense)             (None, 10, 1)             65        
_________________________________________________________________
average_pooling1d_3 (Average (None, 1, 1)              0         
Total params: 24,833
Trainable params: 24,833
Non-trainable params: 0
_________________________________________________________________


In [None]:
plt.plot(history.history['accuracy'])
#plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
plt.plot(history.history['loss'])
#plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

## Evaluate the trained classifier

In [None]:
test_loss, test_acc = genderClf.evaluate(test_dataset, steps=eval_steps)

In [None]:
y_pred = genderClf.predict(test_dataset, steps=eval_steps)

In [None]:
print(y_pred.shape)
# Probably need to reshape to format for classification report
#y_pred = y_pred [:, 0, 0]
#print(y_pred.shape)
#print(y_pred)

In [None]:
gen_pred  = []
for i in y_pred:
  if i < 0.5:
    gen_pred.append(0)
  else: gen_pred.append(1)

print(y_pred_int)

In [None]:
# Get 1D array of labels from test_dataset
y_lab = np.concatenate([y for x, y in x_test_dataset], axis=0)
print(len(y_lab))
print(y_lab[10])

In [None]:
tf.math.confusion_matrix(y_lab, gen_pred)

In [None]:
classification_report(y_lab, gen_pred)

In [None]:
confusion_matrix(y_lab, gen_pred)