<a href="https://colab.research.google.com/github/kregier/AudioLanguageClassifer/blob/main/GenderClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Identify the gender of the speaker from an audio file.

Split data into train and test sets
For **all** audio files, segment into 10s segments.
For **training** data, copy segments and add random noise.

Load the VGGish model.

Create dataset generators to process the files in batches. The data generator runs the segments through the VGGish model and extract the feature embeddings, which are used as input to the classifier model.




In [1]:
# Set up the environment
!pip install pyAudioAnalysis
!pip install hmmlearn
!pip install eyed3
!pip install pydub
!pip install soundfile

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import IPython.display as ipd
import librosa
import librosa.display

from pyAudioAnalysis import audioSegmentation as aS

import os
import random

import tensorflow as tf
import tensorflow_hub as hub

from keras.layers import Dense
from keras.models import Sequential

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

print("All set up!")

All set up!


In [2]:
# Set up the data import using Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

# Change working directory
%cd /content/gdrive/My Drive/Kaggle
!ls

/content/gdrive/My Drive/Kaggle
augment  kaggle.json	reading-passage.txt  speakers_all.csv
data	 processed.csv	recordings


In [13]:
# Import custom functions that I wrote
import augment

In [5]:
# Set constants
#SAMP_RATE = 16000  #Defined in augment package
#BATCH_SIZE = 32  #Defined in augment package
CLF = 'gender'

In [6]:
meta = pd.read_csv('processed.csv', index_col='speakerid')
meta.head()

Unnamed: 0_level_0,age,age_onset,birthplace,filename,native_language,sex,country,file_missing?
speakerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,27.0,9.0,"virginia, south africa",afrikaans1,afrikaans,female,south africa,False
2,40.0,5.0,"pretoria, south africa",afrikaans2,afrikaans,male,south africa,False
3,25.0,15.0,"diekabo, ivory coast",agni1,agni,male,ivory coast,False
4,19.0,6.0,"prishtina, kosovo",albanian1,albanian,male,kosovo,False
5,33.0,15.0,"tirana, albania",albanian2,albanian,male,albania,False


In [7]:
meta.shape

(2134, 8)

In [8]:
meta.isnull().sum()

age                0
age_onset          0
birthplace         0
filename           0
native_language    0
sex                0
country            0
file_missing?      0
dtype: int64

# Data processing
## Split into training and testing sets

In [9]:
# Split data into training and testing sets for gender analysis
data = meta[['filename','sex']]
x_train_names, x_test_names, y_train, y_test = train_test_split(
    data['filename'], data['sex'], test_size=0.25, random_state=38, 
    stratify=data['sex'])

In [10]:
print("Number of training files: ", x_train_names.shape)
print("Number of testing files: ", x_test_names.shape)

Number of training files:  (1600,)
Number of testing files:  (534,)


## Segment the audio files into 10s segments

In [14]:
x_train_seg = segment_data(x_train_names, y_train, split='train', clf=CLF)
print(len(x_train_seg))

NameError: ignored

In [None]:
x_test_seg = segment_data(x_test_names, y_test, split='test', clf=CLF)
print(len(x_test_seg))

## Add noise to segments in training set
Not necesary for testing set

In [None]:
noisy_data(x_train_seg, split='train', clf=CLF)

## Format input lists for generator

In [None]:
x_train_filenames = os.listdir('.data/gender/train')
print(x_train_filenames[:5])

x_train_filepaths = ['./data/gender/train/{}'.format(i) for i in x_train_filenames]
print(x_train_filepaths[:5])
print(len(x_train_filepaths))

In [None]:
x_test_filenames = os.listdir('./data/gender/test')
print(x_test_filenames)

x_test_filepaths = ['./data/gender/test/{}'.format(i) for i in x_test_filenames]
print(x_test_filepaths[:5])
print(len(x_test_filepaths))

## Load VGGish model
Generate a dataset to check the funtionality of the generator before applying to the larger dataset

In [None]:
# Using a SavedModel from the TFHub in Keras
# https://www.tensorflow.org/hub/tf2_saved_model
# VGGish model, from https://tfhub.dev/google/vggish/1

# Link to the model on TFHub
hub_url = 'https://tfhub.dev/google/vggish/1'

# Load the model as a Keras model
vggish_model = hub.KerasLayer(hub_url)
vggish_model.trainable = False

In [None]:
dataset_check = tf.data.Dataset.from_generator(tf_data_generator, 
                                         args = [x_train_filepaths[:2*BATCH_SIZE], BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) )

In [None]:
for data, labels in dataset_check.take(2):
  print(data.shape)
  print(labels)

## Generate training, validation and testing datasets

In [None]:
x_train, x_val = train_test_split(x_train_filepaths, test_size=.25, random_state=38)

In [None]:
# Print sizes of data splits
print("Number of training samples: ", len(x_train))
print("Number of training samples: ", len(x_val))
print("Number of training samples: ", len(x_test_seg))

In [None]:
train_dataset = tf.data.Dataset.from_generator(tf_data_generator, 
                                         args = [x_train, BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) ) 
validation_dataset = tf.data.Dataset.from_generator(tf_data_generator, 
                                         args = [x_val, BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) )
test_dataset = tf.data.Dataset.from_generator(tf_data_generator, 
                                         args = [x_test_filepaths, BATCH_SIZE],
                                         output_types=(tf.float32, tf.float32),
                                         output_shapes= ((None, 10, 128),(None,)) ) 

# Build and compile the classifier model

In [None]:
genderClf = tf.keras.models.Sequential([tf.keras.layers.Dense(128, activation = 'relu'),
                              tf.keras.layers.Dense(64, activation = 'relu'),
                              tf.keras.layers.Dense(1, activation='sigmoid'),
                              tf.keras.layers.AveragePooling1D(pool_size=10, strides=None, padding="valid", data_format="channels_last")
                              ])
genderClf.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])

In [None]:
genderClf.summary()

In [None]:
# Add early stopping to train classifier model; default is 10 epochs
from tensorflow.keras.callbacks import EarlyStopping
early_stopping_monitor = EarlyStopping(patience=2)

**Important:**

Before fitting model, specify the number of epochs and stept to fit, to avoid infinite looping of the generators.

In [None]:
# Calculate how many dataset batches to generate, since generator is infinite
steps_per_epoch = np.int(np.ceil(len(x_train)/BATCH_SIZE))
val_steps = np.int(np.ceil(len(x_val)/BATCH_SIZE))
eval_steps = np.int(np.ceil(len(x_test_filepaths)/BATCH_SIZE))

print("steps_per_epoch = ", steps_per_epoch)
print("validation_steps = ", val_steps)
print("steps = ", eval_steps)

In [None]:
# Fit the classifier
history = genderClf.fit(train_dataset,
                        steps_per_epoch=steps_per_epoch,
                        epochs=20,
                        validation_data=validation_dataset,
                        validation_steps = val_steps,
                        callbacks=[early_stopping_monitor], 
                        batch_size=BATCH_SIZE)

In [None]:
plt.plot(history.history['accuracy'])
#plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
plt.plot(history.history['loss'])
#plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

## Evaluate the trained classifier

In [None]:
test_loss, test_acc = genderClf.evaluate(test_dataset, steps=eval_steps)

In [None]:
y_pred = genderClf.predict(test_dataset, steps=eval_steps)

In [None]:
print(y_pred.shape)
# Probably need to reshape to format for classification report
#y_pred = y_pred [:, 0, 0]
#print(y_pred.shape)
#print(y_pred)

In [None]:
gen_pred  = []
for i in y_pred:
  if i < 0.5:
    gen_pred.append(0)
  else: gen_pred.append(1)

print(y_pred_int)

In [None]:
# Get 1D array of labels from test_dataset
y_lab = np.concatenate([y for x, y in x_test_dataset], axis=0)
print(len(y_lab))
print(y_lab[10])

In [None]:
tf.math.confusion_matrix(y_lab, gen_pred)

In [None]:
classification_report(y_lab, gen_pred)

In [None]:
confusion_matrix(y_lab, gen_pred)