<a href="https://colab.research.google.com/github/kregier/AudioLanguageClassifer/blob/main/SampleModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# What is this notebook?
This is a notebook to load a few audio files and load the VGGish model. The idea is to make sure the model loads and runs before moving it to the larger notebook.

In [1]:
# Set up the environment
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import IPython.display as ipd
import librosa
import librosa.display

import os
import random

import tensorflow as tf
import tensorflow_hub as hub

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

print("All set up!")

All set up!


# Load the data
- Connect to google drive
- Load a few sample audio files


In [2]:
# Set up the data import using Google Drive.
# The google.colab module is what ties this cell to a Colab runtime; after
# mounting, the Drive contents are reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Point KAGGLE_CONFIG_DIR at the Drive folder that holds kaggle.json.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

# Change working directory so the relative paths used later
# (speakers_all.csv, recordings/...) resolve inside that folder, then list it.
%cd /content/gdrive/My Drive/Kaggle
!ls

/content/gdrive/My Drive/Kaggle
data  kaggle.json  reading-passage.txt	recordings  speakers_all.csv


In [4]:
# Speaker metadata table; its 'filename' and 'sex' columns are read later
# when the gender labels are built.
meta = pd.read_csv('speakers_all.csv')

# Recording names (no file extension) used for training and for testing.
x_train = ['afrikaans1', 'mandarin3','french38', 'lao2']
x_test = ['spanish94']

In [5]:
# Sample audio files
SAMP_RATE = 16000

afrikaans = 'recordings/recordings/afrikaans1.mp3'
mandarin = 'recordings/recordings/mandarin3.mp3'
spanish = 'recordings/recordings/spanish94.mp3'
french = 'recordings/recordings/french38.mp3'
lao = 'recordings/recordings/lao2.mp3'

# Decode every recording at SAMP_RATE, in the same order as before;
# sr ends up holding the rate reported for the last file loaded.
loaded_clips = []
for audio_path in (afrikaans, mandarin, spanish, french, lao):
  samples, sr = librosa.load(audio_path, sr=SAMP_RATE)
  loaded_clips.append(samples)

afrikaans_raw, mandarin_raw, spanish_raw, french_raw, lao_raw = loaded_clips

In [6]:
# The recordings have different durations, so the raw clips cannot form a
# rectangular 2-D array. Build the 1-D object array explicitly instead of
# letting np.asarray infer a ragged array: that inference is deprecated and
# raises ValueError on recent NumPy releases.
raw_clips = [afrikaans_raw, mandarin_raw, french_raw, lao_raw]
x_train_features = np.empty(len(raw_clips), dtype=object)
for k, clip in enumerate(raw_clips):
  # Element-wise assignment keeps each clip as one object entry even if
  # all clips happened to share a length.
  x_train_features[k] = clip
print(x_train_features.shape)
print(type(x_train_features))

(4,)
<class 'numpy.ndarray'>


In [7]:
# Segment the files into 10s arrays to have consistent input dimensions
def get_10s(audio, sr):
  """ Return the first 10 seconds of an already-loaded audio signal.
  Arguments: audio - sequence of audio samples; sr - sampling rate (samples per second)
  Returns: the leading 10*sr samples (the whole signal when it is shorter than that).
  """
  n_samples = 10 * sr
  return audio[:n_samples]

In [8]:
# Trim every training clip to its first 10 s so the clips can be stacked.
first_ten_seconds = [get_10s(clip, SAMP_RATE) for clip in x_train_features]
x_train_beg = np.asarray(first_ten_seconds)
print(x_train_beg.shape)
print(type(x_train_beg))

(4, 160000)
<class 'numpy.ndarray'>


In [9]:
def normalize(audio):
  """ Peak-normalize a waveform so that every sample lies in [-1, 1].
  Arguments: audio - array-like of audio samples
  Returns: the samples divided by the largest absolute sample value.
           Empty input is returned as-is, and an all-zero (silent) clip is
           returned unscaled instead of being divided by zero.
  """
  audio = np.asarray(audio)
  if audio.size == 0:
    return audio
  # Scale by the absolute peak: dividing by the signed maximum leaves
  # samples below -1 whenever the negative peak is the larger one.
  peak = np.max(np.abs(audio))
  # A silent clip has peak 0; dividing by it would fill the result with NaN.
  scale = peak if peak > 0 else 1
  return audio / scale

In [34]:
# Normalize each 10 s clip and stack them into one (4, 160000) array.
x_train_norm = np.asarray([normalize(i) for i in x_train_beg])
print(x_train_norm.shape)
print(type(x_train_norm))
print(type(x_train_norm[0]))
print(type(x_train_norm[0][0]))

# Reshape x_train_norm to have shape (1, 4, 160000) by adding a leading axis.
# Earlier attempts, kept for reference (each was recorded as not working):
# x_train_norm = np.expand_dims(x_train_norm, axis=0)  # didn't work
# x_train_norm = x_train_norm[np.newaxis] # also doesn't work
# x_train_norm = x_train_norm[None, :] # also doesn't work
#x_train_norm.reshape(-1, 4, 16000) # doesn't work
# After the next line x_train_norm[0] is the whole (4, 160000) stack,
# not a single clip.
x_train_norm = x_train_norm[None, :,:]

# Walk one indexing level deeper each time to confirm the new nesting.
print(x_train_norm.shape)
print(type(x_train_norm))
print(type(x_train_norm[0]))
print(type(x_train_norm[0][0]))
print(type(x_train_norm[0][0][0]))

(4, 160000)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float32'>
(1, 4, 160000)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float32'>


In [39]:
# Prepare the held-out Spanish clip: first 10 s, normalized, then a leading
# axis added so the result has shape (1, 160000).
spanish_clip = normalize(get_10s(spanish_raw, SAMP_RATE))
x_test_features = spanish_clip[np.newaxis, :]
print(x_test_features.shape)
# Report the type at each nesting level: array -> row -> single sample.
for probe in (x_test_features, x_test_features[0], x_test_features[0][0]):
  print(type(probe))
# Need to reshape this array!

(1, 160000)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float32'>


In [44]:
def gender_labels(filenames):
  """ Look up a binary gender label for each recording name.
  Arguments: filenames - recording names as they appear in meta['filename']
  Returns: integer np array with 1 where meta['sex'] is 'male' and 0 otherwise,
           in the same order as filenames.
  Raises: KeyError if a name has no row in meta.
  """
  # Keep the first row per filename, matching a "first match wins" lookup.
  sex_by_file = meta.drop_duplicates('filename').set_index('filename')['sex']
  missing = [name for name in filenames if name not in sex_by_file.index]
  if missing:
    raise KeyError("No metadata row for recording(s): {}".format(missing))
  return (sex_by_file.loc[list(filenames)] == 'male').astype(int).values

y_train_label = gender_labels(x_train)
print(y_train_label)
print(y_train_label.shape)
# Add a leading axis so the labels line up with the (1, 4, 160000) training array.
y_train_label = y_train_label[None, :]
print(y_train_label.shape)
print(type(y_train_label))

# Derive the test label from the metadata as well, rather than hard-coding it.
y_test_label = gender_labels(x_test)
print(type(y_test_label))

[0 1 1 0]
(4,)
(1, 4)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


# Load the pre-trained VGGish model from Tensorflow Hub

In [13]:
# Link to the model on TFHub
hub_url = 'https://tfhub.dev/google/vggish/1'

# Wrap the hub module in a KerasLayer so it can be called directly on a waveform.
# NOTE(review): a KerasLayer is a layer rather than a full Keras Model, which is
# presumably why the summary() call below is disabled - confirm.
vggish_model = hub.KerasLayer(hub_url)
#vggish_model.summary()

## Run sample audio through the model and examine the embedding

In [14]:
# Embed every clip on its own. Flattening the leading axes first keeps the
# loop correct whether x_train_norm is (clips, samples) or already carries an
# extra leading axis; indexing x_train_norm[i] would otherwise hand the model
# a whole stack of clips instead of one waveform.
for clip in x_train_norm.reshape(-1, x_train_norm.shape[-1]):
  vggish_embed = vggish_model(clip)
  print(vggish_embed.shape, vggish_embed.dtype)

(10, 128) <dtype: 'float32'>
(10, 128) <dtype: 'float32'>
(10, 128) <dtype: 'float32'>
(10, 128) <dtype: 'float32'>


# Build a binary gender classifier (goal: train it on the VGGish model/embeddings).

In [22]:
# TODO: put vggish_model in front of the dense stack so the classifier is
# trained on VGGish embeddings; for now it sees the raw normalized samples.

# Samples in one 10 s clip at the notebook's sampling rate.
CLIP_SAMPLES = 10 * SAMP_RATE

# Dense layers act on the last axis only, so the clip axis is left as None:
# the model accepts any number of clips instead of exactly four.
genderClf = tf.keras.models.Sequential([
  tf.keras.layers.Dense(128, activation='relu', input_shape=(None, CLIP_SAMPLES)),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid'),
])
genderClf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [43]:
# Add early stopping to train classifier model.
# NOTE(review): the original note here read "default is 10 epochs"; 10 is the
# epochs value passed to fit, not a property of EarlyStopping - confirm intent.
from tensorflow.keras.callbacks import EarlyStopping
# Created but not used yet: the callbacks argument is commented out in the call below.
early_stopping_monitor = EarlyStopping(patience=2)

#genderClf.fit(x_train_norm[0], y_train_label[0], epochs=10) #, callbacks=[early_stopping_monitor]) #validation_split=0.25,

In [45]:
# Train for 10 epochs on the normalized waveforms; the early-stopping callback
# and the validation split are still commented out.
genderClf.fit(x_train_norm, y_train_label, epochs=10) #, callbacks=[early_stopping_monitor]) #validation_split=0.25,

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f341e24d828>

## Confirm the types and shapes of the input vectors

In [18]:
# Report container type and shape for the features, then for the labels.
for checked in (x_train_norm, y_train_label):
  print(type(checked))
  print(checked.shape)

<class 'numpy.ndarray'>
(4, 160000)
<class 'numpy.ndarray'>
(4,)


In [19]:
# For each entry along the first axis: its type, its shape, and the type of
# its first element.
for entry in x_train_norm:
  print(type(entry))
  print(entry.shape)
  print(type(entry[0]))
  print("- - - - - - - - -")

<class 'numpy.ndarray'>
(160000,)
<class 'numpy.float32'>
- - - - - - - - -
<class 'numpy.ndarray'>
(160000,)
<class 'numpy.float32'>
- - - - - - - - -
<class 'numpy.ndarray'>
(160000,)
<class 'numpy.float32'>
- - - - - - - - -
<class 'numpy.ndarray'>
(160000,)
<class 'numpy.float32'>
- - - - - - - - -


# Next steps:
Embed the vggish model into a trainable binary classifier.

In [None]:
# enable fine-tuning with trainable argument
#layer = hub.KerasLayer(..., trainable=True)

# Reexport the fine-tuned model

#loaded_obj = hub.load("https://tfhub.dev/...")
#hub_layer = hub.KerasLayer(loaded_obj, trainable=True, ...)

#model = keras.Sequential([..., hub_layer, ...])
#model.compile(...)
#model.fit(...)

#export_module_dir = os.path.join(os.getcwd(), "finetuned_model_export")
#tf.saved_model.save(loaded_obj, export_module_dir)