<a href="https://colab.research.google.com/github/kregier/AudioLanguageClassifer/blob/main/SampleModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# What is this notebook?
This is a notebook to load a few audio files and load the VGGish model. The idea is to make sure the model loads and runs before moving it to the larger notebook.

In [1]:
# Set up the environment
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import IPython.display as ipd
import librosa
import librosa.display

import os
import random

import tensorflow as tf
import tensorflow_hub as hub

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

from sklearn.metrics import classification_report, confusion_matrix

print("All set up!")

All set up!


# Load the data
- Connect to google drive
- Load a few sample audio files


In [2]:
# Set up the data import using Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

# Change working directory
%cd /content/gdrive/My Drive/Kaggle
!ls

/content/gdrive/My Drive/Kaggle
data  kaggle.json  reading-passage.txt	recordings  speakers_all.csv


In [90]:
meta = pd.read_csv('speakers_all.csv')
# Prepare the data based on previous exploration
# Drop 3 end columns with NaN values
meta.drop(['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1, inplace=True)

# Set speakerid as index
meta.set_index('speakerid', inplace=True)
meta.sort_index(inplace=True)

# Replace missing values and typos
meta.loc[meta.country.isnull(), 'country'] = 'laos'
type_idx = meta[meta.sex =='famale'].index
meta.loc[type_idx, 'sex'] = 'female'

# Delete records with missing audio files
missingIdx = meta[meta['file_missing?']==True].index
meta.drop(missingIdx, inplace=True )

# Delete records with no birthplace - synthesized files
meta.dropna(subset=['birthplace'], inplace=True)

# Delete files not present in audiofiles database
nica_index = meta[meta.filename == 'nicaragua'].index
sinhalese_index = meta[meta.filename=='sinhalese1'].index
meta.drop(nica_index, inplace=True, axis=0)
meta.drop(sinhalese_index, inplace=True, axis=0)

meta.head()

Unnamed: 0_level_0,age,age_onset,birthplace,filename,native_language,sex,country,file_missing?
speakerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,27.0,9.0,"virginia, south africa",afrikaans1,afrikaans,female,south africa,False
2,40.0,5.0,"pretoria, south africa",afrikaans2,afrikaans,male,south africa,False
3,25.0,15.0,"diekabo, ivory coast",agni1,agni,male,ivory coast,False
4,19.0,6.0,"prishtina, kosovo",albanian1,albanian,male,kosovo,False
5,33.0,15.0,"tirana, albania",albanian2,albanian,male,albania,False


In [91]:
meta.shape

(2134, 8)

In [92]:
# Select 96 files at random from meta.filename
data = np.random.choice(meta.filename, size=96, replace=False)

idx = meta[meta.filename.isin(data)].index
df = meta.loc[idx, ['filename', 'sex']]

df.head()

Unnamed: 0_level_0,filename,sex
speakerid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,afrikaans1,female
35,bengali2,female
54,danish1,female
86,english28,male
91,english32,female


In [93]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df['filename'], df['sex'], random_state=38, test_size=0.33)

### Format features for testing and training sets

In [94]:
SAMP_RATE = 16000

In [95]:
# Segment the files into 10s arrays to have consistent input dimensions
def get_10s(audio, sr):
  """ Load an audio file and get the first 10 seconds.
  Arguments: audio - the audio file; sr = sampling rate of the file
  Returns: first 10s of audio file.
  """
  beginning = audio[0:10*sr]
  return beginning

In [96]:
# Scale audio to fall between [-1, 1]
def normalize(audio):
  norm = audio/max(audio)
  return norm

In [97]:
# Sample audio files
def load_data(series,  sr):
  output = []
  for i in range(len(series)):
    filename = series.iloc[i]
    filepath = 'recordings/recordings/' + filename + '.mp3'
    soundfile, sr = librosa.load(filepath, sr=SAMP_RATE)
    output.append(normalize(get_10s(soundfile, sr)))
  return np.asarray(output)

Input vector should be [batch_size, num_frames, num_bands]
- batch size = 32
- num_frames = 94
- num_bands = 64
(96 x 64)  = log mel spectrogram


In [98]:
x_train = load_data(x_train, SAMP_RATE)
print(x_train.shape)
print(type(x_train))

(64, 160000)
<class 'numpy.ndarray'>


In [54]:
x_test = load_data(x_test, SAMP_RATE)
print(x_test.shape)
print(type(x_test))

(32, 160000)
<class 'numpy.ndarray'>


In [63]:
# Format label arrays
def gender_str_to_int(labels):
  y_label = []
  for i in range(len(labels)):
    gender = labels.iloc[i]
    if gender == 'male':
      y_label.append(1)
    else: y_label.append(0)
  return np.asarray(y_label)

In [86]:
y_train_label = gender_str_to_int(y_train)
print(type(y_train_label))
print(y_train_label.shape)

<class 'numpy.ndarray'>
(64,)


In [87]:
y_test_label = gender_str_to_int(y_test)
print(type(y_test_label))
print(y_test_label.shape)

<class 'numpy.ndarray'>
(32,)


## Convert data (np. arrays) to a TF Dataset

In [None]:
train_dataset =  tf.data.Dataset.from_tensor_slices( (x_train, y_train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices( (x_test, y_test_labels))

In [None]:
BATCH_SIZE=32
SHUFFLE_BUFFER_SIZE=100

train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

# Load the pre-trained VGGish model from Tensorflow Hub

In [68]:
# Link to the model on TFHub
hub_url = 'https://tfhub.dev/google/vggish/1'

# Load the model as a Keras model
vggish_model = hub.KerasLayer(hub_url)
#vggish_model.summary()

## Run sample audio through the model and examine the embedding

In [69]:
for i in range(len(x_train)):
  vggish_embed = vggish_model(x_train[0][i])
  print(vggish_embed.shape, vggish_embed.dtype)

(10, 128) <dtype: 'float32'>


# Embed the vggish model/embeddings into a binary gender classifier.

In [71]:
genderClf = tf.keras.models.Sequential([vggish_model,
                              tf.keras.layers.Dense(128, activation = 'relu'), #, input_shape = (, 160000)),
                              tf.keras.layers.Dense(64, activation = 'relu'),
                              tf.keras.layers.Dense(1, activation='sigmoid')
                              ])
genderClf.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])

In [73]:
# Add early stopping to train classifier model
# default is 10 epochs
from tensorflow.keras.callbacks import EarlyStopping
early_stopping_monitor = EarlyStopping(patience=2)

#genderClf.fit(x_train_norm[0], y_train_label[0], epochs=10) #, callbacks=[early_stopping_monitor]) #validation_split=0.25,

In [88]:
#x_train.shape

(64, 160000)

In [89]:
genderClf.fit(x_train, y_train_label, epochs=10, callbacks=[early_stopping_monitor], 
              validation_split=0.25, batch_size=32)

Epoch 1/10


ValueError: ignored

In [None]:
genderClf.fit(train_dataset, epochs=10, callbacks=[early_stopping_monitor], 
              validation_split=0.25, batch_size=BATCH_SIZE)

In [72]:
genderClf.summary()

ValueError: ignored

In [92]:
#y_pred = genderClf.predict(x_test)
y_pred = genderClf.evaluate(test_dataset)
#y_pred = genderClf.predict(normalize(get_10s(spanish_raw, SAMP_RATE)), y_test_label)





In [94]:
confusion_matrix(y_test, y_pred)

[[[0.97188115]
  [0.42228356]]]
[[1 0]]


## Confirm the types and shapes of the input vectors

In [84]:
print(type(x_train_norm))
print(x_train_norm.shape)

print(type(y_train_label))
print(y_train_label.shape)

<class 'numpy.ndarray'>
(1, 3, 160000)
<class 'numpy.ndarray'>
(1, 3)


In [85]:
print(x_test_norm.shape)
print(type(x_test_norm))

(1, 2, 160000)
<class 'numpy.ndarray'>


In [None]:
for i in range(len(x_train_norm)):
  print(type(x_train_norm[i]))
  print(x_train_norm[i].shape)
  print(type(x_train_norm[i][0]))
  print("- - - - - - - - -")

# Next steps:
Embed the vggish model into a trainable binary classifier.

In [None]:
# enable fine-tuning with trainable argument
#layer = hub.KerasLayer(..., trainable=True)

# Reexport the fine-tuned model

#loaded_obj = hub.load("https://tfhub.dev/...")
#hub_layer = hub.KerasLayer(loaded_obj, trainable=True, ...)

#model = keras.Sequential([..., hub_layer, ...])
#model.compile(...)
#model.fit(...)

#export_module_dir = os.path.join(os.getcwd(), "finetuned_model_export")
#tf.saved_model.save(loaded_obj, export_module_dir)

Epoch - on complete pass through the dataset

Batch size = divide dataset into smaller parts/sets

Iterations - number of batches to complete one epoch

In [None]:
#model.fit(x, y, batch_size=n, epochs=n) # Batch size default is 32
# Do not use batch size if the data is in the form of dataset, generators, or keras.utils.Sequence instances

# Import larger sample set
- at least two batches, to see how model fits
- a least one batch of testing data

In [None]:
meta.head()

In [None]:
# Format label arrays
#def gender_int(labels):
#  y_label = []
#  for name in labels:
#    idx = meta[meta.filename == name].index
#    gender = meta.loc[idx, 'sex'].values[0]
#    if gender == 'male':
#      y_label.append(1)
#    else: y_label.append(0)

In [None]:
y_train.head()

## Original (small samples)

In [62]:
#Filenames
x_train = ['afrikaans1', 'mandarin3','french38']
x_test = ['spanish94', 'lao2']

# Sample audio files
afrikaans = 'recordings/recordings/afrikaans1.mp3'
mandarin = 'recordings/recordings/mandarin3.mp3'
spanish = 'recordings/recordings/spanish94.mp3'
french = 'recordings/recordings/french38.mp3'
lao = 'recordings/recordings/lao2.mp3'

SAMP_RATE = 16000

afrikaans_raw, sr = librosa.load(afrikaans, sr=SAMP_RATE)
mandarin_raw, sr = librosa.load(mandarin, sr=SAMP_RATE)
spanish_raw, sr = librosa.load(spanish, sr=SAMP_RATE)
french_raw, sr = librosa.load(french, sr=SAMP_RATE)
lao_raw, sr = librosa.load(lao, sr=SAMP_RATE)

In [63]:
# Results in np arrays of different lengths, since the audio files are different lengths
x_train_features = np.asarray([afrikaans_raw, mandarin_raw, french_raw])
# print(x_train_features.shape)
# print(type(x_train_features))

In [64]:
x_test_features = np.asarray([spanish_raw, lao_raw])
# print(x_test_features.shape)
# print(type(x_test_features))

In [25]:
#x_train_beg = np.asarray([get_10s(i, SAMP_RATE) for i in x_train_features])
#print(x_train_beg.shape)
#print(type(x_train_beg))

(4, 160000)
<class 'numpy.ndarray'>


In [67]:
x_train_norm = np.asarray([normalize(get_10s(i, SAMP_RATE)) for i in x_train_features])
#x_train_norm = np.asarray([normalize(i) for i in x_train_beg])
print(x_train_norm.shape)
#print(type(x_train_norm))
#print(type(x_train_norm[0]))
#print(type(x_train_norm[0][0]))

# Reshape x_train_norm to have shape (None, 4, 16000)
x_train_norm = x_train_norm[None, :,:]

print(x_train_norm.shape)
#print(type(x_train_norm))
#print(type(x_train_norm[0]))
#print(type(x_train_norm[0][0]))
#print(type(x_train_norm[0][0][0]))

(3, 160000)
(1, 3, 160000)


In [68]:
# Format x_test array
x_test_norm = np.asarray([normalize(get_10s(i, SAMP_RATE)) for i in x_test_features])
print(x_test_norm.shape)

x_test_norm = x_test_norm[None, :,:]
print(x_test_norm.shape)
# print(type(x_test_norm))
# print(type(x_test_norm[0]))
# print(type(x_test_norm[0][0]))
# Need to reshape  this array!

(2, 160000)
(1, 2, 160000)
