In [1]:
import os
import numpy as np
import soundfile as sf 
import python_speech_features as speech_lib

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from sklearn.metrics import classification_report


%load_ext autoreload
%autoreload 2
# Show NumPy floats with 4 digits of precision whenever arrays are printed/displayed.
np.set_printoptions(precision=4)

# Importing lib.py script from my local drive
# The scripts folder is added to sys.path so that `import lib` can find the helper module.
# NOTE(review): this absolute path is machine-specific; the notebook only runs where it exists.
import sys
sys.path.append('/home/kenan/Desktop/Speech Processing/scripts/')
import lib

In [3]:
# Importing metadata from my local drive
# Location of the LibriSpeech SPEAKERS.TXT file; it is read further down to obtain the
# id and gender of each speaker.
# NOTE(review): absolute local path — adjust to wherever the dataset is stored.
speakers_file = '/home/kenan/Desktop/Speech Processing/dev-clean/LibriSpeech/SPEAKERS.TXT'

# Exploring the dataset

In [4]:
# Sanity check on the speech data from my local drive: read two recordings
# (from two different speaker folders) and report the sampling rate of each.
Audio1_path = '/home/kenan/Desktop/Speech Processing/dev-clean/LibriSpeech/dev-clean/84/121123/84-121123-0002.flac'
Audio2_path = '/home/kenan/Desktop/Speech Processing/dev-clean/LibriSpeech/dev-clean/174/168635/174-168635-0002.flac'

for flac_path in (Audio1_path, Audio2_path):
    # `data` and `samplerate` end up holding the values of the last file read
    data, samplerate = sf.read(flac_path)
    print('samplerate in Hz is', samplerate)

samplerate in Hz is 16000
samplerate in Hz is 16000


In [5]:
# Collect the speaker id and the speaker gender of every dev-clean entry in the metadata file.
# Class 0 refers to a female speaker and class 1 refers to a male speaker.
with open(speakers_file) as f:
    content = f.readlines()

# Accumulate in plain lists and convert once at the end: calling np.append inside the
# loop copies the whole array on every iteration.
speaker_id_list = []
speaker_gender_list = []
for line in content:
    if 'dev-clean' not in line:
        continue

    # Fields are pipe-separated: the id is read from field 0 and the gender marker from field 1.
    fields = line.split('|')
    if 'F' in fields[1]:
        gender_class = 0
    elif 'M' in fields[1]:
        gender_class = 1
    else:
        # Unrecognised gender marker: skip the speaker entirely. Recording the id without
        # a gender would leave the two arrays with different lengths and misaligned rows.
        continue

    speaker_id_list.append(int(fields[0]))
    speaker_gender_list.append(gender_class)

# Same names, dtype and 1-D shape as before, so the following cells are unaffected.
id_speaker = np.array(speaker_id_list, dtype=int)
gender_speaker = np.array(speaker_gender_list, dtype=int)

In [6]:
# Build the speaker table: one row per speaker, column 0 = speaker id, column 1 = gender class.
id_speaker = np.asarray(id_speaker)
gender_speaker = np.asarray(gender_speaker)
# Stack the two 1-D arrays as rows, then transpose so that each speaker becomes a row.
metadata = np.vstack((id_speaker, gender_speaker)).T

In [7]:
# List every speaker with its gender class (column 0 = id, column 1 = class).
for row in metadata:
    speaker_id, gender_class = row[0], row[1]
    print("Speaker id : {:5d}, gender class : {:d}".format(speaker_id, gender_class))

Speaker id :    84, gender class : 0
Speaker id :   174, gender class : 1
Speaker id :   251, gender class : 1
Speaker id :   422, gender class : 1
Speaker id :   652, gender class : 1
Speaker id :   777, gender class : 1
Speaker id :  1272, gender class : 1
Speaker id :  1462, gender class : 0
Speaker id :  1673, gender class : 0
Speaker id :  1919, gender class : 0
Speaker id :  1988, gender class : 0
Speaker id :  1993, gender class : 0
Speaker id :  2035, gender class : 0
Speaker id :  2078, gender class : 1
Speaker id :  2086, gender class : 1
Speaker id :  2277, gender class : 0
Speaker id :  2412, gender class : 0
Speaker id :  2428, gender class : 1
Speaker id :  2803, gender class : 1
Speaker id :  2902, gender class : 1
Speaker id :  3000, gender class : 1
Speaker id :  3081, gender class : 0
Speaker id :  3170, gender class : 1
Speaker id :  3536, gender class : 0
Speaker id :  3576, gender class : 0
Speaker id :  3752, gender class : 1
Speaker id :  3853, gender class : 0
S

In [26]:
# Separate the speaker table by gender class (column 1): 0 = female, 1 = male.
gender_column = metadata[:, 1]
female_speakers = metadata[gender_column == 0]
male_speakers = metadata[gender_column == 1]

In [27]:
# The male and female ratios in train and test sets are kept 50/50
# Each gender's speaker ids (column 0) are split separately with the project helper
# lib.split_train_test_speakers, using ratio=0.5 in both calls.
# NOTE(review): the two calls use different seeds (1 for female, 0 for male) — presumably
# incidental; confirm against lib.py.
train_female, test_female = lib.split_train_test_speakers(female_speakers[:,0],ratio = 0.5, seed=1)
train_male, test_male = lib.split_train_test_speakers(male_speakers[:,0],ratio = 0.5, seed=0)

In [28]:
# Putting together the training and testing sets consisting of both male and female speakers.
# Each set is an integer table with column 0 = speaker id and column 1 = gender class
# (0 for the female rows, 1 for the male rows); female rows come first.
def _attach_gender_column(speaker_ids, gender_column):
    """Return speaker_ids as a column with gender_column placed next to it."""
    return np.concatenate([np.expand_dims(speaker_ids, axis=1), gender_column], axis=1)


train_speakers = np.concatenate(
    [
        _attach_gender_column(train_female, np.zeros([train_female.shape[0], 1])),
        _attach_gender_column(train_male, np.ones([train_male.shape[0], 1])),
    ],
    axis=0,
).astype(int)
test_speakers = np.concatenate(
    [
        _attach_gender_column(test_female, np.zeros([test_female.shape[0], 1])),
        _attach_gender_column(test_male, np.ones([test_male.shape[0], 1])),
    ],
    axis=0,
).astype(int)

In [29]:
# List the speakers assigned to the training set together with their gender class.
print("Training set : ")
for row in train_speakers:
    speaker_id, gender_class = row[0], row[1]
    print("Speaker id : {:5d}, gender class : {:d}".format(speaker_id, gender_class))

Training set : 
Speaker id :  1919, gender class : 0
Speaker id :  6319, gender class : 0
Speaker id :  2035, gender class : 0
Speaker id :  3536, gender class : 0
Speaker id :  1673, gender class : 0
Speaker id :  5895, gender class : 0
Speaker id :  1988, gender class : 0
Speaker id :  6345, gender class : 0
Speaker id :  2277, gender class : 0
Speaker id :  1462, gender class : 0
Speaker id :  7976, gender class : 1
Speaker id :   251, gender class : 1
Speaker id :  8297, gender class : 1
Speaker id :  2428, gender class : 1
Speaker id :  2902, gender class : 1
Speaker id :  6295, gender class : 1
Speaker id :  2078, gender class : 1
Speaker id :  3752, gender class : 1
Speaker id :   777, gender class : 1
Speaker id :   422, gender class : 1


In [30]:
# List the speakers assigned to the testing set together with their gender class.
print("Testing set : ")
for row in test_speakers:
    speaker_id, gender_class = row[0], row[1]
    print("Speaker id : {:5d}, gender class : {:d}".format(speaker_id, gender_class))

Testing set : 
Speaker id :  5338, gender class : 0
Speaker id :    84, gender class : 0
Speaker id :  8842, gender class : 0
Speaker id :  7850, gender class : 0
Speaker id :  3081, gender class : 0
Speaker id :  6313, gender class : 0
Speaker id :  2412, gender class : 0
Speaker id :  3853, gender class : 0
Speaker id :  3576, gender class : 0
Speaker id :  1993, gender class : 0
Speaker id :  1272, gender class : 1
Speaker id :  5536, gender class : 1
Speaker id :  2803, gender class : 1
Speaker id :  2086, gender class : 1
Speaker id :  6241, gender class : 1
Speaker id :  3000, gender class : 1
Speaker id :   652, gender class : 1
Speaker id :   174, gender class : 1
Speaker id :  5694, gender class : 1
Speaker id :  3170, gender class : 1


In [35]:
# Choosing the number of coefficients for the Mel-spectrum features
# (13 is expected to perform well as a standard number of coefficients).
# This value is passed to lib.create_dataset and also bounds the MLP hidden-layer search.
mfcc_num = 13

In [36]:
# Creating the data set using lib.py code
# lib.create_dataset is called once per split with the (speaker id, gender class) table and
# the number of coefficients; each call returns a feature array and a label array.
# NOTE(review): presumably one feature row per utterance — confirm in lib.py.
train_data, train_gender = lib.create_dataset(train_speakers, mfcc_num=mfcc_num)
test_data, test_gender = lib.create_dataset(test_speakers, mfcc_num=mfcc_num)



loading data from speaker  1919




loading data from speaker  6319




loading data from speaker  2035




loading data from speaker  3536




loading data from speaker  1673




loading data from speaker  5895




loading data from speaker  1988




loading data from speaker  6345




loading data from speaker  2277




loading data from speaker  1462




loading data from speaker  7976




loading data from speaker  251




loading data from speaker  8297




loading data from speaker  2428




loading data from speaker  2902




loading data from speaker  6295




loading data from speaker  2078




loading data from speaker  3752




loading data from speaker  777




loading data from speaker  422




loading data from speaker  5338




loading data from speaker  84




loading data from speaker  8842




loading data from speaker  7850




loading data from speaker  3081




loading data from speaker  6313




loading data from speaker  2412




loading data from speaker  3853




loading data from speaker  3576




loading data from speaker  1993




loading data from speaker  1272




loading data from speaker  5536




loading data from speaker  2803




loading data from speaker  2086




loading data from speaker  6241




loading data from speaker  3000




loading data from speaker  652




loading data from speaker  174




loading data from speaker  5694




loading data from speaker  3170




(1421, 13)
(1421, 1)


In [39]:
# Shapes of the training features and of the training labels, one per line.
print(train_data.shape, train_gender.shape, sep="\n")

(1421, 13)
(1421, 1)


In [37]:
# Display the training feature matrix (NumPy abbreviates the repr of large arrays).
train_data

array([[-28.7541,  -3.5119,  -4.5472, ...,  -1.7335,  -2.3086,   2.4613],
       [-27.8175,  -9.6642,  -0.0682, ...,  -4.1466,  -1.0815,   1.9488],
       [-28.4726,  -8.912 ,  -3.2502, ...,  -2.6644,  -1.522 ,   2.4377],
       ...,
       [-27.1659,  -4.7583,   2.2956, ...,  -2.3554,   0.5527,  -0.9956],
       [-27.4037,  -2.8225,   3.4078, ...,  -1.7855,  -0.3974,  -0.1161],
       [-27.3189,  -6.0517,   3.3063, ...,  -2.3082,   1.1349,  -1.0217]])

In [38]:
# Display the testing feature matrix (NumPy abbreviates the repr of large arrays).
test_data

array([[-29.3208,  -0.7485,  -4.6897, ...,   1.9794,   0.3706,   2.4274],
       [-39.3019,  -1.1568,  -5.4866, ...,   0.9733,  -0.2523,  -0.0638],
       [-42.87  ,  -5.5678,  -6.4753, ...,   1.276 ,   2.402 ,  -1.4759],
       ...,
       [-33.3803,  -2.6679,  -1.4793, ...,  -1.9933,   5.2268,   0.1319],
       [-33.343 ,   0.1125,   1.0458, ...,  -1.9529,   1.8072,  -0.5877],
       [-35.1044,  -3.0005,   2.596 , ...,  -0.9595,   2.1759,  -0.2949]])

## Gaussian Naive Bayes classifier

The Gaussian Naive Bayes classifier assumes that the features are independent and that each feature is normally distributed. It is a simple and very fast classifier because it can be solved in closed form, unlike the iterative models presented below.

In [41]:
# Fit a Gaussian Naive Bayes model on the training features, then report the training
# accuracy and the detailed test-set evaluation produced by lib.test_classifier.
train_labels = np.squeeze(train_gender)  # labels are stored as a column; flatten them
test_labels = np.squeeze(test_gender)

clf = GaussianNB()
clf.fit(train_data, train_labels)

train_accuracy = clf.score(train_data, train_labels)
print("Training Accuracy : ", train_accuracy)
lib.test_classifier(clf, test_data, test_labels)

Training Accuracy :  0.88036593947924
Test accuracy :  0.7441497659906396
162 males classified as females out of 631, 25.674 %
166 females classified as males out of 651, 25.499 %
              precision    recall  f1-score   support

         0.0       0.75      0.75      0.75       651
         1.0       0.74      0.74      0.74       631

    accuracy                           0.74      1282
   macro avg       0.74      0.74      0.74      1282
weighted avg       0.74      0.74      0.74      1282



## Support Vector Machine Classifier

Support Vector Machine (SVM) is a classification method that separates the classes with a decision boundary (a hyperplane). It is a maximum-margin method, which means it chooses the boundary that maximizes the distance to the nearest training points of each class. A basic SVM performs linear classification, but it can be extended to non-linear classification by using a kernel transformation (here, we use a polynomial kernel).

In [42]:
# Search the polynomial-kernel degree (1 to 14) that gives the highest accuracy.
# NOTE(review): the degree is selected on the test set, so the test accuracy reported for
# the chosen model is optimistically biased; a validation split taken from the training
# speakers would give an unbiased estimate.

# Flatten the label columns once (loop-invariant) and use the same 1-D labels for both
# fitting and scoring, consistent with the other evaluation cells.
train_labels = np.squeeze(train_gender)
test_labels = np.squeeze(test_gender)

best_acc = 0
best_degree = None  # remains None only if no candidate scores above 0
for degree in range(1, 15):
    clf = svm.SVC(degree=degree, kernel='poly')
    clf.fit(train_data, train_labels)

    accuracy = clf.score(test_data, test_labels)

    # Strict '>' keeps the lowest degree when several degrees reach the same accuracy.
    if accuracy > best_acc:
        best_acc = accuracy
        best_degree = degree

        print("New optimal parameters : degree : ", best_degree, ", accuracy : ", accuracy)

New optimal parameters : degree :  1 , accuracy :  0.750390015600624


In [43]:
# Refit the polynomial-kernel SVM with the selected degree, then report the training
# accuracy and the detailed test-set evaluation produced by lib.test_classifier.
train_labels = np.squeeze(train_gender)  # labels are stored as a column; flatten them
test_labels = np.squeeze(test_gender)

clf = svm.SVC(degree=best_degree, kernel='poly')
clf.fit(train_data, train_labels)

train_accuracy = clf.score(train_data, train_labels)
print("Training Accuracy : ", train_accuracy)
lib.test_classifier(clf, test_data, test_labels)

Training Accuracy :  0.9092188599577762
Test accuracy :  0.750390015600624
173 males classified as females out of 631, 27.417 %
147 females classified as males out of 651, 22.581 %
              precision    recall  f1-score   support

         0.0       0.74      0.77      0.76       651
         1.0       0.76      0.73      0.74       631

    accuracy                           0.75      1282
   macro avg       0.75      0.75      0.75      1282
weighted avg       0.75      0.75      0.75      1282



## Neural Network (Multi-Layer Perceptron) Classifier


The Multi-Layer Perceptron (MLP) is a neural network that is able to classify data that are not linearly separable. It is trained iteratively and uses backpropagation to learn from the data.

Here, we use a perceptron with an input layer of size mfcc_num (one unit per MFCC coefficient), two hidden layers, and an output layer for the two classes. The "optimal" sizes of the two hidden layers are determined empirically from the testing accuracy; because the test set is used for this selection, the reported test accuracy is an optimistic estimate:

In [44]:
# Search the sizes of the two hidden layers that give the highest accuracy.
# NOTE(review): the sizes are selected on the test set, so the test accuracy reported for
# the chosen model is optimistically biased; a validation split taken from the training
# speakers would give an unbiased estimate.
# NOTE(review): the recorded output shows lbfgs stopping at its iteration limit; scaling
# the features or raising max_iter would address that, but would also change the results.

# Flatten the label columns once (loop-invariant) and use the same 1-D labels for both
# fitting and scoring, consistent with the other evaluation cells.
train_labels = np.squeeze(train_gender)
test_labels = np.squeeze(test_gender)

best_acc = 0
best_hs1 = best_hs2 = None  # remain None only if no candidate scores above 0
for hidden_size in range(2, mfcc_num + 1):
    # The second layer is always strictly smaller than the first, so the inner range is
    # empty for hidden_size == 2 and the first evaluated pair is (3, 2).
    for hidden_size2 in range(2, hidden_size):

        clf = MLPClassifier(solver='lbfgs', alpha=1e-1,
                            hidden_layer_sizes=(hidden_size, hidden_size2), random_state=1)

        clf.fit(train_data, train_labels)

        accuracy = clf.score(test_data, test_labels)
        # Strict '>' keeps the first (smallest) configuration when accuracies tie.
        if accuracy > best_acc:
            best_acc = accuracy
            best_hs1 = hidden_size
            best_hs2 = hidden_size2
            print(" New optimal parameters : Hidden layer sizes : ", best_hs1, best_hs2,", accuracy : ", best_acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


 New optimal parameters : Hidden layer sizes :  3 2 , accuracy :  0.7020280811232449
 New optimal parameters : Hidden layer sizes :  5 2 , accuracy :  0.765990639625585


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

 New optimal parameters : Hidden layer sizes :  8 6 , accuracy :  0.7800312012480499


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

 New optimal parameters : Hidden layer sizes :  11 4 , accuracy :  0.780811232449298


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

 New optimal parameters : Hidden layer sizes :  13 3 , accuracy :  0.782371294851794


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [45]:
# Running with best parameters: refit the MLP with the selected hidden-layer sizes, then
# report the training accuracy and the detailed test-set evaluation from lib.test_classifier.
train_labels = np.squeeze(train_gender)  # labels are stored as a column; flatten them
test_labels = np.squeeze(test_gender)

clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-1,
    hidden_layer_sizes=(best_hs1, best_hs2),
    random_state=1,
)
clf.fit(train_data, train_labels)

train_accuracy = clf.score(train_data, train_labels)
print("Training Accuracy : ", train_accuracy)
lib.test_classifier(clf, test_data, test_labels)

Training Accuracy :  0.9387755102040817
Test accuracy :  0.782371294851794
221 males classified as females out of 631, 35.024 %
58 females classified as males out of 651, 8.909 %
              precision    recall  f1-score   support

         0.0       0.73      0.91      0.81       651
         1.0       0.88      0.65      0.75       631

    accuracy                           0.78      1282
   macro avg       0.80      0.78      0.78      1282
weighted avg       0.80      0.78      0.78      1282



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


## Results Discussion

* The ANN outperforms the other classifiers in terms of accuracy and gives almost similar recall and precision results.
 
* In terms of accuracy, the ANN was best, followed by the SVM and then GNB, on both the training and testing sets.

* In terms of recall, precision and F1 score, it was not clear which classifier performed best, as their performance varied between predicting male and female speech.