<h1> Objective: Assessment Task 9.2C – Speaker recognition using GMMs </h1>

<div style="text-align: right"> Done by: <b>Karan Murjani </b> </div>
<div style="text-align: right"> StudentId: <b> 221047083 </b></div>

In [1]:
import numpy as np
import pickle
import glob
import librosa
from pydub import AudioSegment
from pydub.utils import mediainfo
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import os

In [2]:
def mfcc_extraction(audio_filename, #.wav filename
                    hop_duration, #hop_length in seconds, e.g., 0.015s (i.e., 15ms)
                    num_mfcc #number of mfcc features
                   ):
    """Compute the MFCC feature matrix of one .wav file.

    Parameters
    ----------
    audio_filename : str
        Path of the .wav file to read.
    hop_duration : float
        Hop between successive analysis frames, in seconds (e.g. 0.015).
    num_mfcc : int
        Number of MFCC coefficients requested from librosa.

    Returns
    -------
    numpy.ndarray
        The transposed output of ``librosa.feature.mfcc`` (per the librosa
        documentation that is one row per frame and ``num_mfcc`` columns).
    """
    speech = AudioSegment.from_wav(audio_filename) #Read audio data from file
    # get_array_of_samples() serialises multi-channel audio as interleaved
    # samples (L, R, L, R, ...). Feeding that to librosa as a single signal
    # would corrupt the features, so down-mix to mono first. Mono files are
    # left untouched.
    if speech.channels > 1:
        speech = speech.set_channels(1)
    samples = np.asarray(speech.get_array_of_samples(), dtype=np.float32) #samples x(t)
    sampling_rate = speech.frame_rate #sampling rate f
    # librosa expects the hop in samples, not seconds.
    hop_length = int(sampling_rate * hop_duration)
    mfcc = librosa.feature.mfcc(
        y = samples,
        sr = sampling_rate,
        hop_length = hop_length,
        n_mfcc = num_mfcc)
    # Transpose so that frames run along the first axis.
    return mfcc.T

In [3]:
def learningGMM(features, #list of feature vectors, each feature vector is an array
                n_components, #the number of components
                max_iter, #maximum number of iterations
                random_state = None #optional seed for reproducible training
               ):
    """Fit a Gaussian mixture model to a set of feature vectors.

    Parameters
    ----------
    features : array-like
        Feature vectors to fit, passed unchanged to ``GaussianMixture.fit``.
    n_components : int
        Number of mixture components.
    max_iter : int
        Maximum number of EM iterations.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``GaussianMixture``. The default ``None`` keeps the
        previous behaviour (unseeded, so results may differ between runs);
        pass an int to make training repeatable.

    Returns
    -------
    sklearn.mixture.GaussianMixture
        The fitted model.
    """
    gmm = GaussianMixture(n_components = n_components,
                          max_iter = max_iter,
                          random_state = random_state)
    gmm.fit(features)
    return gmm

In [4]:
# Root folder of the dataset; training clips are read from
# <path>Train/<speaker>/ by the cells below.
path = 'SpeakerData/'
train_dir = path + 'Train/'
# Keep sub-folders only: os.listdir() also returns stray files (for example
# .DS_Store or a README), and such an entry would make the per-speaker
# os.listdir() call in the feature-extraction cell fail.
# The listing order is whatever the OS returns; it is deliberately not
# re-sorted because every later cell indexes into this same list.
speakers = [name for name in os.listdir(train_dir)
            if os.path.isdir(os.path.join(train_dir, name))]
print(speakers)

['Azmisov', 'Bahoke', 'Beez', 'Anthony', 'Beady', 'Arjuan', 'Bachroxx', 'Asp', 'Bae', 'Ariyan', 'Argail', 'Ara', 'Asladic', 'Asalkeld', 'Bart', 'Artk', 'Arthur', 'Bassel', 'Artem', 'Bareford', 'AppleEater', 'Arvala', 'BelmontGuy', 'Arun', 'B']


In [5]:
#this list is used to store the MFCC features of all training data of all speakers
# mfcc_all_speakers[k] holds the stacked per-file MFCC matrices of speakers[k].
mfcc_all_speakers = []
hop_duration = 0.015 #15ms
num_mfcc = 12
for s in speakers:
    sub_path = path + 'Train/' + s + '/'
    # Only .wav files can be decoded by mfcc_extraction; sorting makes the
    # row order of the stacked matrix independent of the OS listing order.
    sub_file_names = sorted(os.path.join(sub_path, f)
                            for f in os.listdir(sub_path)
                            if f.lower().endswith('.wav'))
    mfcc_per_file = [mfcc_extraction(fn, hop_duration, num_mfcc)
                     for fn in sub_file_names]
    if mfcc_per_file:
        # Stack once at the end instead of re-copying the growing array
        # after every file.
        mfcc_one_speaker = np.vstack(mfcc_per_file)
    else:
        # No usable clip for this speaker: keep an empty placeholder so the
        # list stays aligned with `speakers`.
        mfcc_one_speaker = np.empty((0, num_mfcc))
    mfcc_all_speakers.append(mfcc_one_speaker)

In [6]:
# Persist each speaker's MFCC matrix as TrainingFeatures/<speaker>_mfcc.fea.
feature_dir = 'TrainingFeatures/'
# open(..., 'wb') does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(feature_dir, exist_ok=True)
for speaker, speaker_mfcc in zip(speakers, mfcc_all_speakers):
    with open(feature_dir + speaker + '_mfcc.fea','wb') as f:
        pickle.dump(speaker_mfcc, f)

In [7]:
# Hyper-parameters shared by every speaker model.
n_components = 5
max_iter = 50
# Train one GMM per speaker; gmms[k] is the model for speakers[k] because
# mfcc_all_speakers was built in the same order as speakers.
gmms = [] #list of GMMs, each is for a speaker
for speaker_features in mfcc_all_speakers[:len(speakers)]:
    gmms.append(learningGMM(speaker_features, n_components, max_iter))

In [8]:
# Persist each trained model as Models/<speaker>.gmm.
model_dir = 'Models/'
# open(..., 'wb') does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(model_dir, exist_ok=True)
for speaker, speaker_gmm in zip(speakers, gmms):
    with open(model_dir + speaker + '.gmm', 'wb') as f: #'wb' is for binary write
        pickle.dump(speaker_gmm, f)

In [9]:
# Reload every speaker's model from Models/<speaker>.gmm, in the same order
# as `speakers`, replacing the in-memory list.
# NOTE: pickle.load can execute arbitrary code; only load model files that
# were produced by this notebook or another trusted source.
gmms = []
for speaker in speakers:
    model_path = 'Models/' + speaker + '.gmm'
    with open(model_path, 'rb') as model_file: #'rb' is for binary read
        gmms.append(pickle.load(model_file))

In [10]:
hop_duration = 0.015 #15ms
num_mfcc = 12
def speaker_recognition(audio_file_name, gmms):
    """Return the index of the model that best explains an audio file.

    The file's MFCC features are extracted with the module-level
    ``hop_duration`` and ``num_mfcc`` settings and scored against every
    model; the index of the highest-scoring model is returned.

    Parameters
    ----------
    audio_file_name : str
        Path of the .wav file to identify.
    gmms : sequence
        Trained models, each exposing ``score(features)``. The returned
        index refers to a position in this sequence.

    Returns
    -------
    int
        Index into ``gmms`` of the best-scoring model (the first one on ties).

    Raises
    ------
    ValueError
        If ``gmms`` is empty.
    """
    if len(gmms) == 0:
        raise ValueError("gmms must contain at least one trained model")
    # The features do not depend on the model, so extract them once rather
    # than once per GMM.
    mfcc_test = mfcc_extraction(audio_file_name, hop_duration, num_mfcc)
    scores = [gmm.score(mfcc_test) for gmm in gmms]
    # max() keeps the first maximal index.
    return max(range(len(scores)), key=scores.__getitem__)

In [11]:
# Sanity check on a single test clip: print the name of the predicted speaker.
test_clip = 'SpeakerData/Test/Ara/a0522.wav'
speaker_id = speaker_recognition(test_clip, gmms)
predicted_name = speakers[speaker_id]
print(predicted_name)

Ara


In [12]:
#Checking across entire test file
# Every clip under SpeakerData/Test/<speaker>/ is classified; the folder name
# is the ground-truth label.
# Only .wav files are kept (any other file would fail to decode), and the list
# is sorted so the evaluation order does not depend on the OS.
files = sorted(f for f in glob.glob("SpeakerData/Test/*/*")
               if f.lower().endswith('.wav'))
y_true = []
y_pred = []
new_item = [] # not used in this cell; kept in case a later cell refers to it

for file in files:
    # The parent directory name is the speaker the clip belongs to.
    true_label = os.path.basename(os.path.dirname(file))
    speaker_id = speaker_recognition(file, gmms)
    pred_label = speakers[speaker_id]
    y_true.append(true_label)
    y_pred.append(pred_label)

In [13]:
# Summarise the test-set predictions: overall accuracy, the confusion matrix
# and the classification report, all computed from y_true / y_pred.
overall_accuracy = accuracy_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
report_text = classification_report(y_true, y_pred)

print("Overall Accuracy:", overall_accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report_text)

Overall Accuracy: 0.9485714285714286
Confusion Matrix:
 [[1 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0