# Speaker classification
Implementation of a basic speaker classification system. It uses GMMs to model each speaker's voice from MFCCs.

In [15]:
#IMPORTS
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
plt.style.use('ggplot')
rcParams['figure.figsize'] = 16, 8

from sklearn.mixture import GaussianMixture as GMM
from sklearn.decomposition import PCA
from sklearn import metrics

import sys
import os


import glob
import random

import librosa
import librosa.display

In [16]:
# VARIABLE DEFINITIONS
# Project root. Set the SPEECH_BASE_PATH environment variable to run the
# notebook from another location; otherwise the original hard-coded path is used.
base_path = os.environ.get('SPEECH_BASE_PATH', '/home/jc/speech_processing_notebooks')
speecon_path = os.path.join(base_path, 'audios', 'speecon')    # Speecon audio folder
temp_path = os.path.join(base_path, 'exports')    # Exports folder (MFCC files are written below it)

In [17]:
def mfcc(files, n_coefs=16, sr=8000):
    ''' Generic MFCC function
        Computes the MFCC coefficients for a list of audio file paths.
        Each file is read with librosa (resampled to `sr`) and its
        coefficients are appended, in the order given, to a single matrix.
        Parameters:
        - files: list with the paths of the audio files to process
        - n_coefs: int, number of MFCC coefficients per frame
        - sr: int, sampling rate in Hz used when loading the audio
              (default 8000, the value that used to be hard-coded)
        Returns:
        - np.array of shape (N, n_coefs) with the coefficients of every
          frame; N is 0 when `files` is empty'''

    # Zero-row float64 seed: keeps the (0, n_coefs) result for an empty file
    # list and the float64 dtype of the result, without a dummy row to strip.
    frames = [np.zeros((0, n_coefs))]

    for file_audio in files:
        # Read the audio, resampling to the requested rate
        audio, fs = librosa.load(file_audio, sr=sr)
        # The signal is passed as y= because recent librosa releases only
        # accept it by keyword. Transposed so that each row is one frame.
        frames.append(librosa.feature.mfcc(y=audio, sr=fs, n_mfcc=n_coefs).T)

    # Single stack at the end instead of re-copying the matrix for every file
    return np.vstack(frames)

In [18]:
def read_mfcc(path):
    ''' Reads a comma-separated MFCC file
        Parameters:
        - path: path of the MFCC file to read
        Returns:
        - 2-D np.array with one row per line of the file
    '''
    # ndmin=2 keeps a one-line file as a (1, n) matrix; without it loadtxt
    # collapses it to a 1-D vector.
    return np.loadtxt(path, delimiter=',', ndmin=2)

In [19]:
def classification_list(speecon, n_train=15, n_test=5):
    ''' Builds the train/test file lists used to compute the features
        Walks the database block by block and, inside each block, session
        by session, collecting the .wav files of every session folder.
        Files are sorted by name so the split does not depend on the order
        in which the filesystem lists them.
        Parameters:
        - speecon: root folder of the database (block/session/*.wav)
        - n_train: int, number of files (the first ones) used for training
        - n_test: int, maximum number of files (the last ones) used for test
        Returns:
        - dict mapping the last three characters of the session folder name
          to [train_files, test_files]. Sessions sharing those three
          characters overwrite each other. Test files are taken only from
          what is left after the training files, so the two lists never
          overlap even when a session has fewer than n_train + n_test files.'''
    # Dictionary holding the result
    split_by_speaker = dict()

    for block in sorted(os.listdir(speecon)):
        block_path = os.path.join(speecon, block)
        if not os.path.isdir(block_path):
            continue    # stray file at block level, nothing to list inside

        for ses in sorted(os.listdir(block_path)):
            ses_path = os.path.join(block_path, ses)
            if not os.path.isdir(ses_path):
                continue    # stray file at session level

            all_files = sorted(glob.glob(os.path.join(ses_path, '*.wav')))

            train_files = all_files[:n_train]
            remaining = all_files[n_train:]
            # Guard n_test == 0: remaining[-0:] would return everything
            test_files = remaining[-n_test:] if n_test > 0 else []

            split_by_speaker[ses[-3:]] = [train_files, test_files]

    return split_by_speaker

In [20]:
def compute_mfccs(paths_dict, n_coefs):
    ''' MFCC computation and export
        Takes the dictionary produced by classification_list and writes the
        MFCCs as comma-separated text files under temp_path/class:
        - train/<last 3 chars of the key>.mfcc: all training audios of the
          speaker together
        - test/<key>/audio<i>.mfcc: one file per test audio
        Prints a progress line every 10 speakers.'''
    # Export folders
    export_root = os.path.join(temp_path, 'class')
    train_dir = os.path.join(export_root, 'train')
    test_dir = os.path.join(export_root, 'test')
    for folder in (train_dir, test_dir):
        os.makedirs(folder, exist_ok=True)

    for done, (person, paths) in enumerate(paths_dict.items(), start=1):
        # The speaker model is built from all the training audios at once
        np.savetxt(os.path.join(train_dir, person[-3:] + '.mfcc'),
                   mfcc(paths[0], n_coefs),
                   delimiter=",")

        # Testing, on the other hand, is done audio by audio
        speaker_dir = os.path.join(test_dir, person)
        for idx, t_path in enumerate(paths[1]):
            os.makedirs(speaker_dir, exist_ok=True)
            audio_file = os.path.join(speaker_dir, f'audio{idx}.mfcc')
            np.savetxt(audio_file, mfcc([t_path], n_coefs), delimiter=",")

        if done % 10 == 0:
            print(f'Persona {done} calculada')
        

In [21]:
def train_gmm(n_gmms, train_path, random_state=None):
    ''' GMM training
        Fits one Gaussian mixture per *.mfcc file found in train_path.
        Parameters:
        - n_gmms: int, number of mixture components of each model
        - train_path: folder holding one <person>.mfcc file per speaker
        - random_state: optional seed forwarded to every GaussianMixture;
          the default None keeps the previous unseeded behaviour
        Returns:
        - trained_gmm: dictionary mapping each file name (without the
          .mfcc extension) to its fitted GMM
    '''
    trained_gmm = dict()

    # Sorted so the models are always fitted in the same order
    for mfcc_path in sorted(glob.glob(f"{train_path}/*.mfcc")):
        features = read_mfcc(mfcc_path)

        # Take the speaker id from the file name itself rather than from a
        # fixed character offset, which only works for 3-character names.
        person = os.path.splitext(os.path.basename(mfcc_path))[0]
        trained_gmm[person] = GMM(n_gmms, n_init=2, random_state=random_state).fit(features)

    return trained_gmm

In [22]:
def classification(gmms, test_path):
    ''' Classification of the audios under test_path
        Every file is scored against every model; the model with the
        highest score gives the predicted speaker.
        Parameters:
        - gmms: dictionary {person: fitted GMM} as returned by train_gmm()
        - test_path: folder with one sub-folder per speaker containing the
          *.mfcc test files
        Returns:
        - assigned: list of [correct_person, predicted_person] pairs, one
          per test file; predicted_person is None when gmms is empty
    '''
    assigned = list()

    # Sorted so the output order does not depend on the filesystem
    for traza in sorted(glob.glob(f"{test_path}/*/*.mfcc")):
        features = read_mfcc(traza)
        # The true speaker is the name of the folder containing the file
        correct_person = os.path.basename(os.path.dirname(traza))

        max_score = None
        predicted_person = None
        # Iterate the models received as argument, not a notebook global
        for key, gmm in gmms.items():
            score = gmm.score(features)
            if max_score is None or score > max_score:
                max_score = score
                predicted_person = key

        assigned.append([correct_person, predicted_person])
    return assigned

In [23]:
def precision(assigned):
    ''' Fraction of correct predictions
        Parameters:
        - assigned: list of [correct_person, predicted_person] pairs, as
          returned by classification()
        Returns:
        - float: pairs whose two elements are equal divided by the total
          number of pairs; 0.0 when the list is empty
        Also prints a one-line summary with the counts.
    '''
    total = len(assigned)
    count = sum(1 for prediction in assigned if prediction[0] == prediction[1])
    # Avoid dividing by zero when there is nothing to evaluate
    ratio = count / total if total else 0.0

    print(f'Correct: {count}/{total} Uncorrect: {total-count}/{total} Precision: {ratio}')

    return ratio

Path definition

In [24]:
# Same folders compute_mfccs() exports to: temp_path/class/train and
# temp_path/class/test.
train_path = os.path.join(temp_path,'class','train')
test_path = os.path.join(temp_path,'class','test')

Create lists to calculate features

In [25]:
# Dictionary mapping each speaker key to [train_files, test_files]
c_list = classification_list(speecon_path)

MFCC computation given a list

In [27]:
# Computes 16-coefficient MFCCs for every listed audio and writes them under temp_path/class
compute_mfccs(c_list, 16)

Persona 10 calculada
Persona 20 calculada
Persona 30 calculada
Persona 40 calculada
Persona 50 calculada
Persona 60 calculada
Persona 70 calculada
Persona 80 calculada
Persona 90 calculada
Persona 100 calculada
Persona 110 calculada
Persona 120 calculada
Persona 130 calculada
Persona 140 calculada
Persona 150 calculada


GMM training

In [28]:
# One 6-component GMM per *.mfcc file found in train_path
trained_gmm = train_gmm(n_gmms = 6, train_path = train_path)

Audio classification

In [29]:
# List of [correct_person, predicted_person] pairs, one per test file
assigments = classification(trained_gmm, test_path)

Results calculation 

In [30]:
# Prints the summary line and keeps the fraction of correct predictions
precision_v = precision(assigments)

Correct: 774/785 Uncorrect: 11/785 Precision: 0.9859872611464968
