In [22]:
import IPython
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
import import_ipynb
from sklearn.model_selection import GroupShuffleSplit

### Leer dataset

In [3]:
PATHDATA_MFFC_AVG = '../../../data/audios_MFCC_average.csv'
df = pd.read_csv(PATHDATA_MFFC_AVG, header = None)
data = df.values #Convertimos en un  numpy array

In [4]:
X = data[:,0:-3]
Y = data[:,-2]
print('Tamaño dataSet', X.shape)
print('\n')
Y=  np.reshape(Y,(np.size(Y,0),1))
groups = data[:,-1]
tracks = data[:,-3:-2]

Tamaño dataSet (540, 20)




### Número de clases

In [5]:
n_classes=len(np.unique(Y))
print('Número de clases:', n_classes)

Número de clases: 5


### Número de hablantes

In [6]:
n_groups = len(np.unique(groups))
print('Número de hablantes diferentes', n_groups)

Número de hablantes diferentes 9


### Número de Audios

In [7]:
n_tracks = len(np.unique(tracks))
print('Número de audios diferentes',n_tracks)

Número de audios diferentes 50


## Dividir el dataset (train/test) 0.8/0.2

In [18]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
gss.get_n_splits()
for train_index, test_index in gss.split(X, Y, groups=groups):
    X_train_original, X_test_original = X[train_index], X[test_index]
    #print(X_train, X_test)
    Y_train__original, Y_test_original = Y[train_index], Y[test_index]
    #print(y_train, y_test)
    groups_original = groups[train_index]
    groups_test_original=groups[test_index]
    tracks_train_original = tracks[train_index]
    tracks_test_original = tracks[test_index]   

### Info TRAIN

In [17]:
print('X',X_train_original.shape)
print('Groups',len(np.unique(groups_original)))
print('Tracks',len(np.unique(tracks_train_original)))

X (488, 20)
Groups 7
Tracks 45


### Info TEST

In [19]:
print('X',X_test_original.shape)
print('Groups',len(np.unique(groups_test_original)))
print('Tracks',len(np.unique(tracks_test_original)))

X (52, 20)
Groups 2
Tracks 5


In [20]:
FOLDS = 10
TRAIN_SIZE = 0.8

# GaussianMixture (GMM) 

## 1. Mejor modelo

Usamos X_train_original y Y_train__original para seleccionar la mejor combinación de parametros y posteriormente validar dichos res]ultados con el conjunto X_test__original y Y_test__original.

<b>Nota:</b> el conjunto X_train_original se divide en dos conjuntos train y dev a una proporción de 80 a 20.

In [23]:
def GMMClassifierTrain(X,Y,M,tipo):    
    #X: Matriz con las características
    #Y: Matriz con las clases
    #M: Número de componentes
    #tipo: Tipo de matriz de covarianza
    nClases = np.unique(Y)
    GMMs = [] # Vector que tiene los modelos GMM entrendao
    for c in nClases:
        gmm = GaussianMixture(M,tipo)#Configurar el modelo
        Xc = X[Y==c] # Seleccionamos la matrix de la clase a entrenar
        GMMs.append(gmm.fit(Xc))        
    return GMMs #Debe retornar un objeto que contenga todos los modelos entrenados

def GMMClassfierVal(GMMs,Xtest,tracks_test):
    nGmm = len(GMMs) #Numero de modelos que existen
    nSamples = len(Xtest) #Numero de muestras a clasificar
    """Creamos una matriz de tamaño nSamples,nGmm en donde
    cada valor de la columna representa la probabilidad de que esta muestra pertenezca a dicha clase
    la pocision del mayor valor de la columna es la clase a la que se predice pertenecera
    """
    mProbabilities = np.zeros((nSamples,nGmm))
    for i in range(nGmm):
        mProbabilities[:,i]=GMMs[i].score_samples(Xtest)
    Yest = np.argmax(mProbabilities,axis=1)    
    
    #Creamos un matriz con la primera fila las predicciones y la otra el track      
    X_test_with_id_trak = np.column_stack((Yest,tracks_test))

    # sacamos los 
    index_traks = list(np.unique(tracks_test))

    #Agrupamos por audio
    prediction_by_instances = npi.group_by(X_test_with_id_trak[:, -1]).split(X_test_with_id_trak[:, -2])
    
    #Sacamos la moda de prediccion
    predictions = []
    for v in range(prediction_by_instances.shape[0]):
        decision = stats.mode(prediction_by_instances[v])[0][0]
        predictions.append(decision)
    print('predi',predictions)
    
    
    
    return predictions #Debe retornar un vector con las clases predichas para cada una de las muestras en Xtest, de acuerdo con los modelos almacenados en GMMs

In [26]:
def TRAIN(n_components,covariance_type,X_train,Y_train,tracks_train,train_size,folds,groups_train):
    gss = GroupShuffleSplit(n_splits=folds, train_size=.7)
    EficienciaTrain = np.zeros(folds)
    EficienciaVal = np.zeros(folds)
    j = 0
    for train_idx, test_idx in gss.split(X_train, Y_train, groups_train):
        
        X_train_fold =X_train[train_idx]
        Y_train_fold=Y_train[train_idx]
        X_test_fold=X_train[test_idx]
        Y_test_fold=Y_train[test_idx]
        
        tracks_train_fold=tracks_train[train_idx]
        tracks_test_fold=tracks_train[test_idx]
        
        #Entrenamiento
        GMMs = GMMClassifierTrain(X_train_fold,Y_train_fold,n_components,covariance_type)
        
        
        #Validación
        Ytrain_pred = GMMClassfierVal(GMMs,X_train_fold,tracks_train_fold)
        Ytest_pred = GMMClassfierVal(GMMs,X_test_fold,tracks_test_fold)
        
        #Metricas en entrenamiento
        EficienciaTrain[j] = np.mean(Ytrain_pred.ravel() == Y_train_fold.ravel())
        EficienciaVal[j] = np.mean(Ytest_pred.ravel() == Y_test_fold.ravel())
        j += 1
        
        print('Eficiencia durante el entrenamiento = ' + str(np.mean(EficienciaTrain)) + '+-' + str(np.std(EficienciaTrain)))
        print('Eficiencia durante la validación = ' + str(np.mean(EficienciaVal)) + '+-' + str(np.std(EficienciaVal)))   

In [28]:
n_components=4
covariance_type='full'
X_train=X_train_original
Y_train=Y_train__original
tracks_train=tracks_train_original
train_size=TRAIN_SIZE
folds=FOLDS
groups_train=groups_original
TRAIN(n_components,covariance_type,X_train,Y_train,tracks_train,train_size,folds,groups_train)

IndexError: boolean index did not match indexed array along dimension 1; dimension is 20 but corresponding boolean dimension is 1