In [280]:
import numpy as np
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.fftpack import dct

sns.set()
import os
import scipy
import librosa.display
from IPython.display import Audio
import random
from functools import reduce
from sklearn.mixture import GaussianMixture

In [281]:
# Extract MFCC Feature

In [295]:
# sample_rate = 48000
def extractMFCC(filename, sample_rate):
    #sample_rate = 192000
    utter_part, sr = librosa.core.load(filename, sample_rate)  # load utterance audio
    intervals = librosa.effects.split(utter_part, top_db=30)  # voice activity detection
    # print(intervals)
    S_total = []
    for inte in intervals:
        S = librosa.core.stft(y=utter_part[inte[0]:inte[1]])
        S = np.abs(S) ** 2
        # print("Size of S is {}".format(S.shape))
        mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=2048, n_mels=40)
        S = np.log10(np.dot(mel_basis, S) + 1e-6)
        # print(S.shape)
        S_total.append(S)
    # plt.show()
    # plt.pause(2)
    # Extract MFCC feature
    # mfccs = librosa.feature.mfcc(y=utter_part.astype('float'), sr=sample_rate, n_mfcc=10)
    S_total = np.concatenate(S_total,axis=1)
    # print(S_total.shape)
    mfccs = librosa.feature.mfcc(S = S_total, sr=sample_rate, n_mfcc=40) # (128, times)
    # print(mfccs.shape)
    mfccs = mfccs.reshape(-1,40)
    return mfccs

In [296]:
dataset = [dirs for dirs in os.listdir('.') if(dirs[-4:]==("high"))]

In [297]:
dataset

['hanqing-high', 'jianzhi-high', 'liuli-high', 'Nick-high', 'xiao-high']

In [298]:
import random
# Random select 2 as enrollment, to train GMM model
def split(speaker_wavs):
    random.shuffle(speaker_wavs)
    # print(speaker_wavs)
    enroll = speaker_wavs[:2]
    verify = speaker_wavs[2:]
    return enroll, verify

In [299]:
speakers_en = {}
speakers_ve = {}
for idx,speaker in enumerate(dataset):
    speaker_wavs = os.listdir(os.path.join('.', dataset[idx]))
    speaker_label = dataset[idx][:-5]
    enroll, verify = split(speaker_wavs)
    en_mfccs = []
    ve_mfccs = []
    for en in enroll:
        wav_path = os.path.join('.', dataset[idx], en)
        mfcc = extractMFCC(wav_path,192000)
        mfcc = mfcc - np.mean(mfcc, axis=0)
        # print(mfcc.shape)
        en_mfccs.append(mfcc)
    speakers_en[speaker_label] = np.concatenate(en_mfccs, axis=0)
    for ve in verify:
        wav_path = os.path.join('.', dataset[idx], ve)
        mfcc = extractMFCC(wav_path,192000)
        mfcc = mfcc - np.mean(mfcc, axis=0)
        # print(mfcc.shape)
        ve_mfccs.append(mfcc)
    speakers_ve[speaker_label] = np.concatenate(ve_mfccs, axis=0)
#print(speakers_en)

In [304]:
GMM = {}
UBM = {}
for k, v in speakers_en.items():
    GMM[k] = GaussianMixture(n_components= 5, covariance_type= 'diag')
    UBM[k] = GaussianMixture(n_components= 5, covariance_type= 'diag')
    print(v.shape)
    GMM[k].fit(v)
    other_v = []
    for k1, v1 in speakers_en.items():
        if(k1 != k):
            other_v.append(v1)
    other_v = np.concatenate(other_v)
    UBM[k].fit(other_v)

(1504, 40)
(979, 40)
(1504, 40)
(1377, 40)
(1504, 40)


In [305]:
for k, v in speakers_ve.items():
    print("For speaker {}".format(k))
    # print(GMM[k].score_samples(v))
    x = GMM[k].score_samples(v) - UBM[k].score_samples(v)
    total = 0 
    correct = 0
    for i in x:
        if i > 0:
            correct +=1 
        total += 1
    # Accuracy for every phoneme
    print("accuracy is {}".format(correct/total))

For speaker hanqing
accuracy is 0.44425817267393125
For speaker jianzhi
accuracy is 0.5574245939675174
For speaker liuli
accuracy is 0.6283244680851063
For speaker Nick
accuracy is 0.46453168044077137
For speaker xiao
accuracy is 0.5136303191489362


In [306]:
dataset = [dirs for dirs in os.listdir('.') if(dirs[-4:]==("-low"))]

In [307]:
dataset

['hanqing-low', 'jianzhi-low', 'liuli-low', 'Nick-low', 'xiao-low']

In [312]:
import random
# Random select 2 as enrollment, to train GMM model
def split(speaker_wavs):
    random.shuffle(speaker_wavs)
    # print(speaker_wavs)
    enroll = speaker_wavs[:2]
    verify = speaker_wavs[2:]
    return enroll, verify

In [313]:
speakers_en = {}
speakers_ve = {}
for idx,speaker in enumerate(dataset):
    speaker_wavs = os.listdir(os.path.join('.', dataset[idx]))
    speaker_label = dataset[idx][:-5]
    enroll, verify = split(speaker_wavs)
    en_mfccs = []
    ve_mfccs = []
    for en in enroll:
        wav_path = os.path.join('.', dataset[idx], en)
        mfcc = extractMFCC(wav_path,44100)
        mfcc = mfcc - np.mean(mfcc, axis=0)
        # print(mfcc.shape)
        en_mfccs.append(mfcc)
    speakers_en[speaker_label] = np.concatenate(en_mfccs, axis=0)
    for ve in verify:
        wav_path = os.path.join('.', dataset[idx], ve)
        mfcc = extractMFCC(wav_path,44100)
        mfcc = mfcc - np.mean(mfcc, axis=0)
        # print(mfcc.shape)
        ve_mfccs.append(mfcc)
    speakers_ve[speaker_label] = np.concatenate(ve_mfccs, axis=0)
#print(speakers_en)

In [314]:
GMM = {}
UBM = {}
for k, v in speakers_en.items():
    GMM[k] = GaussianMixture(n_components= 5, covariance_type= 'diag')
    UBM[k] = GaussianMixture(n_components= 5, covariance_type= 'diag')
    print(v.shape)
    GMM[k].fit(v)
    other_v = []
    for k1, v1 in speakers_en.items():
        if(k1 != k):
            other_v.append(v1)
    other_v = np.concatenate(other_v)
    UBM[k].fit(other_v)

(346, 40)
(307, 40)
(346, 40)
(346, 40)
(346, 40)


In [315]:
for k, v in speakers_ve.items():
    print("For speaker {}".format(k))
    # print(GMM[k].score_samples(v))
    x = GMM[k].score_samples(v) - UBM[k].score_samples(v)
    total = 0 
    correct = 0
    for i in x:
        if i > 0:
            correct +=1 
        total += 1
    # Accuracy for every phoneme
    print("accuracy is {}".format(correct/total))

For speaker hanqin
accuracy is 0.21242774566473988
For speaker jianzh
accuracy is 0.46181172291296624
For speaker liul
accuracy is 0.3627167630057804
For speaker Nic
accuracy is 0.25289017341040465
For speaker xia
accuracy is 0.5447976878612717
