# Speaker Identification Using Machine Learning

## Part I: Recording Audio Sample using Pyaudio

### Record a few seconds of audio and save to a wave file

In [5]:
"""PyAudio Example: Record a few seconds of audio and save to a wave file."""

def record_audio(dir='./training_set/audio_samples/'):
    import wave
    import sys
    from datetime import datetime
    import pyaudio

    # Get current time
    current_time = datetime.now()
    # Format time
    current_time = current_time.strftime("%H-%M-%S_%d-%m-%Y")

    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1 if sys.platform == 'darwin' else 2
    RATE = 44100
    RECORD_SECONDS = 5

    file_name = 'sample_' + current_time + '.wav'
    path = dir + file_name

    with wave.open(path, 'wb') as wf:
        p = pyaudio.PyAudio()
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)

        stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True)

        print('Recording...')
        for _ in range(0, RATE // CHUNK * RECORD_SECONDS):
            wf.writeframes(stream.read(CHUNK))
        print('Done')

        stream.close()
        p.terminate()

In [6]:
# Test
record_audio()

Recording...
Done


### Play a wave file

In [9]:
"""PyAudio Example: Play a wave file."""

def play_audio(dir = './training_set/audio_samples/'):
    import wave
    import glob
    import pyaudio

    CHUNK = 1024
    # Create the list of files to determine last file
    audio_files = glob.glob(dir + '*.wav')

    for file_path in audio_files:
        with wave.open(file_path, 'rb') as wf:
            # Instantiate PyAudio and initialize PortAudio system resources (1)
            p = pyaudio.PyAudio()

            # Open stream (2)
            stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                            channels=wf.getnchannels(),
                            rate=wf.getframerate(),
                            output=True)

            # Play samples from the wave file (3)
            while len(data := wf.readframes(CHUNK)):  # Requires Python 3.8+ for :=
                stream.write(data)

            # Close stream (4)
            stream.close()

    # Release PortAudio system resources (5)
    p.terminate()

In [22]:
def play_buttons(path = './training_set/audio_samples/*.wav'):
    import glob
    import IPython.display as ipd
    
    audio_files = glob.glob(path)
    # Create and display a "display" objects
    for file in audio_files:
        ipd.display(ipd.Audio(audio_files[0]))

play_buttons()

## Extracting Features from Audio Samples

In [1]:
def extract_features(audio, rate=44100):
    import librosa
    import numpy as np

    y, sr = librosa.load(audio, sr=rate)  # sr = sampling rate
    mfcc_feature = librosa.feature.mfcc(y=y, sr=sr)
    delta = librosa.feature.delta(mfcc_feature)
    combined = np.hstack((mfcc_feature, delta))
    return combined

In [None]:
# path = './training_set/audio_samples/tomek-sample_20-47-12_01-03-2023.wav'
# mfcc, mfcc_delta = extract_features(audio=path)
# print(mfcc.shape)
# print(mfcc_delta.shape)

# print(mfcc)
# print(mfcc_delta)

## Train Model using Gaussian Mixture Model

In [50]:
def train_model():
    import numpy as np
    import glob
    import wave
    import pickle
    from sklearn.mixture import GaussianMixture

    source = "./training_set/audio_samples/*.wav"
    dest = "./trained_models/"
    name = "tomek"
    file_paths = glob.glob(source)

    count = 1

    features = np.asarray(())

    for path in file_paths:

        with wave.open(path, 'rb') as wf:
            sr = wf.getframerate()
        vector = extract_features(audio=path, rate=sr)

        if features.size == 0:
            features = vector
        else:
            features = np.vstack((features, vector))

        if count == len(file_paths):
            gmm = GaussianMixture(max_iter = 200, covariance_type = 'diag', n_init = 3)
            gmm.fit(features)

            # dumping the trained gaussian model
            #picklefile = path.split("/")[-1].split("-")[0]+".gmm"
            picklefile = name + '.gmm'
            pickle.dump(gmm, open(dest + picklefile, 'wb'))
            print('+ modeling completed for speaker:', picklefile, " with data point = ",features.shape)
        count += 1
  
train_model()

+ modeling completed for speaker: tomek.gmm  with data point =  (100, 862)


## Test model

In [12]:
def test_model():
    import pickle
    import glob
    import wave
    import numpy as np

    source = "./testing_set/"
    model_dir = "./trained_models/"
    test_audio_files = glob.glob(source + "audio_samples/*.wav")
    gmm_files = glob.glob(model_dir + "*.gmm")

    # Load models
    models = [pickle.load(open(fname,'rb')) for fname in gmm_files]

    # Extract features from test audio files
    for audio_file in test_audio_files:
        with wave.open(audio_file, 'rb') as wf:
            sr = wf.getframerate()
        vector = extract_features(audio=audio_file, rate=sr)

        log_likelihood = np.zeros(len(models))

        for i in range(len(models)):
            gmm = models[i]  #checking with each model one by one
            scores = np.array(gmm.score(vector))
            log_likelihood[i] = scores.sum()
            
        winner = np.argmax(log_likelihood)
        print(winner)

test_model()



[-3611.05432274]
0
[-3662.99263968]
0


In [14]:
def test_model():
    import pickle
    import glob
    import wave
    import numpy as np

    source = "./testing_set/"
    model_dir = "./trained_models/"
    test_audio_files = glob.glob(source + "audio_samples/*.wav")
    gmm_files = glob.glob(model_dir + "*.gmm")

    # Load models
    models = [pickle.load(open(fname,'rb')) for fname in gmm_files]

    # Extract features from test audio files
    for audio_file in test_audio_files:
        with wave.open(audio_file, 'rb') as wf:
            sr = wf.getframerate()
        vector = extract_features(audio=audio_file, rate=sr)

        gmm = models[0]
        prob = gmm.predict(vector)
        print(prob)



test_model()

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
