In [3]:
# Import Library
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC, Wav2Vec2Processor
import speech_recognition as sr
import torch
import gc
import librosa
import os
import numpy as np
import onnxruntime as rt
import time

In [4]:
# Necessary Variable
# Shared constants for the audio feature pipeline defined in the next cell.

# Sampling rate (Hz) used when loading audio and when calling the ASR processor.
SAMPLE_RATE = 16_000
# n_fft used for the STFT and for building the mel filter bank.
FFT_SIZE = 2048
# hop_length used for the STFT.
HOP_SIZE = 512
# Number of mel bands in the filter bank.
N_MELS = 26
# Number of MFCC coefficients requested (n_mfcc).
MFCC_BINS = 13
# Number of frames the MFCC matrix is padded/truncated to.
MAX_LEN_PAD = 32
# Last dimension of the classifier input: MFCC, delta and delta-delta planes.
CHANNELS = 3

dataset_directory = "Audio Classification/AudioClassification Dataset"
# Entries of the dataset directory; indexed later with the classifier's argmax
# to turn a prediction into a speaker name.
# NOTE(review): os.listdir returns entries in arbitrary order and includes any
# stray files - confirm this order matches the class indices used in training.
labels = os.listdir(dataset_directory)

In [5]:
labels

['Adika Rajendra Haris', 'Banyu Ontoseno', 'Gesang Budiono']

In [3]:
# Necessary Function

def resample_audio(path):
    """Load an audio file at SAMPLE_RATE and force it to SAMPLE_RATE samples.

    Args:
        path: Audio file location, passed straight to ``librosa.load``.

    Returns:
        The loaded samples, zero-padded at the end when the clip has fewer
        than SAMPLE_RATE samples (i.e. is shorter than one second) and
        truncated to the first SAMPLE_RATE samples otherwise.
    """
    # The returned rate is discarded; binding it to `_` (not `sr`) avoids
    # shadowing the `speech_recognition` alias imported as `sr`.
    audio, _ = librosa.load(path, sr=SAMPLE_RATE)

    # Pad against SAMPLE_RATE itself rather than a literal 16000 so the target
    # length cannot drift from the configured sampling rate.
    missing = SAMPLE_RATE - len(audio)
    if missing > 0:
        return np.pad(audio, (0, missing), "constant")
    return audio[:SAMPLE_RATE]

def stft(audio):
    """Return the power spectrogram (squared STFT magnitude) of ``audio``.

    The transform uses FFT_SIZE / HOP_SIZE with centered frames and
    constant (zero) edge padding.
    """
    spectrum = librosa.stft(
        y=audio,
        n_fft=FFT_SIZE,
        hop_length=HOP_SIZE,
        center=True,
        pad_mode='constant',
    )
    magnitude = np.abs(spectrum)
    return magnitude**2

def mel_frequency(spectrogram):
    """Project a power spectrogram onto N_MELS mel bands and convert to dB.

    The dB conversion is referenced to the maximum value of the mel
    spectrogram (``ref=np.max``).
    """
    filter_bank = librosa.filters.mel(sr=SAMPLE_RATE, n_fft=FFT_SIZE, n_mels=N_MELS)
    mel_power = np.dot(filter_bank, spectrogram)
    return librosa.power_to_db(mel_power, ref=np.max)

def mfcc(mel_freq):
    """Compute MFCC_BINS coefficients from a dB mel spectrogram.

    The result is forced to exactly MAX_LEN_PAD frames along axis 1:
    zero-padded on the right when shorter, truncated when longer.
    """
    coeffs = librosa.feature.mfcc(
        S=mel_freq,
        sr=SAMPLE_RATE,
        n_mfcc=MFCC_BINS,
        n_fft=FFT_SIZE,
        hop_length=HOP_SIZE,
        n_mels=N_MELS,
    )

    n_frames = coeffs.shape[1]
    if n_frames >= MAX_LEN_PAD:
        return coeffs[:, :MAX_LEN_PAD]

    # Too few frames: append zero columns up to the fixed width.
    return np.pad(coeffs, pad_width=((0, 0), (0, MAX_LEN_PAD - n_frames)))

def delta_mfcc(mfcc):
    """Return the first-order delta of ``mfcc`` via ``librosa.feature.delta``."""
    return librosa.feature.delta(mfcc, order=1)

def delta_delta_mfcc(mfcc):
    """Return the second-order delta of ``mfcc`` via ``librosa.feature.delta``."""
    second_order = librosa.feature.delta(mfcc, order=2)
    return second_order

def speaker_input_preprocessing(audio):
    """Build the speaker-classifier input tensor from a recording.

    Previously the ``audio`` argument was ignored and the function always
    re-read the hard-coded file "microphone-results.wav"; it now processes
    what the caller passes in.

    Args:
        audio: Either a waveform (array-like of samples, assumed to already be
            at SAMPLE_RATE - the caller is responsible for that) or a path
            (``str`` / ``os.PathLike``) to an audio file, which is loaded with
            ``resample_audio``.

    Returns:
        ``np.float32`` array of shape (1, MFCC_BINS, MAX_LEN_PAD, CHANNELS)
        whose last axis holds, in order, the MFCCs, their delta and their
        delta-delta.
    """
    if isinstance(audio, (str, os.PathLike)):
        waveform = resample_audio(audio)
    else:
        # Same fixed-length rule resample_audio applies after loading:
        # zero-pad short clips, keep only the first SAMPLE_RATE samples otherwise.
        waveform = np.asarray(audio, dtype=np.float32)
        missing = SAMPLE_RATE - len(waveform)
        if missing > 0:
            waveform = np.pad(waveform, (0, missing), "constant")
        else:
            waveform = waveform[:SAMPLE_RATE]

    mel_db = mel_frequency(stft(waveform))

    audio_speaker_mfcc = mfcc(mel_db)
    audio_speaker_delta = delta_mfcc(audio_speaker_mfcc)
    audio_speaker_delta_delta = delta_delta_mfcc(audio_speaker_mfcc)

    # Pre-allocating with the expected shape makes a feature-shape mismatch
    # fail loudly on assignment instead of silently producing a wrong tensor.
    audio_speaker_input = np.zeros((1, MFCC_BINS, MAX_LEN_PAD, CHANNELS), dtype=np.float32)
    audio_speaker_input[:, :, :, 0] = audio_speaker_mfcc
    audio_speaker_input[:, :, :, 1] = audio_speaker_delta
    audio_speaker_input[:, :, :, 2] = audio_speaker_delta_delta
    return audio_speaker_input

In [9]:
# Model Apply
# Loads the Wav2Vec2 CTC model, its plain and LM-backed processors, and the
# ONNX speaker classifier. All paths are relative to the notebook's working
# directory.

# Use the GPU when PyTorch can see one, otherwise load on CPU instead of
# failing at `.to("cuda")`.
model = Wav2Vec2ForCTC.from_pretrained(os.getcwd() + "/Automatic Speech Recognition/asr_skripsi_local_common_voice/checkpoint-2400").to("cuda" if torch.cuda.is_available() else "cpu")
processor = Wav2Vec2Processor.from_pretrained("Automatic Speech Recognition/asr_skripsi_local_common_voice/")
processorLM = Wav2Vec2ProcessorWithLM.from_pretrained("Automatic Speech Recognition/asr_LM_skripsi_local_common_voice", eos_token=None, bos_token=None)
# Providers are tried in order: CUDA first, CPU as the fallback when the CUDA
# provider is not available in this onnxruntime build.
model_onnx = rt.InferenceSession('Classification.simplified.onnx', providers=["CUDAExecutionProvider", "CPUExecutionProvider"])

In [11]:
# Initialize Speech Recognizer
# `listener` is the speech_recognition Recognizer used by the inference loop
# to calibrate against ambient noise and to capture a phrase from the microphone.
listener = sr.Recognizer()
# Let the recognizer adapt its energy threshold automatically instead of
# using a fixed value.
listener.dynamic_energy_threshold = True
# Fixed-threshold alternative, left disabled:
# listener.energy_threshold = 4500

In [13]:
# Inference
# Interactive loop: record a phrase, identify the speaker with the ONNX
# classifier, and - only when the speaker is recognised with enough
# confidence - transcribe the phrase with the Wav2Vec2 model + LM decoder.
# Interrupt the kernel (Ctrl-C) to leave the loop.

MAX_AUDIO_SECONDS = 5          # phrase limit while recording and maximum accepted clip length
SPEAKER_CONFIDENCE_MIN = 0.90  # minimum top classifier score to accept a speaker

# The ONNX input name does not change between iterations, so look it up once.
speaker_input_name = model_onnx.get_inputs()[0].name

while True:
    try:
        input("Press to Speak...")

        # Hold the microphone only while recording; it is released before
        # any model inference starts.
        with sr.Microphone() as mic:
            listener.adjust_for_ambient_noise(mic, duration=5)
            print("Listening... \n\n")
            recording = listener.listen(mic, phrase_time_limit=MAX_AUDIO_SECONDS)

        print("Create Wav... \n\n")
        with open("microphone-results.wav", "wb") as f:
            f.write(recording.get_wav_data())

        audio, _ = librosa.load("microphone-results.wav", sr=SAMPLE_RATE)
        audio_len = librosa.get_duration(y=audio, sr=SAMPLE_RATE)

        # Reject over-long clips before spending time on feature extraction.
        if audio_len > MAX_AUDIO_SECONDS:
            print("Audio Duration More Than 5 Secs")
            continue

        audio_speaker_input = speaker_input_preprocessing(audio)

        print("Speaker Predicting...\n")
        start_time = time.time()
        pred_speaker = model_onnx.run(None, {speaker_input_name: audio_speaker_input})[0]
        print("--- %s seconds ---" % (time.time() - start_time))

        # First (only) row of the batch: one score per entry in `labels`.
        speaker_scores = pred_speaker[0]
        if np.max(speaker_scores) < SPEAKER_CONFIDENCE_MIN:
            print("Speaker Unidentified")
            continue

        print("\n\nWords Predicting...\n")
        start_time = time.time()
        input_dict = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
        with torch.no_grad():
            # Send the inputs to whichever device the model was loaded on
            # rather than assuming "cuda".
            logits = model(input_dict.input_values.to(model.device)).logits
        transcriptionLM = processorLM.batch_decode(logits.cpu().detach().numpy()).text[0]
        print("--- %s seconds ---" % (time.time() - start_time))

        top_index = int(np.argmax(speaker_scores))
        print(f"\n\nSpeaker : {labels[top_index]}\nAudio Transcribe : {transcriptionLM}")

        # Drop per-utterance objects and release cached GPU memory between turns.
        del recording, audio, input_dict, logits, transcriptionLM, audio_speaker_input
        gc.collect()
        torch.cuda.empty_cache()

    except sr.UnknownValueError:
        # NOTE(review): UnknownValueError is normally raised by the
        # recognize_* methods, which are not called here - confirm whether
        # this handler is still needed. Kept for backward compatibility.
        listener = sr.Recognizer()
        continue
    except KeyboardInterrupt:
        # Clean way out of the otherwise endless loop.
        print("\nStopped.")
        break

Press to Speak... 


ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
Expression 'alsa_snd_pcm_hw_params_set_buffer_size_near( pcm, hwParams, &alsaBufferFrames )' failed in 'src/hostapi/alsa/pa_linux_alsa.c', line: 923
Expression 'alsa_snd_pcm_hw_params_set_buffer_size_near( pcm, hwParams, &alsaBufferFrames )' failed in 'src/hostapi/alsa/pa_linux_alsa.c', line: 923
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_conf

Listening... 


Create Wav... 


Speaker Predicting...

--- 0.016329288482666016 seconds ---


Words Predicting...

--- 0.35102295875549316 seconds ---


Speaker : Gesang
Audio Transcribe : satu dua tiga empat lima enam
