Main articles being used as reference:
https://www.thepythoncode.com/article/speech-recognition-using-huggingface-transformers-in-python

https://github.com/facebookresearch/fairseq/blob/main/examples/wav2vec/README.md

http://www.openslr.org/12

https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/

https://pypi.org/project/noisereduce/

https://jmvalin.ca/demo/rnnoise/


In [37]:
# Imports 
from transformers import *
import torch
import soundfile as sf
# import librosa
import os
import torchaudio
import noisereduce as nr
from scipy.io import wavfile
from os import listdir, walk
from os.path import isfile, join
from pathlib import PurePath
from pydub import AudioSegment
import shutil



In [38]:
# Preprocessor and model weights
model_name = "facebook/wav2vec2-base-960h" # 360MB
# model_name = "facebook/wav2vec2-large-960h-lv60-self" # 1.18GB

processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

loading configuration file preprocessor_config.json from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

loading configuration file config.json from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\config.json
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base-960h",
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size

In [None]:
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/16-122828-0002.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/30-4447-0004.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/7601-291468-0006.wav"
audio_path = "../data/raw_wav/clean.wav"

In [None]:
# preprocess audio & reduce noise
rate, data = wavfile.read(audio_path)
# perform noise reduction
reduced_noise = nr.reduce_noise(y=data, sr=rate, stationary=False, prop_decrease=0.9)
print(len(reduced_noise))
reduced_file_path = "../data/reduced/test.wav"
wavfile.write(reduced_file_path, rate, reduced_noise)

In [None]:
# prep audio 

# load our wav file
# speech, sr = torchaudio.load("../data/machine-learning_speech-recognition_30-4447-0004.wav")
speech, sr = torchaudio.load(audio_path)
print(speech.shape)
speech = torch.mean(speech, dim=0, keepdim=True)
speech = speech.squeeze()
# or using librosa
# speech, sr = librosa.load(audio_file, sr=16000)
sr, speech.shape
print(speech.shape)

In [None]:
# resample from whatever the audio sampling rate to 16000
resampler = torchaudio.transforms.Resample(sr, 16000)
speech = resampler(speech)
speech.shape

In [None]:
# tokenize our wav
input_values = processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"]
input_values.shape

In [None]:
# perform inference
logits = model(input_values)["logits"]
logits.shape

In [None]:
# use argmax to get the predicted IDs
predicted_ids = torch.argmax(logits, dim=-1)
predicted_ids.shape

In [None]:
# decode the IDs to text
transcription = processor.decode(predicted_ids[0])
transcription.lower()


In [39]:
# define the prediction function which takes in the file path to a wav file and outputs the predicted words

def predict (path): 
    # preprocess audio & reduce noise
    rate, data = wavfile.read(path)
    # perform noise reduction
    reduced_noise = nr.reduce_noise(y=data, sr=rate, stationary=False, prop_decrease=0.9)
    print(len(reduced_noise))
    reduced_file_path = "../data/reduced/test.wav"
    wavfile.write(reduced_file_path, rate, reduced_noise)

    # prep audio 

    # load our wav file
    speech, sr = torchaudio.load(reduced_file_path)
    # print(speech.shape)
    speech = torch.mean(speech, dim=0, keepdim=True)
    speech = speech.squeeze()
    sr, speech.shape
    # print(speech.shape)

    # resample from whatever the audio sampling rate to 16000
    resampler = torchaudio.transforms.Resample(sr, 16000)
    speech = resampler(speech)
    
    # tokenize our wav
    input_values = processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"]
    input_values.shape

    # perform inference
    logits = model(input_values)["logits"]

    # use argmax to get the predicted IDs
    predicted_ids = torch.argmax(logits, dim=-1)

    # decode the IDs to text
    transcription = processor.decode(predicted_ids[0])
    return transcription.upper()


In [40]:
root_dir = "C:/Users/jared/Downloads/test-clean/LibriSpeech/test-clean/"
new_root_dir = "../data/LibriSpeech/test-clean-wav/"

In [43]:
# testing function on LibriSpeech testing data


# evaluation function which will test the model on the testing data
def eval(dir):
    word_err_count = 0
    word_count = 0
    # dfs approach to read into each of the directories
    for (dirpath, dirnames, filenames) in walk(dir):
        # case where we have reached directory of audio files
        if(dirnames == []):
            print("currently at: " + dirpath)
            print(filenames)
            print()

            # get the answer key for the words (transcript)
            trans_path = ""
            for fname in filenames:
                if "txt" in fname:
                    trans = fname
            print("transcript file location is: " + trans)
            
            # from the transcript file, iterate over each line
            trans_lines = open(dirpath + "/" + trans, 'r').readlines()
            for line in trans_lines:
                words = line.split()
                # words[0] is the audio file name
                # call the model function on the audio file
                res = predict(dirpath + "/" + words[0] + ".wav").split()
                # words[1:] are the actual transcript (correct owrds)
                print("words[0]: " + words[0])
                print("words[1:]: " + str(words[1:]))
                print("res: " + str(res))
                # compare the two lists of words
                for i, word in enumerate(words[1:]):
                    word_count += 1
                    if(word != res[i]):
                        word_err_count += 1
                break

            break
        else:
            for dname in dirnames:
                if not os.path.exists(new_root_dir + dname):
                    os.mkdir(new_root_dir + dname)
            print("dirnames: " + str(dirnames))
print("done!")

eval(new_root_dir)

done!
dirnames: ['1089', '1188', '121', '1221', '1284', '1320', '134686', '134691', '1580', '1995', '2094', '2300', '237', '260', '2830', '2961', '3570', '3575', '3729', '4077', '4446', '4507', '4970', '4992', '5105', '5142', '5639', '5683', '61', '672', '6829', '6930', '7021', '7127', '7176', '7729', '8224', '8230', '8455', '8463', '8555', '908']
dirnames: ['134686', '134691']
currently at: ../data/LibriSpeech/test-clean-wav/1089\134686
['1089-134686-0000.wav', '1089-134686-0001.wav', '1089-134686-0002.wav', '1089-134686-0003.wav', '1089-134686-0004.wav', '1089-134686-0005.wav', '1089-134686-0006.wav', '1089-134686-0007.wav', '1089-134686-0008.wav', '1089-134686-0009.wav', '1089-134686-0010.wav', '1089-134686-0011.wav', '1089-134686-0012.wav', '1089-134686-0013.wav', '1089-134686-0014.wav', '1089-134686-0015.wav', '1089-134686-0016.wav', '1089-134686-0017.wav', '1089-134686-0018.wav', '1089-134686-0019.wav', '1089-134686-0020.wav', '1089-134686-0021.wav', '1089-134686-0022.wav', '1089

In [28]:
# conversion script to convert all flac audio files into wav form

for (dirpath, dirnames, filenames) in walk(root_dir):
        # case where we have reached directory of audio files
        new_dir = dirpath.replace(root_dir, "")
        if(dirnames == []):
            print("currently at: " + dirpath)
            print(filenames)
            print()
            # convert audio files into wav form
            for fname in filenames:
                file_path = PurePath(dirpath + "\\" + fname)
                print(file_path)
                
                print("newdir: " + new_dir)

                if ("flac" in fname):
                    flac_tmp_audio_data = AudioSegment.from_file(file_path, file_path.suffix[1:])
                    print(file_path.name.replace(file_path.suffix, "") + ".wav")

                    if not os.path.exists(new_root_dir + new_dir):
                        os.mkdir(new_root_dir + new_dir)
                    
                    new_path = new_root_dir + new_dir + "/" + file_path.name.replace(file_path.suffix, "") + ".wav"
                    print(new_path)
                    flac_tmp_audio_data.export(new_path, format="wav")
                else: 
                    # this should be the text file with the answers, so add it to the directory
                    shutil.copy(file_path, new_root_dir + new_dir + "/" + file_path.name)
        else:
            for dname in dirnames:
                if not os.path.exists(new_root_dir + new_dir + "/" + dname):
                    os.mkdir(new_root_dir + new_dir + "/" + dname)
            print("dirnames: " + str(dirnames))
print("done!")

dirnames: ['1089', '1188', '121', '1221', '1284', '1320', '1580', '1995', '2094', '2300', '237', '260', '2830', '2961', '3570', '3575', '3729', '4077', '4446', '4507', '4970', '4992', '5105', '5142', '5639', '5683', '61', '672', '6829', '6930', '7021', '7127', '7176', '7729', '8224', '8230', '8455', '8463', '8555', '908']
dirnames: ['134686', '134691']
currently at: C:/Users/jared/Downloads/test-clean/LibriSpeech/test-clean/1089\134686
['1089-134686-0000.flac', '1089-134686-0001.flac', '1089-134686-0002.flac', '1089-134686-0003.flac', '1089-134686-0004.flac', '1089-134686-0005.flac', '1089-134686-0006.flac', '1089-134686-0007.flac', '1089-134686-0008.flac', '1089-134686-0009.flac', '1089-134686-0010.flac', '1089-134686-0011.flac', '1089-134686-0012.flac', '1089-134686-0013.flac', '1089-134686-0014.flac', '1089-134686-0015.flac', '1089-134686-0016.flac', '1089-134686-0017.flac', '1089-134686-0018.flac', '1089-134686-0019.flac', '1089-134686-0020.flac', '1089-134686-0021.flac', '1089-134