Main articles being used as reference:
https://www.thepythoncode.com/article/speech-recognition-using-huggingface-transformers-in-python

https://github.com/facebookresearch/fairseq/blob/main/examples/wav2vec/README.md

http://www.openslr.org/12

https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/

https://pypi.org/project/noisereduce/

https://jmvalin.ca/demo/rnnoise/


In [8]:
# Imports 
from transformers import *
import torch
import soundfile as sf
# import librosa
import os
import torchaudio
import noisereduce as nr
from scipy.io import wavfile
from os import listdir, walk
from os.path import isfile, join
from pathlib import PurePath
from pydub import AudioSegment
import shutil
from jiwer import wer
import accelerate




In [9]:
# Preprocessor and model weights
model_name = "facebook/wav2vec2-base-960h" # 360MB
# model_name = "facebook/wav2vec2-large-960h-lv60-self" # 1.18GB

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# processor = Wav2Vec2Processor.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
# model = Wav2Vec2ForCTC.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.to(device)

loading configuration file preprocessor_config.json from cache at C:\Users\lsolo/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}



KeyboardInterrupt: 

In [None]:
# directory file path parameters
# root_dir = "C:/Users/jared/Downloads/test-clean/LibriSpeech/test-clean/"
new_root_dir = "../data/LibriSpeech/test-clean-wav/"
data_path = "../data/LibriSpeech/smol/"


In [None]:
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/16-122828-0002.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/30-4447-0004.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/7601-291468-0006.wav"
audio_path = "../data/raw_wav/clean.wav"

In [None]:
# define the prediction function which takes in the file path to a wav file and outputs the predicted words

def predict (model, device, path): 
    # preprocess audio & reduce noise
    rate, data = wavfile.read(path)
    # perform noise reduction
    reduced_noise = nr.reduce_noise(y=data, sr=rate, stationary=False, prop_decrease=0.9)
    reduced_file_path = "../data/reduced/test.wav"
    wavfile.write(reduced_file_path, rate, reduced_noise)

    # prep audio 

    # load our wav file
    speech, sr = torchaudio.load(reduced_file_path)
    # print(speech.shape)
    speech = torch.mean(speech, dim=0, keepdim=True)
    speech = speech.squeeze()
    sr, speech.shape
    # print(speech.shape)

    # resample from whatever the audio sampling rate to 16000
    resampler = torchaudio.transforms.Resample(sr, 16000)
    print(type(resampler))
    speech = resampler(speech)
    speech.to(device)

    
    # tokenize our wav
    input_values = processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"].to(device)
    # input_values.shape

    # perform inference
    logits = model(input_values)["logits"]

    # use argmax to get the predicted IDs
    predicted_ids = torch.argmax(logits, dim=-1)

    # decode the IDs to text
    transcription = processor.decode(predicted_ids[0])
    return transcription.upper()


In [None]:
# testing predict
test_path = "C:\\Users\\jared\\Documents\\GitHub\\Subtitle-AI\\data\\raw_wav\\assets_fish.wav"
print(predict(test_path))

TypeError: predict() missing 2 required positional arguments: 'device' and 'path'

In [None]:
# testing function on LibriSpeech testing data


# evaluation function which will test the model on the testing data
def eval(model, device, dir):
    average_word_error_rate = 0
    num_sents = 0
    # dfs approach to read into each of the directories
    for (dirpath, dirnames, filenames) in walk(dir):
        new_dir = dirpath.replace(new_root_dir, "")
        # case where we have reached directory of audio files
        if(dirnames == []):
            print("currently at: " + dirpath)
            print(filenames)
            print()

            # get the answer key for the words (transcript)
            trans_path = ""
            for fname in filenames:
                if "txt" in fname:
                    trans = fname
            print("transcript file location is: " + trans)
            
            # from the transcript file, iterate over each line
            trans_lines = open(dirpath + "/" + trans, 'r').readlines()
            for line in trans_lines:
                words = line.split(" ", 1)
                # words[0] is the audio file name
                # call the model function on the audio file
                res = predict(model, device, dirpath + "/" + words[0] + ".wav")
                # words[1:] are the actual transcript (correct owrds)
                print("words[0]: " + words[0])
                print("words[1]: " + words[1])
                print("res: " + str(res))
                # compare the two sentences
                word_error_rate = wer(words[1], res)
                average_word_error_rate += word_error_rate
                num_sents += 1
                print("word error rate: " + str(word_error_rate))
        # else:
        #     for dname in dirnames:
        #         if not os.path.exists(new_root_dir + new_dir + "/" + dname):
        #             os.mkdir(new_root_dir + new_dir + "/" + dname)
        #     print("dirnames: " + str(dirnames))
    return (average_word_error_rate/num_sents)
print("done!")



done!


In [None]:
# evaluation

eval(model, device, data_path)

import pandas as pd
 
vocab_size =[32]
conv_kernel=[10, 3, 3, 3, 3, 3, 3]
add_adapter=[True]
mask_time_length=[10]
pad_token_id=[0]
index = [0]
n_trials=len(vocab_size)*len(conv_kernel)*len(add_adapter)*len(mask_time_length)*len(pad_token_id)  
current_n = 1
print(n_trials)
 
 
df = pd.DataFrame(columns = ['vocab_size', 'conv_kernel','add_adapter','mask_time_length','pad_token_id',"score"])
model_name = "facebook/wav2vec2-base-960h" 

for a in vocab_size:
    for b in conv_kernel:
        for c in add_adapter:
            for d in mask_time_length:
                for e in pad_token_id:
                    print(a,b,c,d,e)
                    print('Currently working on ', current_n, ' of the ', n_trials,' total trials')
                    print('Progress: ',(current_n/n_trials)*100, '%')
                    model = Wav2Vec2ForCTC .from_pretrained(model_name)
                    clf = eval(vocab_size = a,
                                conv_kernel = b,
                                add_adapter = c,
                                mask_time_length = d,
                                pad_token_id = e,)
                    score = eval(model, device, data_path )
                    df.loc[index]=[a,b,c,d,e,score]
                    index = index+1
                    current_n = current_n+1
                    print(current_n)
df



currently at: ../data/LibriSpeech/smol/121\121726
['121-121726-0000.wav', '121-121726-0001.wav', '121-121726-0002.wav', '121-121726-0003.wav', '121-121726-0004.wav', '121-121726-0005.wav', '121-121726-0006.wav', '121-121726-0007.wav', '121-121726-0008.wav', '121-121726-0009.wav', '121-121726-0010.wav', '121-121726-0011.wav', '121-121726-0012.wav', '121-121726-0013.wav', '121-121726-0014.wav', '121-121726.trans.txt']

transcript file location is: 121-121726.trans.txt
<class 'torchaudio.transforms._transforms.Resample'>
words[0]: 121-121726-0000
words[1]: ALSO A POPULAR CONTRIVANCE WHEREBY LOVE MAKING MAY BE SUSPENDED BUT NOT STOPPED DURING THE PICNIC SEASON

res: ALSO A POPULAR CONTRIVANCE WHEREBY LOVE MAKING MAY BE SUSPENDED BUT NOT STOPPED DURING THE PICNIC SEASON
word error rate: 0.0
<class 'torchaudio.transforms._transforms.Resample'>
words[0]: 121-121726-0001
words[1]: HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE

res: HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
word e

KeyboardInterrupt: 

In [None]:
# conversion script to convert all flac audio files into wav form

# for (dirpath, dirnames, filenames) in walk(root_dir):
#         # case where we have reached directory of audio files
#         new_dir = dirpath.replace(root_dir, "")
#         if(dirnames == []):
#             print("currently at: " + dirpath)
#             print(filenames)
#             print()
#             # convert audio files into wav form
#             for fname in filenames:
#                 file_path = PurePath(dirpath + "\\" + fname)
#                 print(file_path)
                
#                 print("newdir: " + new_dir)

#                 if ("flac" in fname):
#                     flac_tmp_audio_data = AudioSegment.from_file(file_path, file_path.suffix[1:])
#                     print(file_path.name.replace(file_path.suffix, "") + ".wav")

#                     if not os.path.exists(new_root_dir + new_dir):
#                         os.mkdir(new_root_dir + new_dir)
                    
#                     new_path = new_root_dir + new_dir + "/" + file_path.name.replace(file_path.suffix, "") + ".wav"
#                     print(new_path)
#                     flac_tmp_audio_data.export(new_path, format="wav")
#                 else: 
#                     # this should be the text file with the answers, so add it to the directory
#                     shutil.copy(file_path, new_root_dir + new_dir + "/" + file_path.name)
#         else:
#             for dname in dirnames:
#                 if not os.path.exists(new_root_dir + new_dir + "/" + dname):
#                     os.mkdir(new_root_dir + new_dir + "/" + dname)
#             print("dirnames: " + str(dirnames))
# print("done!")

dirnames: ['1089', '1188', '121', '1221', '1284', '1320', '1580', '1995', '2094', '2300', '237', '260', '2830', '2961', '3570', '3575', '3729', '4077', '4446', '4507', '4970', '4992', '5105', '5142', '5639', '5683', '61', '672', '6829', '6930', '7021', '7127', '7176', '7729', '8224', '8230', '8455', '8463', '8555', '908']
dirnames: ['134686', '134691']
currently at: C:/Users/jared/Downloads/test-clean/LibriSpeech/test-clean/1089\134686
['1089-134686-0000.flac', '1089-134686-0001.flac', '1089-134686-0002.flac', '1089-134686-0003.flac', '1089-134686-0004.flac', '1089-134686-0005.flac', '1089-134686-0006.flac', '1089-134686-0007.flac', '1089-134686-0008.flac', '1089-134686-0009.flac', '1089-134686-0010.flac', '1089-134686-0011.flac', '1089-134686-0012.flac', '1089-134686-0013.flac', '1089-134686-0014.flac', '1089-134686-0015.flac', '1089-134686-0016.flac', '1089-134686-0017.flac', '1089-134686-0018.flac', '1089-134686-0019.flac', '1089-134686-0020.flac', '1089-134686-0021.flac', '1089-134