Main articles being used as reference:
https://www.thepythoncode.com/article/speech-recognition-using-huggingface-transformers-in-python

https://github.com/facebookresearch/fairseq/blob/main/examples/wav2vec/README.md

http://www.openslr.org/12

https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/

https://pypi.org/project/noisereduce/

https://jmvalin.ca/demo/rnnoise/


In [15]:
# Imports 
from transformers import *
import torch
import soundfile as sf
# import librosa
import os
import torchaudio
import noisereduce as nr
from scipy.io import wavfile
from os import listdir, walk
from os.path import isfile, join
from pathlib import PurePath
from pydub import AudioSegment
import shutil
from jiwer import wer
import accelerate
import pandas as pd



In [16]:
# Preprocessor and model weights
model_name = "facebook/wav2vec2-base-960h" # 360MB
# model_name = "facebook/wav2vec2-large-960h-lv60-self" # 1.18GB

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# processor = Wav2Vec2Processor.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
# model = Wav2Vec2ForCTC.from_pretrained(model_name)
org_model = Wav2Vec2ForCTC.from_pretrained(model_name)
org_model.to(device)

loading configuration file preprocessor_config.json from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

loading configuration file config.json from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\config.json
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base-960h",
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (4): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)

In [5]:
# looking at named parameters of wav2vec2 (not useful to finding hyperparameters)
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name, param.data)

In [13]:
# directory file path parameters
# root_dir = "C:/Users/jared/Downloads/test-clean/LibriSpeech/test-clean/"
new_root_dir = "../data/LibriSpeech/test-clean-wav/"
data_path = "../data/LibriSpeech/smol/"


In [7]:
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/16-122828-0002.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/30-4447-0004.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/7601-291468-0006.wav"
audio_path = "../data/raw_wav/clean.wav"

In [17]:
# define the prediction function which takes in the file path to a wav file and outputs the predicted words

def predict (model, device, path): 
    # preprocess audio & reduce noise
    rate, data = wavfile.read(path)
    # perform noise reduction
    reduced_noise = nr.reduce_noise(y=data, sr=rate, stationary=False, prop_decrease=0.9)
    reduced_file_path = "../data/reduced/test.wav"
    wavfile.write(reduced_file_path, rate, reduced_noise)

    # prep audio 

    # load our wav file
    speech, sr = torchaudio.load(reduced_file_path)
    # print(speech.shape)
    speech = torch.mean(speech, dim=0, keepdim=True)
    speech = speech.squeeze()
    sr, speech.shape
    # print(speech.shape)

    # resample from whatever the audio sampling rate to 16000
    resampler = torchaudio.transforms.Resample(sr, 16000)
    # print(type(resampler))
    speech = resampler(speech)
    speech.to(device)
    
    # tokenize our wav
    input_values = processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"].to(device)
    # input_values.shape

    # perform inference
    logits = model(input_values)["logits"]

    # use argmax to get the predicted IDs
    predicted_ids = torch.argmax(logits, dim=-1)

    # decode the IDs to text
    transcription = processor.decode(predicted_ids[0])
    return transcription.upper()


In [18]:
# define the prediction function which takes in the file path to a wav file and outputs the predicted words

def predict_no_nr (model, device, path): 
    # prep audio 

    # load our wav file
    speech, sr = torchaudio.load(path)
    # print(speech.shape)
    speech = torch.mean(speech, dim=0, keepdim=True)
    speech = speech.squeeze()
    sr, speech.shape
    # print(speech.shape)

    # resample from whatever the audio sampling rate to 16000
    resampler = torchaudio.transforms.Resample(sr, 16000)
    # print(type(resampler))
    speech = resampler(speech)
    speech.to(device)

    
    # tokenize our wav
    input_values = processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"].to(device)
    # input_values.shape

    # perform inference
    logits = model(input_values)["logits"]

    # use argmax to get the predicted IDs
    predicted_ids = torch.argmax(logits, dim=-1)

    # decode the IDs to text
    transcription = processor.decode(predicted_ids[0])
    return transcription.upper()


In [19]:
# testing predict
test_path = "C:\\Users\\jared\\Documents\\GitHub\\Subtitle-AI\\data\\raw_wav\\assets_fish.wav"
print(predict(org_model, device, test_path))
predict_no_nr(org_model, device, test_path)


I KNOW THE HUMAN BEING AND FISHED AND CO EXIST PEACEFULLY


'I KNOW THE HUMAN BEING AND FISH CAN COEXIST PEACEFULLY'

In [20]:
# testing function on LibriSpeech testing data


# evaluation function which will test the model on the testing data
def eval(model, device, dir):
    average_word_error_rate = 0
    num_sents = 0
    # dfs approach to read into each of the directories
    for (dirpath, dirnames, filenames) in walk(dir):
        new_dir = dirpath.replace(new_root_dir, "")
        # case where we have reached directory of audio files
        if(dirnames == []):
            # print("currently at: " + dirpath)
            # print(filenames)
            # print()

            # get the answer key for the words (transcript)
            trans_path = ""
            for fname in filenames:
                if "txt" in fname:
                    trans = fname
            # print("transcript file location is: " + trans)
            
            # from the transcript file, iterate over each line
            trans_lines = open(dirpath + "/" + trans, 'r').readlines()
            for line in trans_lines:
                words = line.split(" ", 1)
                # words[0] is the audio file name
                # call the model function on the audio file
                res = predict(model, device, dirpath + "/" + words[0] + ".wav")
                # words[1:] are the actual transcript (correct owrds)
                # print("words[0]: " + words[0])
                # print("words[1]: " + words[1])
                # print("res: " + str(res))
                # compare the two sentences
                word_error_rate = wer(words[1], res)
                average_word_error_rate += word_error_rate
                num_sents += 1
                # print("word error rate: " + str(word_error_rate))
        # else:
        #     for dname in dirnames:
        #         if not os.path.exists(new_root_dir + new_dir + "/" + dname):
        #             os.mkdir(new_root_dir + new_dir + "/" + dname)
        #     print("dirnames: " + str(dirnames))
    return (average_word_error_rate/num_sents)
print("done!")



done!


In [21]:
eval(org_model, device, data_path)

0.03921664633950245

In [22]:
# Hyperparameters

hidden_size = [384, 768, 1536]
num_attention_heads = [8, 12, 16]
conv_dim = [(256, 256, 256, 256, 256, 256, 256), (768, 768, 768, 768, 768, 768, 768)]
intermediate_size = [1024, 2048, 3072, 4096]

attention_dropout=[0.1, 0.3, 0.5]
hidden_dropout=[0.1, 0.3, 0.5]
feat_proj_dropout=[0.0, 0.2, 0.4]
mask_time_prob=[0.05, 0.3, 0.7]

In [19]:
# evaluation of different hyperparameters

# attention_dropout=[0.1, 0.11, 0.12, 0.099, 0.098]
# hidden_dropout=[0.1, 0.11, 0.12, 0.099, 0.098]
# feat_proj_dropout=[0.0, 0.01, 0.02, 0.03, 0.04, 0.05]
# mask_time_prob=[0.05, 0.04, 0.045, 0.055, 0.06]

# n_trials=len(attention_dropout)*len(hidden_dropout)*len(feat_proj_dropout)*len(mask_time_prob) 
n_trials = len(hidden_size) * len(num_attention_heads) * len(conv_dim) * len(intermediate_size)
current_n = 1
index = 1
print(n_trials)
 
 
df = pd.DataFrame(columns = ['attention_dropout','hidden_droupout','feat_proj_dropout','mask_time_prob',"WER"])
model_name = "facebook/wav2vec2-base-960h" 

for a in attention_dropout:
    for b in hidden_dropout:
        for c in feat_proj_dropout:
            for d in mask_time_prob:
# for a in hidden_size:
#     for b in num_attention_heads:
#         for c in conv_dim:
#             for d in intermediate_size:
                    print(a,b,c,d)
                    print('Currently working on ', current_n, ' of the ', n_trials,' total trials')
                    print('Progress: ',(current_n/n_trials)*100, '%')
                    configuration = Wav2Vec2Config(
                        attention_dropout=a,
                        hidden_dropout=b,
                        feat_proj_dropout=c,
                        mask_time_prob=d,
                        layerdrop=1,
                        ctc_loss_reduction="mean", 
                        vocab_size=len(processor.tokenizer))
                    # configuration = Wav2Vec2Config(
                    #     hidden_size=a,
                    #     num_attention_heads=b,
                    #     conv_dim=c,
                    #     intermediate_size=d,
                    # )
                    # model = Wav2Vec2Model(configuration)
                    model = Wav2Vec2ForCTC .from_pretrained(model_name, config = configuration)
                    model.to(device)
                    score = eval(model, device, data_path)

                    df.loc[index]=[a,b,c,d,score]
                    index = index+1
                    current_n = current_n+1
                    print(current_n)
df



72
0.1 0.1 0.0 0.05
Currently working on  1  of the  72  total trials
Progress:  1.3888888888888888 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2
0.1 0.1 0.0 0.3
Currently working on  2  of the  72  total trials
Progress:  2.7777777777777777 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3
0.1 0.1 0.0 0.7
Currently working on  3  of the  72  total trials
Progress:  4.166666666666666 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4
0.1 0.1 0.2 0.05
Currently working on  4  of the  72  total trials
Progress:  5.555555555555555 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


5
0.1 0.1 0.2 0.3
Currently working on  5  of the  72  total trials
Progress:  6.944444444444445 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


6
0.1 0.1 0.2 0.7
Currently working on  6  of the  72  total trials
Progress:  8.333333333333332 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


7
0.1 0.1 0.4 0.05
Currently working on  7  of the  72  total trials
Progress:  9.722222222222223 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


8
0.1 0.1 0.4 0.3
Currently working on  8  of the  72  total trials
Progress:  11.11111111111111 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


9
0.1 0.1 0.4 0.7
Currently working on  9  of the  72  total trials
Progress:  12.5 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


10
0.1 0.3 0.0 0.05
Currently working on  10  of the  72  total trials
Progress:  13.88888888888889 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


11
0.1 0.3 0.0 0.3
Currently working on  11  of the  72  total trials
Progress:  15.277777777777779 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


12
0.1 0.3 0.0 0.7
Currently working on  12  of the  72  total trials
Progress:  16.666666666666664 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


13
0.1 0.3 0.2 0.05
Currently working on  13  of the  72  total trials
Progress:  18.055555555555554 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


14
0.1 0.3 0.2 0.3
Currently working on  14  of the  72  total trials
Progress:  19.444444444444446 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


15
0.1 0.3 0.2 0.7
Currently working on  15  of the  72  total trials
Progress:  20.833333333333336 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


16
0.1 0.3 0.4 0.05
Currently working on  16  of the  72  total trials
Progress:  22.22222222222222 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


17
0.1 0.3 0.4 0.3
Currently working on  17  of the  72  total trials
Progress:  23.61111111111111 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


18
0.1 0.3 0.4 0.7
Currently working on  18  of the  72  total trials
Progress:  25.0 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


19
0.1 0.5 0.0 0.05
Currently working on  19  of the  72  total trials
Progress:  26.38888888888889 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


20
0.1 0.5 0.0 0.3
Currently working on  20  of the  72  total trials
Progress:  27.77777777777778 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


21
0.1 0.5 0.0 0.7
Currently working on  21  of the  72  total trials
Progress:  29.166666666666668 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


22
0.1 0.5 0.2 0.05
Currently working on  22  of the  72  total trials
Progress:  30.555555555555557 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


23
0.1 0.5 0.2 0.3
Currently working on  23  of the  72  total trials
Progress:  31.944444444444443 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


24
0.1 0.5 0.2 0.7
Currently working on  24  of the  72  total trials
Progress:  33.33333333333333 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


25
0.1 0.5 0.4 0.05
Currently working on  25  of the  72  total trials
Progress:  34.72222222222222 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


26
0.1 0.5 0.4 0.3
Currently working on  26  of the  72  total trials
Progress:  36.11111111111111 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


27
0.1 0.5 0.4 0.7
Currently working on  27  of the  72  total trials
Progress:  37.5 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


28
0.3 0.1 0.0 0.05
Currently working on  28  of the  72  total trials
Progress:  38.88888888888889 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


29
0.3 0.1 0.0 0.3
Currently working on  29  of the  72  total trials
Progress:  40.27777777777778 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


30
0.3 0.1 0.0 0.7
Currently working on  30  of the  72  total trials
Progress:  41.66666666666667 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


31
0.3 0.1 0.2 0.05
Currently working on  31  of the  72  total trials
Progress:  43.05555555555556 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


32
0.3 0.1 0.2 0.3
Currently working on  32  of the  72  total trials
Progress:  44.44444444444444 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


33
0.3 0.1 0.2 0.7
Currently working on  33  of the  72  total trials
Progress:  45.83333333333333 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [None]:
df.sort_values(by = ["WER"])

Unnamed: 0,attention_dropout,hidden_droupout,feat_proj_dropout,mask_time_prob,WER
1,0.1,0.1,0.0,0.05,0.039217
58,0.5,0.1,0.2,0.05,0.039217
57,0.5,0.1,0.0,0.70,0.039217
56,0.5,0.1,0.0,0.30,0.039217
55,0.5,0.1,0.0,0.05,0.039217
...,...,...,...,...,...
24,0.1,0.5,0.2,0.70,0.039217
23,0.1,0.5,0.2,0.30,0.039217
22,0.1,0.5,0.2,0.05,0.039217
40,0.3,0.3,0.2,0.05,0.039217


In [None]:
eval(model, device, data_path)

0.03921664633950245

In [23]:
# evaluation of different hyperparameters

attention_dropout=[0.1]
hidden_dropout=[0.1, 0, 1]
feat_proj_dropout=[0.0]
mask_time_prob=[0.05]

n_trials=len(attention_dropout)*len(hidden_dropout)*len(feat_proj_dropout)*len(mask_time_prob) 
# n_trials = len(hidden_size) * len(num_attention_heads) * len(conv_dim) * len(intermediate_size)
current_n = 1
index = 1
print(n_trials)
 
 
df_test = pd.DataFrame(columns = ['attention_dropout','hidden_droupout','feat_proj_dropout','mask_time_prob',"WER"])
model_name = "facebook/wav2vec2-base-960h" 

for a in attention_dropout:
    for b in hidden_dropout:
        for c in feat_proj_dropout:
            for d in mask_time_prob:
# for a in hidden_size:
#     for b in num_attention_heads:
#         for c in conv_dim:
#             for d in intermediate_size:
                    print(a,b,c,d)
                    print('Currently working on ', current_n, ' of the ', n_trials,' total trials')
                    print('Progress: ',(current_n/n_trials)*100, '%')
                    configuration = Wav2Vec2Config(
                        attention_dropout=a,
                        hidden_dropout=b,
                        feat_proj_dropout=c,
                        mask_time_prob=d,
                        layerdrop=1,
                        ctc_loss_reduction="mean", 
                        vocab_size=len(processor.tokenizer))
                    # configuration = Wav2Vec2Config(
                    #     hidden_size=a,
                    #     num_attention_heads=b,
                    #     conv_dim=c,
                    #     intermediate_size=d,
                    # )
                    # model = Wav2Vec2Model(configuration)
                    model = Wav2Vec2ForCTC.from_pretrained(model_name, config = configuration)
                    model.to(device)
                    # score = eval(model, device, new_root_dir)
                    score = eval(model, device, data_path)
                    print(score)
                    df_test.loc[index]=[a,b,c,d,score]
                    index = index+1
                    current_n = current_n+1
                    print(current_n)
df_test



3
0.1 0.1 0.0 0.05
Currently working on  1  of the  3  total trials
Progress:  33.33333333333333 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.03921664633950245
2
0.1 0 0.0 0.05
Currently working on  2  of the  3  total trials
Progress:  66.66666666666666 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.03921664633950245
3
0.1 1 0.0 0.05
Currently working on  3  of the  3  total trials
Progress:  100.0 %


loading weights file pytorch_model.bin from cache at C:\Users\jared/.cache\huggingface\hub\models--facebook--wav2vec2-base-960h\snapshots\22aad52d435eb6dbaf354bdad9b0da84ce7d6156\pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.03921664633950245
4


Unnamed: 0,attention_dropout,hidden_droupout,feat_proj_dropout,mask_time_prob,WER
1,0.1,0.1,0.0,0.05,0.039217
2,0.1,0.0,0.0,0.05,0.039217
3,0.1,1.0,0.0,0.05,0.039217


In [21]:
df_test.sort_values(by = ["WER"])

Unnamed: 0,attention_dropout,hidden_droupout,feat_proj_dropout,mask_time_prob,WER
1,0.1,0.1,0.0,0.05,0.041336
2,0.1,0.0,0.0,0.05,0.041336
3,0.1,1.0,0.0,0.05,0.041336


In [23]:
df_test.sort_values(by=["WER"])

Unnamed: 0,attention_dropout,hidden_droupout,feat_proj_dropout,mask_time_prob,WER
1,0.1,0.1,0.0,0.05,0.041336
2,0.1,0.0,0.0,0.05,0.041336
3,0.1,1.0,0.0,0.05,0.041336


In [24]:
# from datasets import load_dataset
# from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# import torch
# from jiwer import wer


# librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")

# model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# def map_to_pred(batch):
#     input_values = processor(batch["audio"]["array"], return_tensors="pt", padding="longest").input_values
#     with torch.no_grad():
#         logits = model(input_values.to("cuda")).logits

#     predicted_ids = torch.argmax(logits, dim=-1)
#     transcription = processor.batch_decode(predicted_ids)
#     batch["transcription"] = transcription
#     return batch

# result = librispeech_eval.map(map_to_pred, batched=True, batch_size=1, remove_columns=["audio"])

# print("WER:", wer(result["text"], result["transcription"]))

Downloading and preparing dataset librispeech_asr/clean to C:/Users/jared/.cache/huggingface/datasets/librispeech_asr/clean/2.1.0/cff5df6e7955c80a67f80e27e7e655de71c689e2d2364bece785b972acb37fe7...


Downloading data files:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/338M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.39G [00:00<?, ?B/s]

KeyboardInterrupt: 