<a href="https://colab.research.google.com/github/karank85/speech-recognition/blob/main/Project2_DL_Speech_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Learning Implementation for HuggingFace

## Import models

In [4]:
import os

import numpy as np
from numpy import ndarray
import pandas as pd
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import librosa

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa.display
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn

from IPython.display import Audio

!pip install transformers datasets evaluate jiwer accelerate

from datasets import Dataset, DatasetDict,Features, Array3D, Value, concatenate_datasets

from transformers import AutoProcessor, AutoModelForCTC, TrainingArguments, Trainer

import evaluate

import numpy as np

import glob

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2

In [19]:
librosa.__version__

'0.10.1'

In [13]:
# Setup device-agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Prepare Dataset

### AudioDataset class

In [3]:
# Assumptions:
# - The transcription file is located in the same directory as the audio files.
class AudioDataset:
  """
  Class for loading and storing audio data.
  """

  def __init__(self):
    self.df = pd.DataFrame(columns=['id', 'path', 'transcription'])

  def load_transcriptions(self, directory_path: str) -> bool:
    """
    Load all transcriptions from a given directory, including subdirectories.
    Returns False if no transcription files were found, or if any failed to load.
    """
    transcriptions_path = glob.glob(
        f"{directory_path}/**/*.trans.txt",
        recursive=True
    )

    if len(transcriptions_path) == 0:
      return False

    for path in transcriptions_path:
      if not self.load_transcription_file(path):
        return False

    return True



  def load_transcription_file(self, file_path: str) -> bool:
    """
    Parse transcription file and records the audio ID - subtitle mapping.
    Returns False if the file could not be read.
    """
    with open(file_path, "r") as file:
      file_directory = os.path.dirname(file_path)

      lines = file.read().split("\n")
      for line in lines:
        if len(line.strip()) == 0:
          continue
        splitter = line.split(" ")
        file_name = splitter[0]
        file_content = ' '.join(splitter[1:])
        self.df.loc[len(self.df)] = {
            'id':file_name,
            'transcription':file_content,
            'path': f'{file_directory}/{file_name}.flac'
        }
      return True
    return False

  def keys(self):
    return iter(self.df['id'])

  def get(self, id: int):
    """
    Retrieve a dataframe row from ID.
    """
    return self.df.loc[self.df['id'] == id]

In [4]:
ds = AudioDataset()

### Extract our **dataset**

In [5]:
directories = ["103", "1034", "1040", "1069", "1081", "1088", "125", "1363", "1098", "163"]

for directory in directories:
  ds.load_transcriptions(f"/content/drive/MyDrive/cool-boy/datasets/{directory}")

In [6]:
audio_list = []
sampling_freq_list = []

for index, row in ds.df.iterrows():
  audio, sampling_freq = librosa.load(row['path'], sr=16_000)

  audio_list.append(audio)
  sampling_freq_list.append(sampling_freq)

ds.df['audio'] = audio_list
ds.df['sampling_freq'] = sampling_freq_list

In [7]:


custom_dataset = Dataset.from_pandas(ds.df)
custom_dataset

Dataset({
    features: ['id', 'path', 'transcription', 'audio', 'sampling_freq', '__index_level_0__'],
    num_rows: 1069
})

In [8]:
custom_dataset = custom_dataset.remove_columns(['path', 'id', '__index_level_0__'])
# transcription: str, audio: list[int], sampling_freq: int
custom_dataset

Dataset({
    features: ['transcription', 'audio', 'sampling_freq'],
    num_rows: 1069
})

In [9]:
custom_dataset = custom_dataset.train_test_split(test_size=0.2)
custom_dataset

DatasetDict({
    train: Dataset({
        features: ['transcription', 'audio', 'sampling_freq'],
        num_rows: 855
    })
    test: Dataset({
        features: ['transcription', 'audio', 'sampling_freq'],
        num_rows: 214
    })
})

In [14]:
# Making sure it's all uppercase characters as Wav2Vec Tokenizer is only trained
# on uppercase characters and we need to match the tokenizer's vocabulary.
all_upper = True
for item in custom_dataset["train"]:
  if not item["transcription"].isupper():
    all_upper = False

all_upper

True

## Set up DL Model

We'll start with Wave2Vec2.0 to explore how we can setup and train the model

### Import Processor

In [16]:
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

### Apply our Processor

In [17]:
def apply_processor(batch):
  batch = processor(batch["audio"], sampling_rate=batch["sampling_freq"], text=batch["transcription"])
  batch["input_length"] = len(batch["input_values"][0])
  return batch

encoded_datasets = custom_dataset.map(apply_processor, remove_columns=custom_dataset.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/855 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/214 [00:00<?, ? examples/s]

### Apply Data Collator

In [18]:
# Need to create a data collator to prepare batches of data suitable for training CTC loss-based models
# Pad text and labels to length of the longest element in its batch to make it uniform length

@dataclass
class DataCollatorCTCLossWithPadding:
  processor: AutoProcessor
  padding: Union[bool, str] = "longest" # pad to the longest sequence

  def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:

    #audio features
    input_features = [{"input_values": feature["input_values"][0]} for feature in features]
    # tokenized labels
    label_features = [{"input_ids": feature["labels"]} for feature in features]

    batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")

    labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")

    # replace padding with -100 to ignore loss correctly
    labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

    batch["labels"] = labels

    return batch

In [19]:
data_collator = DataCollatorCTCLossWithPadding(processor=processor, padding="longest")

### Set up Evaluation

In [85]:
wer_metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [21]:


def compute_metrics(pred):
    # Compute predicted labels
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred_str = processor.batch_decode(pred_ids)

    # Replace -100 with pad token ID in true labels
    true_labels = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)
    true_str = processor.batch_decode(true_labels, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=true_str)

    return {"wer": wer}


### Load in our model

In [22]:
model = AutoModelForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
model.to(device)

model.freeze_feature_extractor()



### Training our model with TrainingArguments and Trainer

In [24]:

# TEST TRAINING ARGUMENTS

training_args = TrainingArguments(
    output_dir="wav2vec_model_v2",  # Output directory to save model checkpoints and logs
    group_by_length=True,  # Group samples of roughly the same length together to minimize padding
    per_device_train_batch_size=4,  # Batch size per GPU/device during training
    evaluation_strategy="steps",  # Evaluate every `eval_steps`
    fp16=True,  # Use mixed precision training with automatic mixed precision scaler
    save_steps=100,  # Save model checkpoint every `save_steps`
    eval_steps=100,  # Evaluate model every `eval_steps`
    logging_steps=50,  # Log training metrics every `logging_steps`
    learning_rate=5e-5,  # Learning rate
    weight_decay=0.001,  # Weight decay to prevent overfitting
    warmup_steps=100,  # Warmup steps for learning rate scheduler
    save_total_limit=1,  # Limit the total number of saved checkpoints
    max_steps=2000,  # Maximum number of training steps
)

# Define Trainer
trainer = Trainer(
    model=model,  # Model to be trained
    args=training_args,  # Training arguments
    train_dataset=encoded_datasets["train"],  # Training dataset
    eval_dataset=encoded_datasets["test"],  # Evaluation dataset
    tokenizer=processor,  # Tokenizer for preprocessing inputs
    data_collator=data_collator,  # Data collator for batching and padding
    compute_metrics=compute_metrics,  # Function to compute evaluation metrics
)

# trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


### Model Engine

In [5]:
class ModelEngine:

  def __init__(self, model_name, custom_dataset):
    print(f"Preparing model: {model_name}")
    self.model_name = model_name
    self.processor = AutoProcessor.from_pretrained(self.model_name)
    self.data_collator = self.DataCollatorCTCLossWithPadding(processor=self.processor, padding="longest")
    # self.wer_metric = load_metric("wer")
    self.model = AutoModelForCTC.from_pretrained(
        self.model_name,
        ctc_loss_reduction="mean",
        pad_token_id=self.processor.tokenizer.pad_token_id,
    )
    self.encoded_datasets = custom_dataset.map(self.apply_processors, remove_columns=custom_dataset.column_names["train"], num_proc=1)

  def apply_processors(self, batch):
    batch = self.processor(batch["audio"], sampling_rate=batch["sampling_freq"], text=batch["transcription"])
    batch["input_length"] = len(batch["input_values"][0])
    return batch

  def compute_metrics(self, pred):

    wer_metric = evaluate.load("wer")

    # Compute predicted labels
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred_str = self.processor.batch_decode(pred_ids)

    # Replace -100 with pad token ID in true labels
    true_labels = np.where(pred.label_ids == -100, self.processor.tokenizer.pad_token_id, pred.label_ids)
    true_str = self.processor.batch_decode(true_labels, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=true_str)

    return {"wer": wer}

  @staticmethod
  def predict(model_checkpoint, path):

    audio, sampling_freq = librosa.load(path, sr=16_000)

    processor = AutoProcessor.from_pretrained(model_checkpoint, local_files_only=True)
    inputs = processor(audio, sampling_rate=sampling_freq, return_tensors="pt")

    model = AutoModelForCTC.from_pretrained(model_checkpoint)
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription


  def train(self, output_dir):

    print(f"Training model: {self.model_name}")

    self.model.to(device)

    self.model.freeze_feature_extractor()

    training_args = TrainingArguments(
      output_dir=output_dir,  # Output directory to save model checkpoints and logs
      group_by_length=True,  # Group samples of roughly the same length together to minimize padding
      per_device_train_batch_size=4,  # Batch size per GPU/device during training
      evaluation_strategy="steps",  # Evaluate every `eval_steps`
      fp16=True,  # Use mixed precision training with automatic mixed precision scaler
      save_steps=100,  # Save model checkpoint every `save_steps`
      eval_steps=100,  # Evaluate model every `eval_steps`
      logging_steps=50,  # Log training metrics every `logging_steps`
      learning_rate=5e-5,  # Learning rate
      weight_decay=0.001,  # Weight decay to prevent overfitting
      warmup_steps=100,  # Warmup steps for learning rate scheduler
      save_total_limit=1,  # Limit the total number of saved checkpoints
      max_steps=2000,  # Maximum number of training steps
    )

    # Define Trainer
    trainer = Trainer(
        model=self.model,  # Model to be trained
        args=training_args,  # Training arguments
        train_dataset=self.encoded_datasets["train"],  # Training dataset
        eval_dataset=self.encoded_datasets["test"],  # Evaluation dataset
        tokenizer=self.processor,  # Tokenizer for preprocessing inputs
        data_collator=self.data_collator,  # Data collator for batching and padding
        compute_metrics=self.compute_metrics,  # Function to compute evaluation metrics
    )

    trainer.train()

  # Prapre batches of data suitable for training CTC loss based models
  @dataclass
  class DataCollatorCTCLossWithPadding:
    processor: AutoProcessor
    padding: Union[bool, str] = "longest" # pad to the longest sequence for uniformity

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:

      #audio features
      input_features = [{"input_values": feature["input_values"][0]} for feature in features]
      # tokenized labels
      label_features = [{"input_ids": feature["labels"]} for feature in features]

      batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")

      labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")

      # replace padding with -100 to ignore loss correctly
      labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

      batch["labels"] = labels

      return batch


### Evaluate with state-of-the-art models

Wav2Vec2.0

In [83]:
wav2vec = ModelEngine("facebook/wav2vec2-base", custom_dataset)

Preparing model: facebook/wav2vec2-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/855 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

In [27]:
wav2vec.train("/content/drive/MyDrive/model/wav2vec-model")

Training model: facebook/wav2vec2-base


Step,Training Loss,Validation Loss,Wer
100,3.2346,2.97545,1.0
200,2.9122,2.902834,1.0
300,2.8785,2.889525,1.0
400,2.877,2.871866,1.0
500,2.87,2.871957,1.0
600,2.8619,2.872466,1.0
700,2.8603,2.852553,1.0
800,2.0418,1.538197,0.878921
900,1.0819,0.829652,0.565838
1000,0.7527,0.601541,0.464679


Checkpoint destination directory /content/drive/MyDrive/model/wav2vec-model/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Data2Vec

In [28]:
data2vec = ModelEngine("facebook/data2vec-audio-base-960h", custom_dataset)

Preparing model: facebook/data2vec-audio-base-960h


preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/373M [00:00<?, ?B/s]

Map:   0%|          | 0/855 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

In [29]:
data2vec.train("/content/drive/MyDrive/model/data2vec-model")

Training model: facebook/data2vec-audio-base-960h


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Wer
100,0.0383,0.008119,0.012998
200,0.0369,0.008228,0.011161
300,0.0358,0.00907,0.011585
400,0.039,0.009325,0.011726
500,0.0357,0.008585,0.01102
600,0.0326,0.008686,0.012433
700,0.03,0.009308,0.012998
800,0.0384,0.009179,0.014128
900,0.0341,0.009741,0.012857
1000,0.0269,0.010071,0.013563


Step,Training Loss,Validation Loss,Wer
100,0.0383,0.008119,0.012998
200,0.0369,0.008228,0.011161
300,0.0358,0.00907,0.011585
400,0.039,0.009325,0.011726
500,0.0357,0.008585,0.01102
600,0.0326,0.008686,0.012433
700,0.03,0.009308,0.012998
800,0.0384,0.009179,0.014128
900,0.0341,0.009741,0.012857
1000,0.0269,0.010071,0.013563


Checkpoint destination directory /content/drive/MyDrive/model/data2vec-model/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


HUBERT

In [11]:
hubert = ModelEngine("facebook/hubert-large-ls960-ft", custom_dataset)

Preparing model: facebook/hubert-large-ls960-ft


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertForCTC: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForCTC were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-

Map:   0%|          | 0/855 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

In [14]:
hubert.train("/content/drive/MyDrive/model/hubert-model")

Training model: facebook/hubert-large-ls960-ft


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Wer
100,0.0646,0.033645,0.013535
200,0.0556,0.02298,0.012723
300,0.0399,0.020947,0.015836
400,0.0402,0.021251,0.012994
500,0.0366,0.020181,0.012994
600,0.0392,0.020523,0.015024
700,0.0408,0.019416,0.012182
800,0.0305,0.019886,0.012723
900,0.0255,0.020626,0.013265
1000,0.0316,0.020058,0.013129


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 180.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 50.38 MiB is free. Process 6090 has 15.72 GiB memory in use. Of the allocated memory 14.99 GiB is allocated by PyTorch, and 341.77 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Inference

### Inference with sample data

In [32]:
audio, sampling_freq = librosa.load("/content/drive/MyDrive/198/126831/198-126831-0000.flac", sr=16_000)

Audio(data=audio, rate=sampling_freq)

In [44]:
ModelEngine.predict("/content/drive/MyDrive/model/data2vec-model/checkpoint-2000", "/content/drive/MyDrive/198/126831/198-126831-0000.flac")

['FRIDAY WAS A COMFORTABLE DAY IN THE HOUSEHOLD OF KING EVERYBODY WAS IN A GOOD HUMOUR THE STORY GIRL SPARKLED THROUGH SEVERAL TAILS THAT RANGED FROM THE AFFHRITES AND GINS OF EASTERN MYTH THROUGH THE PIPING DAYS OF CHIVALRY DOWN TO THE HOMELY ANECDOTES OF CARLILE ROOKDAY FOLKS']

### Live inference

In [2]:
!pip install torchaudio ipywebrtc
# Download a static FFmpeg build and add it to PATH.
exist = !which ffmpeg
if not exist:
  !curl https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz -o ffmpeg.tar.xz \
     && tar -xf ffmpeg.tar.xz && rm ffmpeg.tar.xz
  ffmdir = !find . -iname ffmpeg-*-static
  path = %env PATH
  path = path + ':' + ffmdir[0]
  %env PATH $path

Collecting ipywebrtc
  Downloading ipywebrtc-0.6.0-py2.py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.7/260.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.1->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.1->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.1->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from ipywebrtc import AudioRecorder, CameraStream
import torchaudio
from IPython.display import Audio

from google.colab import output
output.enable_custom_widget_manager()

In [6]:
camera = CameraStream(constraints={'audio': True,'video':False})
recorder = AudioRecorder(stream=camera)
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …

In [49]:
# Write to recording.webm
with open('recording.webm', 'wb') as f:
    f.write(recorder.audio.value)

# Translate recording.wemb to file.wav
!ffmpeg -i recording.webm -ac 1 -f wav file.wav -y -hide_banner -loglevel panic

In [50]:
hubert_result = ModelEngine.predict("/content/drive/MyDrive/model/hubert-model/checkpoint-2000", "file.wav")
wav2vec_result = ModelEngine.predict("/content/drive/MyDrive/model/wav2vec-model/checkpoint-2000", "file.wav")
data2vec_result = ModelEngine.predict("/content/drive/MyDrive/model/data2vec-model/checkpoint-2000", "file.wav")


print("hubert", hubert_result)
print("wav2vec", wav2vec_result)
print("data2vec", data2vec_result)

hubert ['MY AUDIOTISE AUDOTIS']
wav2vec ['A DEYOUTES ULD UTIS']
data2vec ['Y AUDIOTISE AUDOTIS']


## Data Augmentation for Audio

### Noise Injection

In [39]:

def manipulate(data, noise_factor):
    noise = np.random.randn(len(data))
    augmented_data = data + noise_factor * noise
    augmented_data = augmented_data.astype(type(data[0]))
    return augmented_data

### Shifting Time

In [40]:
def manipulate(data, sampling_rate, shift_max, shift_direction):
    shift = np.random.randint(sampling_rate * shift_max)
    if shift_direction == 'right':
        shift = -shift
    elif self.shift_direction == 'both':
        direction = np.random.randint(0, 2)
        if direction == 1:
            shift = -shift
    augmented_data = np.roll(data, shift)
    if shift > 0:
        augmented_data[:shift] = 0
    else:
        augmented_data[shift:] = 0
    return augmented_data

### Changing Pitch

In [41]:

def manipulate(data, sampling_rate, pitch_factor):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

### Changing Speed

In [42]:
def manipulate(data, speed_factor):
    return librosa.effects.time_stretch(data, speed_factor)

### DataAugmentation class

In [69]:
class DataAugmentation:
  def __init__(self, data):
    self.data = data

  def noise_inject(self, noise_factor):
    noise = np.random.randn(len(self.data))
    augmented_data = self.data + noise_factor * noise
    augmented_data = augmented_data.astype(type(self.data[0]))
    return augmented_data

  def shift_time(self, sampling_rate, shift_max, shift_direction):
    shift = np.random.randint(sampling_rate * shift_max)
    if shift_direction == 'right':
        shift = -shift
    elif shift_direction == 'both':
        direction = np.random.randint(0, 2)
        if direction == 1:
            shift = -shift
    augmented_data = np.roll(self.data, shift)
    if shift > 0:
        augmented_data[:shift] = 0
    else:
        augmented_data[shift:] = 0
    return augmented_data

  def change_pitch(self, sampling_rate, n_steps):
    return librosa.effects.pitch_shift(self.data, sr=sampling_rate, n_steps=n_steps)

  def time_stretch(self, speed_factor):
    return librosa.effects.time_stretch(self.data, rate=speed_factor)


### Apply Augmentation and combine it with our dataset

In [70]:
def apply_augmentation(example):
    audio_data = np.array(example['audio'])  # Convert audio list to numpy array
    sampling_rate = example['sampling_freq']  # Assuming 'sampling_freq' is the key for sampling rate in the dataset

    augmenter = DataAugmentation(audio_data)

    # Randomly select an augmentation method
    augmentation_method = np.random.choice(['noise_inject', 'shift_time', 'change_pitch', 'manipulate'])

    if augmentation_method == 'noise_inject':
        augmented_data = augmenter.noise_inject(noise_factor=0.05)
    elif augmentation_method == 'shift_time':
        augmented_data = augmenter.shift_time(sampling_rate, shift_max=1, shift_direction='both')
    elif augmentation_method == 'change_pitch':
        augmented_data = augmenter.change_pitch(sampling_rate, n_steps=2)
    else:  # augmentation_method == 'time_stretch'
        augmented_data = augmenter.time_stretch(speed_factor=1.5)

    return {'audio': augmented_data.tolist(), 'sampling_freq': sampling_rate, 'transcription': example['transcription']}

In [71]:
augmented_dataset = custom_dataset.map(apply_augmentation, remove_columns=custom_dataset.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/855 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/214 [00:00<?, ? examples/s]

In [78]:
combined_dataset = DatasetDict({
    "train": concatenate_datasets([custom_dataset["train"], augmented_dataset["train"]]),
    "test": concatenate_datasets([custom_dataset["test"], augmented_dataset["test"]])
})

### Evaluate Wav2Vec2.0 again

In [86]:
wav2vec.train("/content/drive/MyDrive/model/wav2vec-model-augmented")

Training model: facebook/wav2vec2-base


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Wer
100,2.9669,2.939397,1.0
200,2.8794,2.894377,1.0
300,2.8762,2.878135,1.0
400,2.8618,2.841547,1.0
500,1.8083,1.332047,0.803816
600,0.9036,0.711263,0.522544
700,0.6195,0.540901,0.44523
800,0.5243,0.443526,0.385724
900,0.4094,0.386218,0.354205
1000,0.3716,0.352798,0.319435
