<a href="https://colab.research.google.com/github/karank85/speech-recognition/blob/main/Project2_DL_Speech_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

import numpy as np
from numpy import ndarray
import pandas as pd
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import librosa

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa.display
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn

import glob

!pip install transformers datasets evaluate jiwer



In [2]:
librosa.__version__

'0.10.1'

In [3]:
# Setup device-agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Assumptions:
# - The transcription file is located in the same directory as the audio files.
class AudioDataset:
  """
  Class for loading and storing audio data.
  """

  def __init__(self):
    self.df = pd.DataFrame(columns=['id', 'path', 'transcription'])

  def load_transcriptions(self, directory_path: str) -> bool:
    """
    Load all transcriptions from a given directory, including subdirectories.
    Returns False if no transcription files were found, or if any failed to load.
    """
    transcriptions_path = glob.glob(
        f"{directory_path}/**/*.trans.txt",
        recursive=True
    )

    if len(transcriptions_path) == 0:
      return False

    for path in transcriptions_path:
      if not self.load_transcription_file(path):
        return False

    return True



  def load_transcription_file(self, file_path: str) -> bool:
    """
    Parse transcription file and records the audio ID - subtitle mapping.
    Returns False if the file could not be read.
    """
    with open(file_path, "r") as file:
      file_directory = os.path.dirname(file_path)

      lines = file.read().split("\n")
      for line in lines:
        if len(line.strip()) == 0:
          continue
        splitter = line.split(" ")
        file_name = splitter[0]
        file_content = ' '.join(splitter[1:])
        self.df.loc[len(self.df)] = {
            'id':file_name,
            'transcription':file_content,
            'path': f'{file_directory}/{file_name}.flac'
        }
      return True
    return False

  def keys(self):
    return iter(self.df['id'])

  def get(self, id: int):
    """
    Retrieve a dataframe row from ID.
    """
    return self.df.loc[self.df['id'] == id]

In [6]:
ds = AudioDataset()

In [7]:
ds.load_transcriptions("/content/drive/MyDrive/")

True

In [8]:
audio_list = []
sampling_freq_list = []

for index, row in ds.df.iterrows():
  audio, sampling_freq = librosa.load(row['path'], sr=16_000)

  audio_list.append(audio)
  sampling_freq_list.append(sampling_freq)

ds.df['audio'] = audio_list
ds.df['sampling_freq'] = sampling_freq_list

In [9]:
from datasets import Dataset, Features, Array3D, Value

custom_dataset = Dataset.from_pandas(ds.df)
custom_dataset

Dataset({
    features: ['id', 'path', 'transcription', 'audio', 'sampling_freq', '__index_level_0__'],
    num_rows: 76
})

In [10]:
custom_dataset = custom_dataset.remove_columns(['path', 'id', '__index_level_0__'])
# transcription: str, audio: list[int], sampling_freq: int
custom_dataset

Dataset({
    features: ['transcription', 'audio', 'sampling_freq'],
    num_rows: 76
})

In [11]:
custom_dataset = custom_dataset.train_test_split(test_size=0.2)
custom_dataset

DatasetDict({
    train: Dataset({
        features: ['transcription', 'audio', 'sampling_freq'],
        num_rows: 60
    })
    test: Dataset({
        features: ['transcription', 'audio', 'sampling_freq'],
        num_rows: 16
    })
})

In [12]:
# Making sure it's all uppercase characters as Wav2Vec Tokenizer is only trained
# on uppercase characters and we need to match the tokenizer's vocabulary.
for item in custom_dataset["train"]:
  print(item["transcription"])

THERE SHE FELL MISERABLY SHORT OF THE TRUE HEROIC HEIGHT AT PRESENT SHE DID NOT KNOW HER OWN POVERTY FOR SHE HAD NO LOVER TO PORTRAY SHE HAD REACHED THE AGE OF SEVENTEEN
FRENCH BY HER MOTHER HER PROFICIENCY IN EITHER WAS NOT REMARKABLE AND SHE SHIRKED HER LESSONS IN BOTH WHENEVER SHE COULD WHAT A STRANGE UNACCOUNTABLE CHARACTER FOR WITH ALL THESE SYMPTOMS OF PROFLIGACY AT TEN YEARS OLD
DARK LANK HAIR AND STRONG FEATURES SO MUCH FOR HER PERSON AND NOT LESS UNPROPITIOUS FOR HEROISM SEEMED HER MIND SHE WAS FOND OF ALL BOY'S PLAYS AND GREATLY PREFERRED CRICKET
THERE WAS NOT ONE FAMILY AMONG THEIR ACQUAINTANCE WHO HAD REARED AND SUPPORTED A BOY ACCIDENTALLY FOUND AT THEIR DOOR NOT ONE YOUNG MAN WHOSE ORIGIN WAS UNKNOWN HER FATHER HAD NO WARD AND THE SQUIRE OF THE PARISH NO CHILDREN
WHO OWNED THE CHIEF OF THE PROPERTY ABOUT FULLERTON THE VILLAGE IN WILTSHIRE WHERE THE MORLANDS LIVED WAS ORDERED TO BATH FOR THE BENEFIT OF A GOUTY CONSTITUTION AND HIS LADY A GOOD HUMOURED WOMAN FOND OF MISS MO

In [13]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [14]:
def apply_processor(batch):
  batch = processor(batch["audio"], sampling_rate=batch["sampling_freq"], text=batch["transcription"])
  batch["input_length"] = len(batch["input_values"][0])
  return batch

encoded_datasets = custom_dataset.map(apply_processor, remove_columns=custom_dataset.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/60 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/16 [00:00<?, ? examples/s]

In [15]:
# Need to create a data collator to prepare batches of data suitable for training CTC loss-based models
# Pad text and labels to length of the longest element in its batch to make it uniform length

@dataclass
class DataCollatorCTCLossWithPadding:
  processor: AutoProcessor
  padding: Union[bool, str] = "longest" # pad to the longest sequence

  def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:

    #audio features
    input_features = [{"input_values": feature["input_values"][0]} for feature in features]
    # tokenized labels
    label_features = [{"input_ids": feature["labels"]} for feature in features]

    batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")

    labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")

    # replace padding with -100 to ignore loss correctly
    labels = labels_batch["input_ids"].masked_fill(labels_batch.attentions_mask.ne(1), -100)

    batch["labels"] = labels

    return batch

In [16]:
data_collator = DataCollatorCTCLossWithPadding(processor=processor, padding="longest")

## Wave2Vec2.0

In [17]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

tokenizer = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import evaluate

wer = evaluate.load("wer")

In [19]:
import numpy as np

def compute_metrics(pred):
    # Compute predicted labels
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred_str = processor.batch_decode(pred_ids)

    # Replace -100 with pad token ID in true labels
    true_labels = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)
    true_str = processor.batch_decode(true_labels, group_tokens=False)

    # Compute Word Error Rate (WER)
    wer_score = wer.compute(predictions=pred_str, references=true_str)

    return {"wer": wer_score}


In [20]:
from transformers import AutoModelForCTC

model = AutoModelForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
!pip install -U accelerate
!pip install -U transformers

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="wav2vec_model_v1",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=2000,
    gradient_checkpointing=True,
    fp16=True,
    group_by_length=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["test"],
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m245.8/290.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [

ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).