<a href="https://colab.research.google.com/github/karank85/speech-recognition/blob/main/Project2_DL_Speech_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import glob
import os

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from numpy import ndarray
from sklearn.model_selection import train_test_split
from torch import nn
from tqdm import tqdm

In [None]:
# Show the installed librosa version as the cell output (for reproducibility).
librosa.__version__

'0.10.1'

In [None]:
# Mount Google Drive at /content/drive (Colab only) so that later cells can
# read the files stored under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Assumptions:
# - The transcription file is located in the same directory as the audio files.
class AudioDataset:
    """
    Load `*.trans.txt` transcription files and store them in a dataframe.

    Each non-empty line of a transcription file is expected to look like
    `<audio id> <transcription text>`. The audio file for an ID is assumed to
    be `<audio id>.flac` in the same directory as the transcription file.
    """

    # Column order of the backing dataframe.
    _COLUMNS = ['id', 'path', 'transcription']

    def __init__(self):
        # One row per utterance: audio ID, path to its .flac file, transcript.
        self.df = pd.DataFrame(columns=self._COLUMNS)

    def load_transcriptions(self, directory_path: str) -> bool:
        """
        Load all transcriptions from a directory, including subdirectories.

        Files are processed in sorted path order so that the resulting row
        order does not depend on the file system. Loading stops at the first
        file that fails; rows from files loaded before it are kept.

        Returns:
            False if no transcription files were found or if any file failed
            to load, True otherwise.
        """
        transcription_paths = sorted(glob.glob(
            f"{directory_path}/**/*.trans.txt",
            recursive=True
        ))

        if not transcription_paths:
            return False

        return all(
            self.load_transcription_file(path) for path in transcription_paths
        )

    def load_transcription_file(self, file_path: str) -> bool:
        """
        Parse one transcription file and record its audio ID -> text mapping.

        Blank lines are skipped and surrounding whitespace is stripped from
        each line. All rows of the file are appended to `self.df` in a single
        step, so a file that cannot be read contributes no rows. Note that
        `self.df` is rebound to a new dataframe rather than mutated in place.

        Returns:
            False if the file could not be opened or decoded, True otherwise.
        """
        file_directory = os.path.dirname(file_path)
        rows = []

        try:
            with open(file_path, "r", encoding="utf-8") as file:
                for raw_line in file:
                    line = raw_line.strip()
                    if not line:
                        continue
                    # Everything up to the first space is the audio ID, the
                    # remainder is the transcription text.
                    file_name, _, file_content = line.partition(" ")
                    rows.append({
                        'id': file_name,
                        'transcription': file_content,
                        'path': f'{file_directory}/{file_name}.flac',
                    })
        except (OSError, UnicodeDecodeError):
            return False

        if rows:
            new_rows = pd.DataFrame(rows, columns=self._COLUMNS)
            if self.df.empty:
                # Avoid concatenating with an empty frame (deprecated in pandas).
                self.df = new_rows
            else:
                self.df = pd.concat([self.df, new_rows], ignore_index=True)
        return True

    def keys(self):
        """Return an iterator over all loaded audio IDs, in row order."""
        return iter(self.df['id'])

    def get(self, id: str) -> pd.DataFrame:
        """
        Retrieve the dataframe row(s) whose `id` column equals the given ID.

        The result is a (possibly empty) dataframe, not a single row object.
        The parameter keeps the name `id` for backward compatibility with
        keyword callers even though it shadows the builtin.
        """
        return self.df.loc[self.df['id'] == id]

In [None]:
# Start with an empty dataset; rows are added by the load_* methods.
ds = AudioDataset()

In [None]:
# Recursively load every *.trans.txt found under the mounted Drive root.
# The returned success flag is shown as the cell output.
ds.load_transcriptions("/content/drive/MyDrive/")


True

## wav2vec 2.0 Model Framework

### Hyperparameters

In [None]:
# Hyperparameters shared by the model components in this notebook.
# NOTE(review): every value is still the placeholder 0 and must be filled in
# before the model can be built. For example, GumbelVectorQuantizer divides
# "code_vector_size" by "num_code_vector_groups", so a group count of 0 raises
# ZeroDivisionError.
configs = {
    # Total size of a quantized code vector (split evenly across the groups).
    "code_vector_size": 0,
    # Number of code-book groups in the quantizer.
    "num_code_vector_groups": 0,
    # Number of code-book entries in each group.
    "num_code_vectors_per_group": 0,
    # Not read by the code visible here; presumably the probability of
    # starting a time mask -- TODO confirm.
    "mask_time_prob": 0,
    # Not read by the code visible here; presumably the length of each time
    # mask -- TODO confirm.
    "num_mask_time_steps": 0,
    # Input feature size of the quantizer's linear projection.
    "extracted_feature_size": 0,
    # Not read by the code visible here; presumably the encoder's hidden
    # size -- TODO confirm.
    "encoder_hidden_size": 0,
    # Temperature (tau) passed to the Gumbel-softmax in the quantizer.
    "gumbel_init_temperature": 0,
    # Not read by the code visible here; presumably the temperature of the
    # contrastive loss -- TODO confirm.
    "contrastive_loss_temperature": 0,
    # Not read by the code visible here; presumably the number of negative
    # samples per target in the contrastive loss -- TODO confirm.
    "num_contrastive_loss_negative_samples": 0,
    # Not read by the code visible here; presumably a weight between loss
    # terms -- TODO confirm.
    "loss_alpha": 0
}

### GumbelVectorQuantizer

In [None]:
class GumbelVectorQuantizer(nn.Module):
    """
    Quantize feature vectors into code vectors via Gumbel-softmax selection.

    Every feature vector is projected to `G * V` logits (`G` groups of `V`
    code-book entries each). One entry is selected per group and the `G`
    selected entries are concatenated into the output code vector.
    """

    def __init__(self, config):
        """
        Args:
            config: mapping that provides "num_code_vector_groups" (G),
                "num_code_vectors_per_group" (V), "extracted_feature_size"
                (D1), "code_vector_size" (D2) and "gumbel_init_temperature".

        Raises:
            ValueError: if "code_vector_size" is not divisible by G, which
                would otherwise silently shrink the output size.
        """
        super().__init__()
        self.num_groups = config["num_code_vector_groups"]
        self.num_vectors = config["num_code_vectors_per_group"]

        code_vector_size = config["code_vector_size"]
        if code_vector_size % self.num_groups != 0:
            raise ValueError(
                f"code_vector_size ({code_vector_size}) must be divisible by "
                f"num_code_vector_groups ({self.num_groups})"
            )

        self.linear = nn.Linear(
            config["extracted_feature_size"],
            self.num_groups * self.num_vectors
        )
        # `torch.empty` leaves the memory uninitialised (it may contain NaN or
        # inf), so the code book is explicitly filled with U(0, 1) values.
        self.code_book = nn.Parameter(
            torch.empty(
                1,
                self.num_groups,
                self.num_vectors,
                code_vector_size // self.num_groups,
            )
        )
        nn.init.uniform_(self.code_book)

        self.temperature = config["gumbel_init_temperature"]

    @staticmethod
    def _compute_perplexity(probs: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
        """
        Average the code-vector distribution over all non-padded positions.

        Despite the name, this returns the mean probability of every code-book
        entry (no exponentiated entropy is computed here).

        Args:
            probs (torch.Tensor): with shape `(B, L, G, V)`
            lengths (torch.Tensor): with shape `(B)`

        Returns:
            torch.Tensor with shape `(G, V)`
        """
        # `(B, L)` boolean mask that is True for positions inside each length.
        positions = torch.arange(probs.size(1), device=probs.device).unsqueeze(0)
        valid_mask = positions < lengths.unsqueeze(-1)

        # `(B, L, G, V)` -> `(num_valid, G, V)` -> `(G, V)`
        valid_probs = probs[valid_mask]
        return valid_probs.sum(0) / valid_probs.size(0)

    def forward(self, hidden_states: torch.Tensor, lengths: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """
        In training mode the code-book entry of each group is sampled with a
        hard (straight-through) Gumbel-softmax. In eval mode the entry with the
        largest logit is taken, so inference is deterministic.

        Args:
            hidden_states (torch.Tensor): with shape `(B, L, D1)`
            lengths (torch.Tensor): with shape `(B)`

        Returns:
            tuple(
            code_vectors (torch.Tensor): with shape `(B, L, D2)`
            perplexity (torch.Tensor): with shape `(G, V)`, the softmax
                distribution averaged over non-padded positions
            )
        """

        batch_size, length, _ = hidden_states.shape

        logits = self.linear(hidden_states)
        # `(B, L, G * V)` -> `(B * L * G, V)`
        logits = logits.view(batch_size * length * self.num_groups, -1)

        if self.training:
            code_vector_probs = nn.functional.gumbel_softmax(
                logits.float(), tau=self.temperature, hard=True
            ).type_as(logits)
        else:
            # One-hot of the arg-max entry: no sampling noise at inference.
            best_entry = logits.argmax(dim=-1, keepdim=True)
            code_vector_probs = torch.zeros_like(logits).scatter_(-1, best_entry, 1.0)

        code_vector_soft_dist = torch.softmax(
            logits.view(batch_size, length, self.num_groups, -1).float(), dim=-1
        )
        perplexity = self._compute_perplexity(code_vector_soft_dist, lengths)

        # `(B * L * G, V)` -> `(B * L, G, V, 1)` so it broadcasts against the
        # `(1, G, V, D)` code book.
        code_vector_probs = code_vector_probs.view(batch_size * length, self.num_groups, -1).unsqueeze(-1)

        code_vectors = code_vector_probs * self.code_book
        # `(B * L, G, V, D)` -> `(B, L, G * D)`
        code_vectors = code_vectors.sum(-2).view(batch_size, length, -1)

        return code_vectors, perplexity