<a href="https://colab.research.google.com/github/karank85/speech-recognition/blob/main/Benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

import numpy as np
from numpy import ndarray
import pandas as pd
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from pathlib import Path

import librosa

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa.display
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn

from IPython.display import Audio

import glob

In [2]:
# Show the installed librosa version so the environment used for this run is recorded.
librosa.__version__

'0.10.1'

In [3]:
# Mount Google Drive (Colab only); the audio data is read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
device

'cpu'

In [5]:
# Assumptions:
# - Each audio file's transcription is the name of the directory containing it
#   (e.g. ".../lime/lime01.wav" is transcribed as "lime").
class AudioDataset:
  """
  Class for loading and storing audio data.

  Attributes:
    df: DataFrame with one row per audio file and the columns 'id' (file name
      without its extension), 'path' (location of the audio file) and
      'transcription' (name of the directory containing the file).
  """

  def __init__(self):
    self.df = pd.DataFrame(columns=['id', 'path', 'transcription'])

  def load_transcriptions(self, directory_path: str) -> bool:
    """
    Register every .wav file found under a directory, including subdirectories.

    Files are added in sorted path order: glob makes no ordering guarantee, so
    sorting keeps the row order (and any seeded shuffle of it) reproducible.

    Returns False if no .wav files were found, or if any failed to load
    (rows added before the failure are kept).
    """
    sound_names = sorted(glob.glob(
        f"{directory_path}/**/*.wav",
        recursive=True
    ))

    if not sound_names:
      return False

    # all() short-circuits, so loading stops at the first failure.
    return all(self.load_transcription_file(path) for path in sound_names)

  def load_transcription_file(self, file_path: str) -> bool:
    """
    Record one audio file: its ID, its path and its transcription.

    The ID is the file name without its extension and the transcription is the
    name of the parent directory. The file itself is never opened, so nothing
    can fail here and the method always returns True; the bool return is kept
    so callers can keep checking it.
    """
    audio_path = Path(file_path)
    file_directory = audio_path.parent
    # Appending at label len(df) relies on the default 0..n-1 index.
    self.df.loc[len(self.df)] = {
        'id': audio_path.stem,
        # Keep the real file name instead of re-attaching a hard-coded ".wav",
        # which produced a non-existent path for names such as "clip.WAV".
        'path': f'{file_directory}/{audio_path.name}',
        'transcription': file_directory.name,
    }
    return True

  def keys(self):
    """
    Iterate over the IDs of all recorded audio files, in row order.
    """
    return iter(self.df['id'])

  def get(self, id: str):
    """
    Retrieve the dataframe rows whose 'id' equals the given ID.
    The result is a DataFrame; it is empty when no row matches.
    """
    return self.df.loc[self.df['id'] == id]

In [13]:
def split_dataframe(df, ratio, random_state=42):
    """
    Shuffle a DataFrame and split it into two parts.

    Args:
        df: DataFrame to split; it is not modified.
        ratio: Fraction of rows, between 0 and 1, placed in the first part.
            The row count is rounded down, so a small frame can yield an
            empty first part.
        random_state: Seed for the shuffle. The default of 42 reproduces the
            splits obtained before this parameter existed.

    Returns:
        (df_1, df_2): the first int(len(df) * ratio) shuffled rows and the
        remaining rows. Both keep the 0..n-1 index assigned after shuffling,
        so df_2's index starts at the split position rather than at 0.

    Raises:
        ValueError: if ratio is outside [0, 1]. A negative ratio would
            otherwise silently slice from the end of the frame.
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio}")

    # Shuffle the DataFrame
    df_shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Calculate the split index
    split_index = int(len(df_shuffled) * ratio)

    # Split the DataFrame
    df_1 = df_shuffled.iloc[:split_index]
    df_2 = df_shuffled.iloc[split_index:]

    return df_1, df_2

In [14]:
# Per-label DataFrames keyed by directory name: training rows and held-out benchmark rows.
df_dict_train, df_dict_benchmark = {}, {}

In [15]:
# One subdirectory per label; the trailing "/" restricts the match to directories.
all_subdirectories = glob.glob("/content/drive/MyDrive/test_hmm/*/")

for path in all_subdirectories:
  # The directory name doubles as the label for every clip inside it.
  fruit_label = Path(path).name
  ds = AudioDataset()
  if not ds.load_transcriptions(path):
    # Directories without usable audio are left out of both dictionaries.
    continue
  train_rows, benchmark_rows = split_dataframe(ds.df, 0.8)
  df_dict_train[fruit_label] = train_rows
  df_dict_benchmark[fruit_label] = benchmark_rows

In [17]:
# Inspect the training split stored for the 'lime' label.
df_dict_train['lime']

Unnamed: 0,id,path,transcription
0,lime11,/content/drive/MyDrive/test_hmm/lime/lime11.wav,lime
1,lime15,/content/drive/MyDrive/test_hmm/lime/lime15.wav,lime
2,lime06,/content/drive/MyDrive/test_hmm/lime/lime06.wav,lime
3,lime01,/content/drive/MyDrive/test_hmm/lime/lime01.wav,lime
4,lime12,/content/drive/MyDrive/test_hmm/lime/lime12.wav,lime
5,lime07,/content/drive/MyDrive/test_hmm/lime/lime07.wav,lime
6,lime04,/content/drive/MyDrive/test_hmm/lime/lime04.wav,lime
7,lime02,/content/drive/MyDrive/test_hmm/lime/lime02.wav,lime
8,lime09,/content/drive/MyDrive/test_hmm/lime/lime09.wav,lime
9,lime14,/content/drive/MyDrive/test_hmm/lime/lime14.wav,lime


In [18]:
# Inspect the benchmark split stored for the 'lime' label.
df_dict_benchmark['lime']

Unnamed: 0,id,path,transcription
12,lime08,/content/drive/MyDrive/test_hmm/lime/lime08.wav,lime
13,lime05,/content/drive/MyDrive/test_hmm/lime/lime05.wav,lime
14,lime10,/content/drive/MyDrive/test_hmm/lime/lime10.wav,lime
