<a href="https://colab.research.google.com/github/karank85/speech-recognition/blob/main/Benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import os

import numpy as np
from numpy import ndarray
import pandas as pd
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from pathlib import Path
import random

import librosa

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa.display
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn

from IPython.display import Audio

import glob

In [43]:
librosa.__version__

'0.10.1'

In [44]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [45]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [63]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [46]:
# Assumptions:
# - The transcription file is located in the same directory as the audio files.
class AudioDataset:
  """
  Class for loading and storing audio data.
  """

  def __init__(self):
    self.df = pd.DataFrame(columns=['id', 'path', 'transcription'])

  def load_transcriptions(self, directory_path: str) -> bool:
    """
    Load all transcriptions from a given directory, including subdirectories.
    Returns False if no transcription files were found, or if any failed to load.
    """
    sound_names = glob.glob(
        f"{directory_path}/**/*.wav",
        recursive=True
    )

    if len(sound_names) == 0:
      return False

    for path in sound_names:
      if not self.load_transcription_file(path):
        return False

    return True

  def load_transcription_file(self, file_path: str) -> bool:
    """
    Parse transcription file and records the audio ID - subtitle mapping.
    Returns False if the file could not be read.
    """

    file_directory = Path(file_path).parent
    file_name = Path(file_path).stem
    self.df.loc[len(self.df)] = {
        'id':file_name,
        'transcription':file_directory.name,
        'path': f'{file_directory}/{file_name}.wav'
    }
    return True

  def keys(self):
    return iter(self.df['id'])

  def get(self, id: int):
    """
    Retrieve a dataframe row from ID.
    """
    return self.df.loc[self.df['id'] == id]

In [47]:
random.seed(42)

def split_dataframe(df, ratio):
    # Shuffle the DataFrame
    df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Calculate the split index
    split_index = int(len(df_shuffled) * ratio)

    # Split the DataFrame
    # df_1 = df_shuffled.iloc[:split_index] not needed for benchmarking
    df_2 = df_shuffled.iloc[split_index:]

    return df_2

In [48]:
df_dict_benchmark = {}

In [49]:
all_subdirectories = glob.glob(
        f"/content/drive/MyDrive/test_hmm/*/",
        recursive=True
)

for path in all_subdirectories:
  fruit_label = Path(path).name
  ds = AudioDataset()
  if ds.load_transcriptions(path):
    df_dict_benchmark[fruit_label] = split_dataframe(ds.df, 0.8)

In [51]:
df_dict_benchmark['lime']

Unnamed: 0,id,path,transcription
12,lime08,/content/drive/MyDrive/test_hmm/lime/lime08.wav,lime
13,lime05,/content/drive/MyDrive/test_hmm/lime/lime05.wav,lime
14,lime10,/content/drive/MyDrive/test_hmm/lime/lime10.wav,lime


In [52]:
df_dict_combined = pd.concat(df_dict_benchmark.values(), ignore_index=True)
df_dict_combined = df_dict_combined.drop(columns=['id'])
df_dict_combined

Unnamed: 0,path,transcription
0,/content/drive/MyDrive/test_hmm/lime/lime08.wav,lime
1,/content/drive/MyDrive/test_hmm/lime/lime05.wav,lime
2,/content/drive/MyDrive/test_hmm/lime/lime10.wav,lime
3,/content/drive/MyDrive/test_hmm/orange/orange0...,orange
4,/content/drive/MyDrive/test_hmm/orange/orange0...,orange
5,/content/drive/MyDrive/test_hmm/orange/orange1...,orange
6,/content/drive/MyDrive/test_hmm/kiwi/kiwi15.wav,kiwi
7,/content/drive/MyDrive/test_hmm/kiwi/kiwi13.wav,kiwi
8,/content/drive/MyDrive/test_hmm/kiwi/kiwi11.wav,kiwi
9,/content/drive/MyDrive/test_hmm/pineapple/pine...,pineapple


In [70]:
def get_random_inputs(df, amt):
    random_df = df.sample(n=amt)
    paths = random_df['path'].tolist()
    transcriptions = random_df['transcription'].tolist()
    return paths, transcriptions

# Call the function to select random rows from random DataFrames
paths, transcriptions = get_random_inputs(df_dict_combined, 10)

print(paths)
print(transcriptions)

['/content/drive/MyDrive/test_hmm/orange/orange15.wav', '/content/drive/MyDrive/test_hmm/lime/lime08.wav', '/content/drive/MyDrive/test_hmm/pineapple/pineapple15.wav', '/content/drive/MyDrive/test_hmm/kiwi/kiwi13.wav', '/content/drive/MyDrive/test_hmm/kiwi/kiwi11.wav', '/content/drive/MyDrive/test_hmm/peach/peach06.wav', '/content/drive/MyDrive/test_hmm/orange/orange02.wav', '/content/drive/MyDrive/test_hmm/lime/lime05.wav', '/content/drive/MyDrive/test_hmm/pineapple/pineapple06.wav', '/content/drive/MyDrive/test_hmm/peach/peach09.wav']
['orange', 'lime', 'pineapple', 'kiwi', 'kiwi', 'peach', 'orange', 'lime', 'pineapple', 'peach']


In [73]:
from pydub import AudioSegment

# Load each audio file
audio_segments = [AudioSegment.from_wav(file_path) for file_path in paths]
sentence = ' '.join(transcriptions)

# Concatenate the audio segments
combined_audio = sum(audio_segments)

#Export the combined audio to a file
combined_audio.export("/content/drive/MyDrive/combined_audio.wav", format="wav")

# Load the combined audio file
audio, sampling_freq = librosa.load("/content/drive/MyDrive/combined_audio.wav", sr=16_000)

print(sentence)
Audio(data=audio, rate=sampling_freq)

orange lime pineapple kiwi kiwi peach orange lime pineapple peach
