# ASR Dataset EDA

## 0. Download the Dataset

In [None]:
import os
import tarfile
import requests
from utils import get_data_dir, get_project_root, get_raw_data_dir

# Direct download link of the Common Voice archive (.tar.gz).
# NOTE(review): the link is empty in this notebook -- fill it in before running the cell.
url = ""
if not url:
    # Fail early with an actionable message instead of an opaque requests error.
    raise ValueError("Set `url` to the Common Voice archive download link before running this cell.")

# The archive is stored as <project root>/zipped_data/cv-en.tar.gz
output_path = os.path.join(get_project_root() / "zipped_data", "cv-en.tar.gz")
os.makedirs(os.path.dirname(output_path), exist_ok=True)

from tqdm.notebook import tqdm  # Use tqdm.auto if outside Jupyter

# 1. Download the archive
print("Downloading Common Voice dataset...")

block_size = 8192  # 8 KB
# Stream into a temporary file first so that an interrupted download never
# leaves a truncated archive at `output_path`.
partial_path = output_path + ".part"

# Stream the body; the context manager releases the connection even on error.
with requests.get(url, stream=True, timeout=(10, 60)) as response:
    # Abort on 4xx/5xx instead of saving an error page as the archive.
    response.raise_for_status()
    total_size = int(response.headers.get('content-length', 0))

    # total=None lets tqdm show an open-ended bar when the server sends no
    # content-length; unit_divisor=1024 makes the scaling match the "iB" unit.
    with open(partial_path, "wb") as f, tqdm(
        total=total_size or None,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as progress_bar:
        for chunk in response.iter_content(chunk_size=block_size):
            if chunk:
                f.write(chunk)
                progress_bar.update(len(chunk))

# Only a fully written file is moved to its final name.
os.replace(partial_path, output_path)
print("Download complete.")

# 2. Extract the .tar.gz file
extract_dir = get_raw_data_dir("asr")
os.makedirs(extract_dir, exist_ok=True)

print(f"Extracting to {extract_dir}...")
with tarfile.open(output_path, "r:gz") as tar_ref:
    if hasattr(tarfile, "data_filter"):
        # The "data" filter rejects members that would be written outside
        # `extract_dir` (absolute paths, "..", unsafe links).
        tar_ref.extractall(path=extract_dir, filter="data")
    else:
        # Older Python without extraction filters: keep the legacy behaviour.
        tar_ref.extractall(path=extract_dir)
print("Extraction complete.")


## 0.1 Download a Subset of the Data

In [None]:
import gdown
import zipfile
import os
from utils import get_data_dir, get_project_root, get_processed_data_dir

# 1. Fetch the sample archive from Google Drive into <project root>/zipped_data/
url = "https://drive.google.com/uc?id=1rHenEIWb10HqrCp1NQ9fyRWAYlF_er2H"
zipped_dir = get_project_root() / "zipped_data"
output = os.path.join(zipped_dir, "speech_sample.zip")
os.makedirs(os.path.dirname(output), exist_ok=True)
gdown.download(url, output, quiet=True)

# 2. Unpack the archive into the processed "asr" data directory
extract_dir = get_processed_data_dir("asr")
with zipfile.ZipFile(output) as archive:
    archive.extractall(path=extract_dir)


## 1. Import necessary libraries

In [None]:
import json
import librosa
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as transforms
import matplotlib.pyplot as plt
from utils import get_data_dir, get_project_root, get_processed_data_dir
from IPython.display import Audio

## 2. Basic EDA

In [None]:
# Load the clip manifest (test.json) from the processed "asr/converted_clips" directory
data_path = get_processed_data_dir("asr/converted_clips")
test_json_path = data_path / "test.json"
# JSON is UTF-8 by specification; being explicit avoids decoding errors on
# platforms whose default text encoding is something else.
with open(test_json_path, 'r', encoding="utf-8") as f:
    data = json.load(f)

# Keep the first three manifest entries as EDA samples and display them
audio_files = data[:3]
audio_files

In [None]:
# Play each sample clip and plot its waveform
for idx, audio_info in enumerate(audio_files):
    # Each manifest entry stores the clip path under 'key'
    audio_path = audio_info['key']

    # Decode the clip with the soundfile backend
    waveform, sample_rate = torchaudio.load(audio_path, backend="soundfile")
    print(f"Sample Rate: {sample_rate}")

    # Inline audio player for listening to the clip
    display(Audio(audio_path))

    # Transpose before plotting so matplotlib draws one line per column
    # (presumably one per channel, given torchaudio's channels-first layout)
    samples = waveform.t().numpy()
    plt.figure(figsize=(6, 2))
    plt.plot(samples)
    plt.title(f"Waveform of file {idx+1}")
    plt.show()


In [None]:
# Imported explicitly: older librosa releases do not expose the `display`
# submodule through a bare `import librosa`.
import librosa.display

# Plot a dB-scaled mel spectrogram for each sample clip
for idx, audio_info in enumerate(audio_files):
    audio_path = audio_info['key']
    waveform, sample_rate = torchaudio.load(audio_path, backend="soundfile")
    # Down-mix to mono by averaging over the channel axis. Flattening a
    # multi-channel tensor would instead concatenate the channels end to end
    # and analyse a signal that does not exist; for mono clips the result is
    # unchanged.
    waveform_np = waveform.mean(dim=0).numpy()

    # Mel spectrogram (power), covering frequencies up to Nyquist
    spectrogram = librosa.feature.melspectrogram(y=waveform_np, sr=sample_rate, n_mels=128, fmax=sample_rate/2)
    # Convert power to decibels for display
    log_spectrogram = librosa.power_to_db(spectrogram)

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(log_spectrogram, sr=sample_rate, x_axis='time', y_axis='mel')
    plt.title(f'Mel Spectrogram {idx+1}')
    plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()

In [None]:
# Defining the mel-spectrogram transform (the log is applied separately below).
# The sample rate is taken from the clip that was actually loaded, so the mel
# filterbank matches the audio instead of assuming 16 kHz.
log_mel_spec_transform = transforms.MelSpectrogram(
    sample_rate=sample_rate,
    n_mels=128,
    hop_length=380,
    n_fft=1024 * 2,
)

# Compute the log-mel spectrogram of `waveform`, i.e. the last clip loaded above
log_mel_spec = log_mel_spec_transform(waveform)
log_mel_spec = torch.log(log_mel_spec + 1e-14)  # Avoid log(0)

# Display the original spectrogram
plt.figure(figsize=(10, 4))
plt.imshow(log_mel_spec.squeeze(0).numpy(), cmap='viridis', origin='lower', aspect='auto')
plt.title("Original Log-Mel Spectrogram")
# The values are natural-log power, not decibels, so label them accordingly.
plt.colorbar(format='%+2.0f', label='log power (natural log)')
plt.show()

In [None]:
# SpecAugment transformations: mask one random frequency band (at most 30 mel
# bins) and one random time span (at most 70 frames).
spec_augment = nn.Sequential(
    transforms.FrequencyMasking(freq_mask_param=30),
    transforms.TimeMasking(time_mask_param=70),
)

# Applying SpecAugment (the masks are random, so every run looks different)
# NOTE(review): masked cells take the transforms' default fill value, which on
# a log-scaled spectrogram is not the minimum -- confirm this is intended.
augmented_log_mel_spec = spec_augment(log_mel_spec)

# Display augmented spectrogram
plt.figure(figsize=(10, 4))
plt.imshow(augmented_log_mel_spec.squeeze(0).numpy(), cmap='viridis', origin='lower', aspect='auto')
plt.title("Augmented Log-Mel Spectrogram")
# The values are natural-log power, not decibels, so label them accordingly.
plt.colorbar(format='%+2.0f', label='log power (natural log)')
plt.show()

In [None]:
# Show the raw values of the un-augmented log-mel spectrogram together with its shape
log_mel_2d = log_mel_spec.squeeze(0)
log_mel_2d.numpy(), log_mel_2d.shape

In [None]:
# Show the raw values of the augmented log-mel spectrogram together with its shape
augmented_2d = augmented_log_mel_spec.squeeze(0).numpy()
augmented_2d, augmented_2d.shape