In [None]:
import os
import pandas as pd
import torch
import torchaudio
from torchaudio.transforms import MelSpectrogram, Resample
from skimage.util import random_noise


# Notebook-wide settings, kept in one place so they can be tuned without
# editing the cells below.
CONFIG = dict(
    # Training / evaluation settings.
    epochs=30,
    num_classes=264,
    train_batch_size=16,
    eval_batch_size=16,
    eval_split_ratio=0.2,
    stratify_column="primary_label",
    # Audio and spectrogram settings.
    sample_rate=32_000,
    hop_length=512,
    max_time=5,  # presumably the clip length in seconds -- confirm
    n_mels=224,
    n_fft=1024,
    # Optimiser setting.
    learning_rate=1e-5,
    # Output locations.
    PLOTS_DIR="./plots/results",
    RESULTS_DIR="./results",
    MODELS_DIR="./saved_models",
)

In [None]:
def to_mono(audio):
    """Average a tensor over its first dimension.

    Args:
        audio: Tensor whose dimension 0 is reduced (presumably the channel
            axis of a loaded waveform -- confirm against the caller).

    Returns:
        A tensor with dimension 0 removed, holding the mean over that dimension.
    """
    return audio.mean(dim=0)


def crop_audio(audio, num_samples):
    """Return the leading ``num_samples`` entries of ``audio``.

    Args:
        audio: Sequence or tensor sliced along its first dimension.
        num_samples: Upper bound of the slice.

    Returns:
        ``audio[:num_samples]``.
    """
    leading = slice(None, num_samples)
    return audio[leading]


def pad_audio(audio, num_samples):
    """Zero-pad ``audio`` at its end so that it holds ``num_samples`` entries.

    The amount of padding is computed from ``audio.shape[0]`` and applied to
    the right-hand side of the last dimension, so the function is meant for
    1-D waveforms where those two dimensions coincide.

    Args:
        audio: Tensor to pad.
        num_samples: Desired length after padding.

    Returns:
        The padded tensor.
    """
    pad_length = num_samples - audio.shape[0]
    # (left, right) padding for the last dimension: nothing in front, zeros behind.
    last_dim_padding = (0, pad_length)
    # Reach the functional API through the already-imported ``torch`` package;
    # the notebook never imports an ``F`` alias, so ``F.pad`` raised NameError.
    return torch.nn.functional.pad(audio, last_dim_padding)


def generate_spectrogram(filepath, target_sample_rate, num_samples):
    """Load an OGG recording and turn it into a fixed-length mel spectrogram.

    The waveform is averaged to a single channel, resampled to
    ``target_sample_rate`` when the file uses a different rate, and then
    cropped or zero-padded to exactly ``num_samples`` samples before the mel
    transform is applied.

    Args:
        filepath: Path of the audio file; it is decoded as OGG.
        target_sample_rate: Sample rate the waveform is brought to.
        num_samples: Number of samples kept per clip.

    Returns:
        The ``MelSpectrogram`` output for the prepared waveform, computed with
        ``n_mels``, ``n_fft`` and ``hop_length`` taken from ``CONFIG``.
    """
    audio, sample_rate = torchaudio.load(filepath, format="ogg")
    audio = to_mono(audio)

    if sample_rate != target_sample_rate:
        audio = Resample(sample_rate, target_sample_rate)(audio)

    # Force a fixed clip length; cropping and padding are mutually exclusive.
    if audio.shape[0] > num_samples:
        audio = crop_audio(audio, num_samples)
    elif audio.shape[0] < num_samples:
        audio = pad_audio(audio, num_samples)

    mel_spectrogram = MelSpectrogram(
        sample_rate=target_sample_rate,
        n_fft=CONFIG["n_fft"],
        # Pass the configured hop explicitly so CONFIG["hop_length"] is honoured.
        # With n_fft=1024 and hop_length=512 this matches torchaudio's default
        # (half the window length), so existing outputs are unchanged.
        hop_length=CONFIG["hop_length"],
        n_mels=CONFIG["n_mels"],
    )
    return mel_spectrogram(audio)


def save_mel_tensor(mel_tensor, output_filename):
    """Serialize ``mel_tensor`` to ``output_filename`` using ``torch.save``.

    Args:
        mel_tensor: Object to store.
        output_filename: Destination path handed to ``torch.save``.
    """
    torch.save(obj=mel_tensor, f=output_filename)


# Convert every recording listed in the training metadata into a fixed-length
# mel spectrogram and store it as a .pt file that mirrors the audio layout.
data = pd.read_csv("../data/train_metadata.csv")

output_dir = "../data/tensors/"
os.makedirs(output_dir, exist_ok=True)

# Clip length in samples, derived from CONFIG instead of a hard-coded 5 so the
# "max_time" setting actually controls the clip length.
clip_num_samples = CONFIG["sample_rate"] * CONFIG["max_time"]

for index, row in data.iterrows():
    print(f"Row {index} / {len(data)}")
    audio_filepath = "../data/train_audio/" + row["filename"]
    output_filename = os.path.join(output_dir, os.path.splitext(row["filename"])[0] + ".pt")
    # Create the directory the tensor is really written to. Taking only the
    # first path component breaks for flat or more deeply nested filenames.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    mel = generate_spectrogram(audio_filepath, CONFIG["sample_rate"], clip_num_samples)
    save_mel_tensor(mel, output_filename)

print("Spectrogram generation and saving complete.")

In [None]:
def add_gaussian_noise_to_mel(mel_tensor, mean=0, var=0.05, clip=True):
    """Return a copy of ``mel_tensor`` with additive Gaussian noise.

    The noise is produced by ``skimage.util.random_noise`` in ``"gaussian"``
    mode and the result is converted back to a tensor with the input's dtype
    and device.

    Args:
        mel_tensor: Tensor to perturb.
        mean: Mean of the noise distribution.
        var: Variance of the noise distribution.
        clip: Forwarded to ``random_noise``. The default ``True`` keeps the
            previous behaviour.

    Returns:
        A new tensor holding the noisy values.
    """
    # NOTE(review): per the skimage documentation, clip=True clamps the result
    # to the image range ([0, 1], or [-1, 1] for inputs with negative values).
    # Spectrogram values above 1 would be flattened -- confirm this is intended
    # or call with clip=False.
    # detach()/cpu() make the conversion work for tensors that track gradients
    # or live on another device; for plain CPU tensors they are no-ops.
    mel_np = mel_tensor.detach().cpu().numpy()
    noisy_mel_np = random_noise(mel_np, mode="gaussian", mean=mean, var=var, clip=clip)
    return torch.tensor(noisy_mel_np, dtype=mel_tensor.dtype, device=mel_tensor.device)


# Create one noisy spectrogram per recording of the 100 least frequent
# "common_name" values and write a metadata file that lists the originals
# together with the augmented entries.
data = pd.read_csv("../data/train_metadata.csv")
least_common_names = data["common_name"].value_counts().nsmallest(100).index
# Assign the re-indexed result instead of resetting in place: the filtered
# frame is a slice of ``data`` and in-place mutation of it can trigger
# pandas' SettingWithCopyWarning.
filtered_data = data[data["common_name"].isin(least_common_names)].reset_index(drop=True)
augmented_entries = []

# Clip length in samples, taken from CONFIG rather than a hard-coded 5.
augment_num_samples = CONFIG["sample_rate"] * CONFIG["max_time"]

for index, row in filtered_data.iterrows():
    print(f"Row {index} of {len(filtered_data)}")
    audio_filepath = "../data/train_audio/" + row["filename"]
    mel = generate_spectrogram(audio_filepath, CONFIG["sample_rate"], augment_num_samples)

    noisy_mel = add_gaussian_noise_to_mel(mel)

    augmented_filename = os.path.splitext(row["filename"])[0] + "_augmented.pt"
    output_filename = os.path.join(output_dir, augmented_filename)
    # Do not rely on an earlier cell having created the target directory.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)

    save_mel_tensor(noisy_mel, output_filename)

    # The augmented row keeps every metadata field and only points at the new file.
    new_row = row.copy()
    new_row["filename"] = augmented_filename
    augmented_entries.append(new_row)

augmented_data = pd.DataFrame(augmented_entries)
data_with_augmentations = pd.concat([data, augmented_data], ignore_index=True)
data_with_augmentations.to_csv("../data/train_metadata_with_augmentations.csv", index=False)

In [49]:
# Add the tensor path of every row. os.path.splitext strips only the final
# extension, which is the same rule used when the .pt files were written;
# splitting on the first "." would truncate any name containing another dot.
data["filename_tensor"] = [os.path.splitext(path)[0] + ".pt" for path in data["filename"]]
data_with_augmentations["filename_tensor"] = [
    os.path.splitext(path)[0] + ".pt" for path in data_with_augmentations["filename"]
]

In [51]:
# Persist both metadata tables, augmented one first. The second write replaces
# the "../data/train_metadata.csv" file that the earlier cells read from.
for frame, csv_path in (
    (data_with_augmentations, "../data/train_metadata_with_augmentations.csv"),
    (data, "../data/train_metadata.csv"),
):
    frame.to_csv(csv_path, index=False)