# Install Dependencies

In [None]:
pip install torchaudio librosa soundfile numpy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

# Create data directory

In [None]:
pip install matplotlib IPython

Collecting jedi>=0.16 (from IPython)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [1]:
mkdir -p ./data

In [3]:
import os
import torchaudio

os.makedirs("./data", exist_ok=True)
torchaudio.datasets.LIBRISPEECH(root="./data", url="train-clean-100", download=True)


100.0%


<torchaudio.datasets.librispeech.LIBRISPEECH at 0x105a213d0>

# Download LibriSpeech (train-clean-100)

In [4]:
dataset = torchaudio.datasets.LIBRISPEECH(root="./data", url="train-clean-100", download=False)

waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id = dataset[0]

print(f"Sample rate: {sample_rate}")
print(f"Transcript: {transcript}")
print(f"Waveform shape: {waveform.shape}")

Sample rate: 16000
Transcript: CHAPTER ONE MISSUS RACHEL LYNDE IS SURPRISED MISSUS RACHEL LYNDE LIVED JUST WHERE THE AVONLEA MAIN ROAD DIPPED DOWN INTO A LITTLE HOLLOW FRINGED WITH ALDERS AND LADIES EARDROPS AND TRAVERSED BY A BROOK
Waveform shape: torch.Size([1, 225360])


# Clean Dataset

In [5]:
import torchaudio
import torchaudio.transforms as T
import os
import json
import torch

def clean_data(min_duration=1.5, max_peak=0.95, sr=16000, save_dir="spectrogram_data"):
    dataset = torchaudio.datasets.LIBRISPEECH(root="./data", url="train-clean-100", download=False)
    mel_transform = T.MelSpectrogram(sample_rate=sr, n_fft=512, hop_length=128, n_mels=64)

    os.makedirs(save_dir, exist_ok=True)
    os.makedirs("meta", exist_ok=True)

    cleaned = []

    for i in range(len(dataset)):
        waveform, sample_rate, _, speaker_id, _, utt_id = dataset[i]

        duration = waveform.shape[1] / sample_rate
        peak_val = waveform.abs().max().item()

        if duration >= min_duration and peak_val <= max_peak:
            waveform = waveform.mean(dim=0, keepdim=True)

            # Convert to spectrogram
            spec = mel_transform(waveform)

            filename = f"{speaker_id}_{utt_id}.pt"
            path = os.path.join(save_dir, filename)
            torch.save(spec, path)

            cleaned.append({
                "index": i,
                "duration": duration,
                "speaker_id": int(speaker_id),
                "utterance_id": int(utt_id),
                "spectrogram_path": path
            })

    with open("meta/clean_metadata.json", "w") as f:
        json.dump(cleaned, f, indent=2)

    print(f"✅ Converted and saved {len(cleaned)} spectrograms to '{save_dir}'")

In [6]:
import os
import json
import random
import torch
import torchaudio
import soundfile as sf

def create_train_data(configs, output_dir="train_data", seed=42):
    # Set random seeds so we can reproduce results
    random.seed(seed)
    torch.manual_seed(seed)

    # Load cleaned metadata
    with open("meta/clean_metadata.json") as f:
        metadata = json.load(f)

    # Load dataset (no download here since we already have it)
    dataset = torchaudio.datasets.LIBRISPEECH(root="./data", url="train-clean-100", download=False)
    os.makedirs(output_dir, exist_ok=True)

    for i in range(configs["num_samples"]):
        if configs["type"] == "cocktail":
            # Pick multiple random speakers
            speakers = random.sample(metadata, configs["num_speakers"])
            waves = []

            # Figure out the shortest clip (so we can align them all)
            shortest_len = None
            for s in speakers:
                wave, sr, *_ = dataset[s["index"]]
                wave = wave[:, :int(sr * configs["duration"])]
                if shortest_len is None or wave.shape[1] < shortest_len:
                    shortest_len = wave.shape[1]

            # Truncate everything to match the shortest, apply random volume
            for s in speakers:
                wave, sr, *_ = dataset[s["index"]]
                wave = wave[:, :shortest_len]
                amp = random.uniform(0.01, 0.1)  # really soft background voices
                waves.append(amp * wave)

            # Sum up all the quiet speakers — one louder target is at index 0
            mix = sum(waves)
            target = waves[0]  # target speaker is just the first one we picked

        elif configs["type"] == "twospeaker":
            # Just pick two people to mix
            s1, s2 = random.sample(metadata, 2)
            wave1, sr, *_ = dataset[s1["index"]]
            wave2, sr, *_ = dataset[s2["index"]]

            # Truncate to match lengths
            min_len = min(wave1.shape[1], wave2.shape[1])
            wave1 = wave1[:, :min_len]
            wave2 = wave2[:, :min_len]

            # Speaker 1 is loud, speaker 2 is background
            mix = 0.8 * wave1 + 0.2 * wave2
            target = wave1

        sf.write(f"{output_dir}/mix_{i}.wav", mix.squeeze().numpy(), sr)
        sf.write(f"{output_dir}/target_{i}.wav", target.squeeze().numpy(), sr)

    print(f"✔️ Done! Made {configs['num_samples']} samples in '{output_dir}'")

In [None]:
cocktail_config = {
    "type": "cocktail",
    "num_samples": 100,
    "num_speakers": 15,
    "duration": 3.0
}

twospeaker_config = {
    "type": "twospeaker",
    "num_samples": 100,
}
clean_data()
create_train_data(cocktail_config)
create_train_data(twospeaker_config)


✅ Converted and saved 27840 spectrograms to 'spectrogram_data'
✔️ Done! Made 100 samples in 'train_data'
✔️ Done! Made 100 samples in 'train_data'


In [None]:
from torch import nn
class SplitterRNN(nn.Module):
  def __init__(self, input_size, hidden_size, num_layers=2, dropout=0.3):
    super(SplitterRNN, self).__init__()
    self.lstm = nn.LSTM(input_size=input_size,
                        hidden_size=hidden_size,
                        num_layers=num_layers,
                        batch_first=True,
                        dropout=dropout)
    self.linear = nn.Linear(hidden_size, input_size)

  def forward(self, x):
    output, _ = self.lstm(x)  # output: (batch_size, time_steps, hidden_size)
    output = self.linear(output)  # output: (batch_size, time_steps, input_size)
    return output

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Parameters
input_size = 257
hidden_size = 512
num_layers = 2
batch_size = 32
sequence_length = 100
dropout = 0.25
lr = 0.001

model = SplitterRNN(input_size, hidden_size, num_layers, dropout).to(device=device)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torchaudio
import os

class SpeechSeparationDataset(Dataset):
    def __init__(self, data_dir):
        self.data_dir = data_dir
        files = os.listdir(self.data_dir)
        wav_files = [f for f in files if f.endswith(".wav")]
        ids = {f.split("_")[1].split(".")[0] for f in wav_files}
        self.sample_ids = sorted(ids)

    def __len__(self):
        return len(self.sample_ids)

    def __getitem__(self, idx):
        sample_id = self.sample_ids[idx]
        mix_path = os.path.join(self.data_dir, f"mix_{sample_id}.wav")
        target_path = os.path.join(self.data_dir, f"target_{sample_id}.wav")

        mix, _ = torchaudio.load(mix_path)
        target, _ = torchaudio.load(target_path)

        return mix, target

# TODO: Computational Experiment (pad or truncate??)
def pad_inputs(batch):
    mixed_audio, target_audio = zip(*batch)
    mixed_audio = [x.squeeze(0) for x in mixed_audio]
    target_audio = [x.squeeze(0) for x in target_audio]

    mixes = pad_sequence(mixed_audio, batch_first=True)
    targets = pad_sequence(target_audio, batch_first=True)

    return mixes.unsqueeze(1), targets.unsqueeze(1)

train_dataset = SpeechSeparationDataset("train_data")
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=pad_inputs)

# Iterate through batches
for mix_batch, target_batch in train_loader:
    print("Batch mix shape:", mix_batch.shape)
    print("Batch target shape:", target_batch.shape)
    break


Batch mix shape: torch.Size([8, 1, 48000])
Batch target shape: torch.Size([8, 1, 48000])


In [None]:
def train_rnn(model, criterion, optimizer, delay, delay_range=None, verbose=True, nepochs=5000):
  losses = []
  for e in nepochs:

  pass

In [None]:
def test_rnn():
  pass

# Designing Computational Experiments

Helper Functions

In [None]:
pip install scipy sounddevice

In [None]:
import sounddevice as sd
import numpy as np

# play_wav_file("path/to/your/audio.wav")
def play_audio(audio, sample_rate=44100):
    """
    Plays an audio sample.

    Parameters:
    - audio (numpy.ndarray): The audio data. Can be mono (1D) or stereo (2D).
    - sample_rate (int): Sampling rate in Hz (default: 44100).
    """
    if not isinstance(audio, np.ndarray):
        raise ValueError("Audio must be a NumPy array.")

    if audio.ndim > 2 or (audio.ndim == 2 and audio.shape[1] > 2):
        raise ValueError("Audio must be mono or stereo.")

    sd.play(audio, samplerate=sample_rate)
    sd.wait()  # Wait until audio playback is finished


import matplotlib.pyplot as plt

# visualize_wav("path/to/your/audio.wav")
def visualize_wav(filepath, show_spectrogram=True):
    """
    Visualizes the waveform (and optionally spectrogram) of a WAV file.

    Parameters:
    - filepath (str): Path to the WAV file.
    - show_spectrogram (bool): Whether to show the spectrogram below the waveform.
    """
    # Load audio
    data, samplerate = sf.read(filepath)
    duration = len(data) / samplerate
    time = np.linspace(0, duration, num=len(data))

    # Set up plot
    fig, axs = plt.subplots(2 if show_spectrogram else 1, 1, figsize=(12, 6), sharex=True)

    if not isinstance(axs, np.ndarray):
        axs = [axs]  # ensure axs is always a list for consistency

    # Plot waveform
    axs[0].plot(time, data)
    axs[0].set_title('Waveform')
    axs[0].set_ylabel('Amplitude')
    axs[0].grid(True)

    # Optional: Plot spectrogram
    if show_spectrogram:
        axs[1].specgram(data[:, 0] if data.ndim > 1 else data, Fs=samplerate, NFFT=1024, noverlap=512)
        axs[1].set_title('Spectrogram')
        axs[1].set_ylabel('Frequency [Hz]')
        axs[1].set_xlabel('Time [s]')

    plt.tight_layout()
    plt.show()

"""
Example:

# Load audio files
speaker1, sr = librosa.load("speaker1.wav", sr=None)
speaker2, _ = librosa.load("speaker2.wav", sr=None)

# Mix with speaker2 at 4x the volume of speaker1
mixed, s1_adj, s2_adj = adjust_loudness_mix(speaker1, speaker2, loudness_ratio=4.0)

# Save to disk
librosa.output.write_wav("mixed.wav", mixed, sr)
"""
def adjust_loudness_mix(speaker1_wave, speaker2_wave, loudness_ratio):
    """
    Adjusts the loudness of speaker2 relative to speaker1.

    Parameters:
        speaker1_wave (np.ndarray): Audio waveform of speaker 1.
        speaker2_wave (np.ndarray): Audio waveform of speaker 2.
        loudness_ratio (float): Factor to scale speaker2's amplitude.
                                 e.g., 2.0 means speaker2 is twice as loud as speaker1.

    Returns:
        mixed_wave (np.ndarray): The resulting mixture of the two speakers.
        adjusted_speaker2 (np.ndarray): The scaled version of speaker2_wave.
    """
    # Match length if needed
    min_len = min(len(speaker1_wave), len(speaker2_wave))
    speaker1_wave = speaker1_wave[:min_len]
    speaker2_wave = speaker2_wave[:min_len]

    # Apply loudness adjustment to speaker2
    adjusted_speaker2 = speaker2_wave * loudness_ratio

    # Mix the two signals
    mixed_wave = speaker1_wave + adjusted_speaker2

    # Optional: Normalize to avoid clipping
    max_val = np.max(np.abs(mixed_wave))
    if max_val > 1.0:
        mixed_wave = mixed_wave / max_val
        adjusted_speaker2 = adjusted_speaker2 / max_val
        speaker1_wave = speaker1_wave / max_val

    return mixed_wave, speaker1_wave, adjusted_speaker2

ModuleNotFoundError: No module named 'sounddevice'

# Nate does some delicious testing here

In [None]:
# dataset = torchaudio.datasets.LIBRISPEECH(root="./data", url="train-clean-100", download=False)
waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id = dataset[0]

play_audio(waveform)

NameError: name 'play_audio' is not defined

**Experiment 1 - adjust loudness of target input and compare loss**

Goal: Assess how robust the model is when one speaker is significantly louder.

Design:

For each mixed sample, systematically vary the amplitude of one speaker:

 - Ratios: 1:1 (baseline), 2:1, 4:1, 8:1

Evaluate performance degradation or adaptation.

Hypotheses:

MSE may degrade linearly with imbalance.

CTC might maintain higher robustness if sequences are still distinguishable.

Experiment 2 - adjust loudness of background noise and compare loss (I think this may combine with experiments 1 and 4)

**Experiment 3 - compare efficiency on different languages**

Goal: Evaluate performance when one or both speakers speak a non-English language.

Design:

Use mixtures of English-English, English-Other (e.g., English-Spanish), and Other-Other.

Focus on tonal vs non-tonal languages for diversity.

Optional:

Augment training data with multilingual samples.

Try language embeddings if you're using a deeper pipeline.

Analysis:

Does the model performance degrade with unfamiliar phonetic structures?

Does it favor English content due to training bias?



**Experiment 4 - add non-human background noise and compare loss**

Goal: Evaluate robustness to noise interference (e.g., cafe noise, car engines, music).

Design:

Add non-human noise at varying SNRs (e.g., 30 dB, 20 dB, 10 dB).

Use both stationary and non-stationary noise.

Compare:

Performance with and without noise-aware preprocessing.

Try noise suppression front-ends or data augmentation with noise.

**Extra Experiments You Could Add**

Speaker permutation invariance: Does the model consistently output the same speaker when order varies?

Speaker gender and pitch: Investigate separation accuracy across gender combinations or pitch similarity.

Temporal length variation: Does it work on long conversations vs short utterances?

Window size sensitivity: Test how performance changes with different input chunk sizes.