In [21]:
from src.lightning_modules import FM
from src.networks import StableDiffusionXL, MediumUNet

# Backbone network handed to FM as its `model`.
unet = MediumUNet()
# Encoder/decoder handed to FM as its `encoder_decoder`.
encoder = StableDiffusionXL()

# Wire the two components into the FM lightning module.
# NOTE(review): FM presumably stands for flow matching — confirm in src.lightning_modules.
fm = FM(model=unet, encoder_decoder=encoder)

Loaded VAE model stabilityai/stable-diffusion-xl-base-1.0


In [1]:
from src.networks import SmallUNet, MediumUNet, LargeUNet

def _count_params(module):
    """Return the total element count over everything in ``module.parameters()``."""
    total = 0
    for param in module.parameters():
        total += param.numel()
    return total


# Instantiate the three UNet variants (small, then medium, then large).
small_unet, medium_unet, large_unet = SmallUNet(), MediumUNet(), LargeUNet()

# Tally the parameters of each variant.
small_unet_params = _count_params(small_unet)
medium_unet_params = _count_params(medium_unet)
large_unet_params = _count_params(large_unet)

# Report the totals, one line per variant.
for report_line in (
    f"Small UNet: {small_unet_params}",
    f"Medium UNet: {medium_unet_params}",
    f"Large UNet: {large_unet_params}",
):
    print(report_line)

  from .autonotebook import tqdm as notebook_tqdm


Small UNet: 95092228
Medium UNet: 163560068
Large UNet: 262387588


In [1]:
import IPython.display as ipd
from src.dataset import (
    VoxCeleb, 
    EarsGender, 
    GenderAudioDataset, 
    CREMAD, 
    SampleVoiceData, 
    JLCorpus, 
    LibriSpeech,
    MSSNSD,
)
import random
from src.networks import PretrainedMimi
import torch

# Codec used below to encode and then decode each previewed sample.
encoder = PretrainedMimi()
gender = 'male'

# Build each individual dataset and print its length.
vox_dataset = VoxCeleb(gender=gender)
print("Length of VoxCeleb dataset: ", len(vox_dataset))

ears_dataset = EarsGender(gender=gender)
print("Length of Ears dataset: ", len(ears_dataset))

cremad = CREMAD(gender=gender)
print("Length of CREMAD dataset: ", len(cremad))

sample_voice = SampleVoiceData(gender=gender)
print("Length of SampleVoice dataset: ", len(sample_voice))

jl_corupus = JLCorpus(gender=gender)
print("Length of JLCorpus dataset: ", len(jl_corupus))

libri = LibriSpeech(gender)
print("Length of LibriSpeech dataset: ", len(libri))

mssnd = MSSNSD('train', 'clean')
print("Length of MSSNSD dataset: ", len(mssnd))

gender_dataset = GenderAudioDataset(gender=gender)
print("Length of gender dataset: ", len(gender_dataset))

# Play five random samples, each followed by its encode/decode reconstruction.
for _ in range(5):
    # Draw the index from the dataset that is actually indexed below.
    # random.randrange excludes the upper bound, so the index is always in
    # range; random.randint would include len(...) itself.
    rand_idx = random.randrange(len(gender_dataset))
    gender_sample = gender_dataset[rand_idx]
    # Inference only: no gradients are needed for the round trip.
    with torch.no_grad():
        # unsqueeze(0) adds a leading dimension before encoding
        # (presumably the batch axis the codec expects — confirm).
        encoded = encoder.encode(gender_sample.unsqueeze(0))
        decoded = encoder.decode(encoded)
    # Original first, reconstruction second.
    ipd.display(ipd.Audio(gender_sample, rate=gender_dataset.sample_rate))
    ipd.display(ipd.Audio(decoded.squeeze(), rate=gender_dataset.sample_rate))

  from .autonotebook import tqdm as notebook_tqdm
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)


Loaded Mimi model
Length of VoxCeleb dataset:  3682
Length of Ears dataset:  6450
Length of CREMAD dataset:  3930
Length of SampleVoice dataset:  51
Length of JLCorpus dataset:  2420
Length of LibriSpeech dataset:  14197
Length of MSSNSD dataset:  23075
Length of gender dataset:  30730


In [None]:
from src.dataset import LibriSpeech, FSDNoisy18k, SpeechNoiseDataset, BaseConcatAudio
import IPython.display as ipd

# Values shared by both BaseConcatAudio wrappers and by audio playback.
# NOTE(review): the two positional arguments are presumably a clip length in
# seconds and a sample rate in Hz — confirm against BaseConcatAudio.
SEGMENT_LENGTH = 2.5
SAMPLE_RATE = 16000
NUM_PREVIEWS = 10

# Source datasets.
libri = LibriSpeech('male')
fsd = FSDNoisy18k('train')

# Wrap each source, then pair the speech and noise wrappers.
speech = BaseConcatAudio([libri], SEGMENT_LENGTH, SAMPLE_RATE)
noise = BaseConcatAudio([fsd], SEGMENT_LENGTH, SAMPLE_RATE)
noisy_speech = SpeechNoiseDataset(speech, noise)

# Play both elements of the first few items, with a separator after each item.
separator = "-" * 20
for item_idx in range(NUM_PREVIEWS):
    first_clip, second_clip = noisy_speech[item_idx]
    for clip in (first_clip, second_clip):
        ipd.display(ipd.Audio(clip, rate=SAMPLE_RATE))
    print(separator)

In [3]:
from src.dataset import MSSNSD, BaseConcatAudio, WHAM
import IPython.display as ipd
# Rate used both for the wrapper and for playback.
# NOTE(review): the third BaseConcatAudio argument is presumably the sample
# rate in Hz — confirm against its signature.
WHAM_RATE = 16000

# Load the WHAM 'train' data and wrap it in a concatenated-audio dataset.
wham = WHAM('train')
wham_dataset = BaseConcatAudio([wham], 5.0, WHAM_RATE)

# Play the first item.
first_item = wham_dataset[0]
player = ipd.Audio(first_item, rate=WHAM_RATE)
ipd.display(player)