In [1]:
import IPython.display as ipd
import pytorch_lightning as pl
import torch

from mridangam.data import MridangamDataModule
from mridangam.loss import ReconstructionLoss
from mridangam.loss import StationaryRegularization
from mridangam.loss import TransientRegularization
from mridangam.models import MLP
from mridangam.models import TCN
from mridangam.tasks import MridangamTonicClassification
from mridangam.tasks import TransientStationarySeparation

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


# Mridangam Tonic Classification

For this first experiment I am using embeddings generated by Crepe with a simple
deep MLP classifier.

The first step is to preprocess our dataset of audio files and create a PyTorch
Lightning DataModule for our experiment. In this step we compute embeddings using
the full Crepe model and save the result embeddings to disk.

In [3]:
datamodule = MridangamDataModule(
    dataset_dir="dataset/preprocesed",
    unprocessed_dir="dataset/mridangam_stroke_1.5/",
    batch_size=8,
    num_workers=4,
    attribute="tonic",
    device=device,
)
datamodule.prepare_data()

Found 6977 audio files in dataset directory


100%|██████████| 6977/6977 [00:10<00:00, 642.89it/s]


In [4]:
datamodule.setup("fit")
train_dataloader = datamodule.train_dataloader()
val_dataloader = datamodule.val_dataloader()

In [5]:
# Get input feature size and target num_classes from data
audio, embedding, label = next(iter(train_dataloader))

in_features = embedding.size(-1)
print(in_features)

out_features = train_dataloader.dataset.num_classes
print(out_features)

print(label.dtype)

2048
6
torch.int64


In [6]:
mlp = MLP(in_features=in_features, hidden=[256, 128], out_features=out_features)
model = MridangamTonicClassification(model=mlp)

In [7]:
trainer = pl.Trainer(max_epochs=1000, accelerator=device)
trainer.fit(
    model=model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/jshier/development/mridangam-tss/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params
---------------------------------------------
0 | model   | MLP              | 558 K 
1 | loss_fn | CrossEntropyLoss | 0     
---------------------------------------------
558 K     Trainable params
0         Non-trainable params
558 K     Total params
2.233     Total estimated model params size (MB)


Epoch 18:  25%|██▍       | 194/786 [00:01<00:05, 104.86it/s, loss=0.0373, v_num=0] 

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [50]:
datamodule.setup("test")
test_dataloader = datamodule.test_dataloader()
trainer.test(model=model, dataloaders=test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 88/88 [00:00<00:00, 226.16it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test/loss           1.7945014238357544
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test/loss': 1.7945014238357544}]

In [51]:
# print(torch.cuda.memory_allocated())
# print(torch.cuda.memory_reserved())
# del test_dataloader
# del model
# del mlp
# torch.cuda.empty_cache()
# print(torch.cuda.memory_allocated())
# print(torch.cuda.memory_reserved())

0
2097152
0
0


In [52]:
print(len(test_dataloader.dataset))
audio, emb, label = test_dataloader.dataset[9]

print(label)
y = mlp(emb)
print(torch.argmax(y))

NameError: name 'test_dataloader' is not defined

# Mridangam Transient/Stationary Separation

In [6]:
tcn = TCN(
    in_channels=1,
    hidden_channels=32,
    out_channels=2,
    dilation_base=2,
    num_layers=10,
    kernel_size=13,
)

In [9]:
transient_loss = TransientRegularization(n_fft=2048, hop_size=512)
stationary_loss = StationaryRegularization(n_fft=2048, hop_size=512)
recon_loss = ReconstructionLoss()

tss_model = TransientStationarySeparation(
    tcn,
    reconstruction_loss=recon_loss,
    transient_loss=transient_loss,
    stationary_loss=stationary_loss,
    learning_rate=5e-4,
)

tss_model = tss_model.to(device)

In [10]:
audio, embedding, label = next(iter(train_dataloader))

t, s = tss_model(audio.to(device))
print(t.shape)
print(s.shape)

torch.Size([8, 1, 48000])
torch.Size([8, 1, 48000])


In [12]:
trainer = pl.Trainer(max_epochs=200, accelerator=device)
trainer.fit(model=tss_model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type                     | Params
----------------------------------------------------
0 | model  | TCN                      | 144 K 
1 | r_loss | ReconstructionLoss       | 0     
2 | t_loss | TransientRegularization  | 0     
3 | s_loss | StationaryRegularization | 0     
----------------------------------------------------
144 K     Trainable params
0         Non-trainable params
144 K     Total params
0.579     Total estimated model params size (MB)


Epoch 0:   3%|▎         | 23/786 [2:30:46<83:21:56, 393.34s/it, loss=10.8, v_num=8]
Epoch 199: 100%|██████████| 786/786 [01:52<00:00,  6.99it/s, loss=1.05, v_num=9] 

`Trainer.fit` stopped: `max_epochs=200` reached.


Epoch 199: 100%|██████████| 786/786 [01:52<00:00,  6.99it/s, loss=1.05, v_num=9]


In [37]:
train_iter = iter(train_dataloader)

In [38]:
audio, embedding, label = next(train_iter)
trans, sus = tss_model(audio.to(device))
y_hat = trans + sus

ipd.display(ipd.Audio(audio[0, 0].detach().cpu().numpy(), rate=48000))
ipd.display(ipd.Audio(trans[0, 0].detach().cpu().numpy(), rate=48000))
ipd.display(ipd.Audio(sus[0, 0].detach().cpu().numpy(), rate=48000))
ipd.display(ipd.Audio(y_hat[0, 0].detach().cpu().numpy(), rate=48000))

# Combining Approaches

Now we'll pass the embedding generated using Crepe to a FiLM operator that will
modulare each layer of the transient/stationary separation decoder.

In [None]:
film_encoder = MLP(in_features=2048, hidden=[256], out_features=128)
tcn_film = TCN(
    in_channels=1,
    hidden_channels=32,
    out_channels=2,
    dilation_base=2,
    num_layers=12,
    kernel_size=3,
    use_film=True,
    film_size=128,
)

In [None]:
transient_loss = TransientRegularization()
stationary_loss = StationaryRegularization()
recon_loss = ReconstructionLoss()

tss_film_model = TransientStationarySeparation(
    tcn_film,
    film_encoder=film_encoder,
    reconstruction_loss=recon_loss,
    transient_loss=transient_loss,
    stationary_loss=stationary_loss,
    learning_rate=1e-4,
)

In [None]:
audio, embedding, label = next(iter(train_dataloader))
print(embedding.shape)

t, s = tss_film_model(audio, embedding)
print(t.shape)
print(s.shape)

torch.Size([8, 1, 2048])
torch.Size([8, 1, 48000])
torch.Size([8, 1, 48000])


In [None]:
trainer = pl.Trainer(max_epochs=1000, accelerator=device)
trainer.fit(model=tss_film_model, train_dataloaders=train_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type                     | Params
----------------------------------------------------------
0 | model        | TCN                      | 149 K 
1 | r_loss       | MSS                      | 0     
2 | t_loss       | TransientRegularization  | 0     
3 | s_loss       | StationaryRegularization | 0     
4 | film_encoder | MLP                      | 557 K 
----------------------------------------------------------
707 K     Trainable params
0         Non-trainable params
707 K     Total params
2.829     Total estimated model params size (MB)


Epoch 0:   9%|▉         | 66/698 [00:10<01:41,  6.20it/s, loss=14.9, v_num=11]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
