In [1]:
import sys
sys.path.append("D:\projects\python\AMT")

In [2]:
import torch
from src.models.dataset import MaestroDataset
from src.models.multi_pitch_estimator import MultiPitchEstimator
from omegaconf import OmegaConf
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

In [3]:
cfg = OmegaConf.load("../src/configs/model_config.yaml")

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [5]:
train_dataset = MaestroDataset(hdf5_path='../'+cfg.data.hdf5_path, split="train")
val_dataset = MaestroDataset(
        hdf5_path='../'+cfg.data.hdf5_path, split="validation")

In [6]:
train_loader = DataLoader(
        train_dataset, batch_size=cfg.batch_size, shuffle=True, pin_memory=True)
val_loader = DataLoader(
        val_dataset, batch_size=cfg.batch_size, shuffle=False)

In [7]:
loader = iter(train_loader)

In [8]:
batch = next(loader)

In [9]:
cqt, pianoroll = batch
print(cqt.shape, pianoroll.shape)

torch.Size([32, 360, 1, 288, 5]) torch.Size([32, 360, 88])


In [10]:
model = MultiPitchEstimator(
        kernel1_size=(cfg.kernel1_size_x, cfg.kernel1_size_y),
        out_channels1=cfg.out_channels1,
        max_pool_kernel1=(cfg.max_pool_kernel1_x,
                          cfg.max_pool_kernel1_y),
        kernel2_size=(cfg.kernel2_size_x, cfg.kernel2_size_y),
        out_channels2=cfg.out_channels2,
        max_pool_kernel2=(cfg.max_pool_kernel2_x,
                          cfg.max_pool_kernel2_y),
        lstm1_hidden_size=cfg.lstm1_hidden_state,
        dropout_size=cfg.dropout_size,
        lstm2_hidden_size=cfg.lstm2_hidden_state
    ).to(device)
criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy loss
optimizer = optim.Adam(
    model.parameters(),
    lr=cfg.lr,
    weight_decay=cfg.weight_decay
)

In [11]:
model.train()

MultiPitchEstimator(
  (cnn): Sequential(
    (0): Conv2d(1, 32, kernel_size=(10, 2), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(4, 2), stride=(4, 2), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 2), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
  )
  (lstm1): LSTM(2112, 500, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.75, inplace=False)
  (lstm2): LSTM(1000, 200, batch_first=True, bidirectional=True)
  (fc): Sequential(
    (0): Linear(in_features=400, out_features=88, bias=True)
    (1): Sigmoid()
  )
)

In [13]:
optimizer.zero_grad()

In [17]:
cqt.shape, pianoroll.shape

(torch.Size([32, 360, 1, 288, 5]), torch.Size([32, 360, 88]))

In [12]:
output = model(cqt)

In [18]:
output.shape

torch.Size([32, 360, 88])

In [20]:
preds = torch.sigmoid(output) > 0.5

In [21]:
preds.shape

torch.Size([32, 360, 88])

In [37]:
def check(preds):
    count = 0
    for i in range(len(preds)):
        for j in range(len(preds[i])):
            for k in range(len(preds[i][j])):
                if preds[i][k][k]:
                    count += 1
    
    return count

check(pianoroll)

29520

In [28]:
(preds==pianoroll).sum().item()

27760

In [24]:
pianoroll.numel()

1013760