In [1]:
import torch
import torchaudio

from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


from pitch_tracker.utils import dataset
from pitch_tracker.utils.dataset import AudioDataset
from pitch_tracker.utils.constants import (F_MIN, HOP_LENGTH, N_FFT, N_MELS,
                                           PICKING_FRAME_SIZE,
                                           PICKING_FRAME_STEP,
                                           PICKING_FRAME_TIME, SAMPLE_RATE,
                                           STEP_FRAME, STEP_TIME, WIN_LENGTH,
                                           N_CLASS, )
from pitch_tracker.utils import files

  INST_TAXONOMY = yaml.load(fhandle)
  MIXING_COEFFICIENTS = yaml.load(fhandle)


In [2]:
DATASET_DIR = '../content/pickled_database/'


dataset_paths = list(files.list_folder_paths_in_dir(DATASET_DIR))
train_set, validation_set = train_test_split(dataset_paths, test_size=0.40, random_state=1, shuffle=True)
validation_set, test_set = train_test_split(validation_set, test_size=0.50, random_state=1, shuffle=True)
print(f'train_song_set: {len(train_set)}')
print(f'validation_song_set: {len(validation_set)}')
print(f'test_song_set: {len(test_set)}')

train_song_set: 64
validation_song_set: 21
test_song_set: 22


In [3]:
train_dataset = AudioDataset(train_set)
validation_set = AudioDataset(validation_set)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
validation_dataloader = DataLoader(validation_set, batch_size=8, shuffle=True)

In [4]:
device = "cuda" if torch.cuda.is_available() \
    else "mps" if torch.backends.mps.is_available() \
    else "cpu"
print(f"Using {device} device")

Using mps device


In [55]:
from collections import OrderedDict
from typing import Tuple, Union


def create_conv2d_block(
        conv2d_input: Tuple[int,int,Union[Tuple[int,int], int]],
        maxpool_kernel_size: Union[Tuple[int,int], int, None],):
    in_channels, out_channels, (kernel_size) = conv2d_input
    
    conv2d = nn.Conv2d(in_channels, out_channels, kernel_size)
    relu = nn.ReLU()
    batch_norm = nn.BatchNorm2d(out_channels)
    maxpool_2d = nn.MaxPool2d(maxpool_kernel_size) if maxpool_kernel_size else None
    
    conv2d_block = nn.Sequential(
        OrderedDict([
            ('conv2d', conv2d),
            ('relu', relu),
            ('batch_norm', batch_norm),  
        ])
    )

    if maxpool_2d:
        conv2d_block.add_module('maxpool2d', maxpool_2d)
    
    return conv2d_block

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.conv2d_block1 = create_conv2d_block(
            conv2d_input=(1,64,3),
            maxpool_kernel_size=3,
        )
        
        self.conv2d_block2 = create_conv2d_block(
            conv2d_input=(64,64,3),
            maxpool_kernel_size=3,
        )

        self.conv2d_block2 = create_conv2d_block(
            conv2d_input=(64,64,3),
            maxpool_kernel_size=3,
        )
        self.dense_layer = nn.LazyLinear(128)
        self.output_layer = nn.Linear(128, 88)

    def forward(self, x):
        x = self.conv2d_block1(x)
        x = self.conv2d_block2(x)
        x = self.conv2d_block3(x)
        x = torch.flatten(x)
        x = self.dense_layer(x)
        out = self.output_layer(x)

        return out

model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (conv2d_block1): Sequential(
    (conv2d): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1))
    (relu): ReLU()
    (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (maxpool2d): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2d_block2): Sequential(
    (conv2d): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (relu): ReLU()
    (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (maxpool2d): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
)


In [57]:
sample_feature, sample_label = next(iter(train_dataloader))
model(sample_feature.to(device)).shape

torch.Size([8, 64, 115, 8])

In [53]:
conv2d_stack = create_conv2d_block((1,64,3), 3)

In [54]:
conv2d_stack(sample_feature).shape

torch.Size([8, 64, 349, 28])

In [52]:
conv2d_stack

Sequential(
  (conv2d): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (relu): ReLU()
  (batch_norm): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (maxpool2d): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
)