<a href="https://colab.research.google.com/github/duotien/mono_pitch_tracker/blob/main/notebooks/mono_pitch_tracker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before code is executed, so edits to project files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Show the GPUs visible to this runtime. The command only exists where the
# NVIDIA tools are installed; elsewhere the shell reports "command not found".
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [3]:
%%capture
!git clone --recursive https://github.com/duotien/mono_pitch_tracker.git
!apt -qq install -y sox
!pip install pyyaml==5.4.1 sox librosa==0.9.2 torchinfo

In [4]:
# Colab-only step: `google.colab` is provided by the Colab runtime, so this
# import raises ModuleNotFoundError when the notebook runs anywhere else.
from google.colab import drive
# Mount Google Drive under /content/drive; later cells read from that path.
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [None]:
import shutil

# Archive location on the mounted Google Drive and where it is copied to
# on the runtime's local disk.
file_to_copy_path = '/content/drive/MyDrive/DataBase/MedleyDB/V1_MIX_AUDIO_ONLY.zip'
file_to_paste_path = '/content/V1_MIX_AUDIO_ONLY.zip'

print('Get file from google drive...')
# Kept as the last expression so the destination path returned by copy2 is
# displayed as the cell output.
shutil.copy2(file_to_copy_path, file_to_paste_path)

In [None]:
%%capture
# Extract the archive into the medleydb Audio directory of the cloned project.
# `-o` overwrites existing files without prompting, so the cell can be re-run.
!unzip -o V1_MIX_AUDIO_ONLY.zip -d /content/mono_pitch_tracker/medleydb/medleydb/data/Audio/

# Import

In [None]:
import sys
import os

# Directory the notebook was started from. The cloned project is expected in
# its `mono_pitch_tracker/` sub-directory, with `medleydb/` inside the project.
NOTEBOOK_DIR = os.getcwd()
MONO_PITCH_TRACKER_DIR = os.path.join(NOTEBOOK_DIR, 'mono_pitch_tracker/')
MEDLEYDB_DIR = os.path.join(MONO_PITCH_TRACKER_DIR, 'medleydb/')

# Put both directories on the import path so their packages can be imported
# from this notebook.
sys.path.extend([MONO_PITCH_TRACKER_DIR, MEDLEYDB_DIR])

In [None]:
# Change the working directory to `mono_pitch_tracker/` so that the relative
# paths used in later cells (e.g. `scripts/...`) resolve inside the project.
%cd $MONO_PITCH_TRACKER_DIR

In [None]:
# Third-party and project imports used by the rest of the notebook
import numpy as np
import pandas as pd
import yaml
import json
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

import medleydb
from pitch_tracker.utils.constants import (F_MIN, HOP_LENGTH, N_FFT, N_MELS,
                                           PICKING_FRAME_SIZE,
                                           PICKING_FRAME_STEP,
                                           PICKING_FRAME_TIME, SAMPLE_RATE,
                                           STEP_FRAME, STEP_TIME, WIN_LENGTH,
                                           N_CLASS, )
from pitch_tracker.utils.audio import load_audio_mono
from pitch_tracker.utils.files import get_file_name, list_file_paths_in_dir, list_folder_paths_in_dir

# from pitch_tracker.utils import dataset, files
from pitch_tracker.utils.medleydb_melody import gen_label
from pitch_tracker.utils import dataset
from pitch_tracker.utils.dataset import AudioDataset


  INST_TAXONOMY = yaml.load(fhandle)
  MIXING_COEFFICIENTS = yaml.load(fhandle)


# Prepare Dataset

In [None]:
# STFT hop size and frame step; together they name the dataset variant used
# by this notebook.
hop_size = 512
step_frame = 2

# Folder holding the pickled dataset for this hop-size/step-frame combination
# (presumably written by `scripts/prepare_dataset.py` - confirm in that script).
DATASET_DIR = f'{MONO_PITCH_TRACKER_DIR}/content/pickled_database/{hop_size}_{step_frame}'
# Anchor the split file to the project directory. The previous value
# ('../pitch_tracker/utils/data_split.json') was relative to the working
# directory, which has already been changed to the project root, so it
# pointed one level above the repository.
DATA_SPLIT_PATH = os.path.join(MONO_PITCH_TRACKER_DIR, 'pitch_tracker/utils/data_split.json')

In [None]:
# Prepare dataset: run the preprocessing script with the hop size and frame
# step configured above ({...} is interpolated from the Python variables).
!python scripts/prepare_dataset.py --stft_hop_size {hop_size} --step_frame {step_frame}

In [None]:
def prepare_dataset(split_by:str='ARTIST'):
    """Create the train, validation and test `AudioDataset` objects.

    Args:
        split_by: How songs are assigned to the three subsets.
            'SONG'   - random 60/20/20 split over the song folders found in
                       `DATASET_DIR`, with a fixed `random_state=1`.
            'ARTIST' - predefined split read from the JSON file at
                       `DATA_SPLIT_PATH` (keys 'train', 'validation', 'test').

    Returns:
        Tuple `(train_dataset, validation_dataset, test_dataset)`.

    Raises:
        ValueError: If `split_by` is neither 'SONG' nor 'ARTIST'.
    """
    valid_modes = ('SONG', 'ARTIST')
    if split_by not in valid_modes:
        # Fail fast: an unknown mode used to fall through both branches and
        # crash later with an UnboundLocalError on `train_set`.
        raise ValueError(
            f'Unknown split_by={split_by!r}; expected one of {valid_modes}')

    if split_by == 'SONG':
        # split 60/20/20: first hold out 40%, then halve the held-out part.
        dataset_paths = list(list_folder_paths_in_dir(DATASET_DIR))
        train_set, holdout_set = train_test_split(
            dataset_paths,
            test_size=0.40,
            random_state=1,
            shuffle=True)
        validation_set, test_set = train_test_split(
            holdout_set,
            test_size=0.50,
            random_state=1,
            shuffle=True)
    else:
        # Split by artist, used in:
        # https://github.com/dogacbasaran/ismir2018_dominant_melody_estimation/blob/master/random_dataset_splits/dataset-ismir-splits.json
        # However the `AimeeNorwich_Child` file is broken so there's only 108/109 songs available
        with open(DATA_SPLIT_PATH, 'r') as f:
            splits = json.load(f)
        train_set = [os.path.join(DATASET_DIR, song_name) for song_name in splits['train']]
        validation_set = [os.path.join(DATASET_DIR, song_name) for song_name in splits['validation']]
        test_set = [os.path.join(DATASET_DIR, song_name) for song_name in splits['test']]

    print(f'train_set: {len(train_set)}')
    print(f'validation_set: {len(validation_set)}')
    print(f'test_set: {len(test_set)}')

    train_dataset = AudioDataset(train_set)
    validation_dataset = AudioDataset(validation_set)
    test_dataset = AudioDataset(test_set)

    return train_dataset, validation_dataset, test_dataset

In [None]:
# No argument is passed, so the default 'ARTIST' split is used.
train_dataset, validation_dataset, test_dataset = prepare_dataset()

# Modeling & training

In [None]:
from pitch_tracker.ml.net import Audio_CNN, Audio_CRNN, create_conv2d_block, conv2d_output_shape
from pitch_tracker.ml.train_model import train_model
from pitch_tracker.ml.earlystopping import EarlyStopping
from torch.optim.lr_scheduler import ReduceLROnPlateau


import torch
from torch import nn
from torchinfo import summary

In [None]:
class Model_1_512_5(nn.Module):
    """Convolutional-recurrent network producing 88 outputs per pooled step.

    Pipeline: three blocks built by `create_conv2d_block`, a flatten of the
    trailing axes, a unidirectional GRU, a bidirectional GRU, a MaxPool1d
    (kernel 5) along the sequence axis and two lazily-sized linear layers
    (128 then 88 features).
    """

    def __init__(self):
        super().__init__()

        # Sub-modules are created in a fixed order; keep it, since it
        # determines the parameter order of the model.
        self.conv2d_block1 = create_conv2d_block(
            conv2d_input=(1, 128, 3), padding='same', maxpool_kernel_size=None)
        self.conv2d_block2 = create_conv2d_block(
            conv2d_input=(128, 128, 3), padding='same', maxpool_kernel_size=(1, 5))
        self.conv2d_block3 = create_conv2d_block(
            conv2d_input=(128, 64, 3), padding='same', maxpool_kernel_size=(1, 2))

        # Merges every axis from index 2 onwards into a single feature axis.
        self.flatten_layer = nn.Flatten(start_dim=2)

        # NOTE(review): per the PyTorch docs, `dropout` on a single-layer GRU
        # has no effect (it applies between stacked layers) - confirm whether
        # a separate dropout layer was intended.
        # `input_size=512` has to match the flattened feature size coming out
        # of the conv blocks - TODO confirm against the dataset's input shape.
        self.gru = nn.GRU(
            input_size=512, hidden_size=128,
            batch_first=True, bidirectional=False, dropout=0.2)
        self.gru_bidirectional = nn.GRU(
            input_size=128, hidden_size=64,
            batch_first=True, bidirectional=True, dropout=0.2)

        self.maxpool1d = nn.MaxPool1d(kernel_size=5)
        self.dense_layer = nn.LazyLinear(128)
        self.output_layer = nn.LazyLinear(88)

    def forward(self, x):
        """Run the network on `x` and return the output of the last linear layer.

        The original author's note gives the input layout as
        [batch, channel, n_frames, n_mel] - TODO confirm against the dataset.
        """
        for conv_block in (self.conv2d_block1, self.conv2d_block2, self.conv2d_block3):
            x = conv_block(x)

        # Move axis 1 to the end, then flatten the two trailing axes:
        # [b, c, t, m] -> [b, t, m, c] -> [b, t, m * c].
        x = self.flatten_layer(x.permute(0, 2, 3, 1))

        x, _ = self.gru(x)
        x, _ = self.gru_bidirectional(x)

        # MaxPool1d pools over the last axis, so swap axes 1 and 2 around the
        # pooling call to pool along the sequence axis.
        x = self.maxpool1d(x.transpose(1, 2)).transpose(1, 2)

        return self.output_layer(self.dense_layer(x))
        

In [None]:
class Model_2_512_5(nn.Module):
    """Wider variant of the convolutional-recurrent network (88 outputs per pooled step).

    Pipeline: three blocks built by `create_conv2d_block` (256, 128 and 64
    output channels going by the `conv2d_input` tuples - TODO confirm the
    tuple layout), a flatten of the trailing axes, a unidirectional GRU, a
    bidirectional GRU, a MaxPool1d (kernel 5) along the sequence axis and two
    lazily-sized linear layers (128 then 88 features).
    """

    def __init__(self):
        super().__init__()

        # Sub-modules are created in a fixed order; keep it, since it
        # determines the parameter order of the model.
        self.conv2d_block1 = create_conv2d_block(
            conv2d_input=(1, 256, 3), padding='same', maxpool_kernel_size=None)
        self.conv2d_block2 = create_conv2d_block(
            conv2d_input=(256, 128, 3), padding='same', maxpool_kernel_size=(1, 5))
        self.conv2d_block3 = create_conv2d_block(
            conv2d_input=(128, 64, 3), padding='same', maxpool_kernel_size=(1, 5))

        # Merges every axis from index 2 onwards into a single feature axis.
        self.flatten_layer = nn.Flatten(start_dim=2)

        # NOTE(review): per the PyTorch docs, `dropout` on a single-layer GRU
        # has no effect (it applies between stacked layers) - confirm whether
        # a separate dropout layer was intended.
        # `input_size=192` has to match the flattened feature size coming out
        # of the conv blocks - TODO confirm against the dataset's input shape.
        self.gru = nn.GRU(
            input_size=192, hidden_size=128,
            batch_first=True, bidirectional=False, dropout=0.2)
        self.gru_bidirectional = nn.GRU(
            input_size=128, hidden_size=128,
            batch_first=True, bidirectional=True, dropout=0.2)

        self.maxpool1d = nn.MaxPool1d(kernel_size=5)
        self.dense_layer = nn.LazyLinear(128)
        self.output_layer = nn.LazyLinear(88)

    def forward(self, x):
        """Run the network on `x` and return the output of the last linear layer.

        The original author's note gives the input layout as
        [batch, channel, n_frames, n_mel] - TODO confirm against the dataset.
        """
        for conv_block in (self.conv2d_block1, self.conv2d_block2, self.conv2d_block3):
            x = conv_block(x)

        # Move axis 1 to the end, then flatten the two trailing axes:
        # [b, c, t, m] -> [b, t, m, c] -> [b, t, m * c].
        x = self.flatten_layer(x.permute(0, 2, 3, 1))

        x, _ = self.gru(x)
        x, _ = self.gru_bidirectional(x)

        # MaxPool1d pools over the last axis, so swap axes 1 and 2 around the
        # pooling call to pool along the sequence axis.
        x = self.maxpool1d(x.transpose(1, 2)).transpose(1, 2)

        return self.output_layer(self.dense_layer(x))
        

In [None]:
class Model_1_512_2(nn.Module):
    """Convolutional-recurrent network producing 88 outputs per pooled step.

    Pipeline: three blocks built by `create_conv2d_block`, a flatten of the
    trailing axes, a unidirectional GRU, a bidirectional GRU, a MaxPool1d
    (kernel 5) along the sequence axis and two lazily-sized linear layers
    (128 then 88 features).

    NOTE(review): the layer configuration is identical to `Model_1_512_5`;
    confirm whether the two variants were meant to differ (e.g. in the
    pooling kernel).
    """

    def __init__(self):
        super().__init__()

        # Sub-modules are created in a fixed order; keep it, since it
        # determines the parameter order of the model.
        self.conv2d_block1 = create_conv2d_block(
            conv2d_input=(1, 128, 3), padding='same', maxpool_kernel_size=None)
        self.conv2d_block2 = create_conv2d_block(
            conv2d_input=(128, 128, 3), padding='same', maxpool_kernel_size=(1, 5))
        self.conv2d_block3 = create_conv2d_block(
            conv2d_input=(128, 64, 3), padding='same', maxpool_kernel_size=(1, 2))

        # Merges every axis from index 2 onwards into a single feature axis.
        self.flatten_layer = nn.Flatten(start_dim=2)

        # NOTE(review): per the PyTorch docs, `dropout` on a single-layer GRU
        # has no effect (it applies between stacked layers) - confirm whether
        # a separate dropout layer was intended.
        # `input_size=512` has to match the flattened feature size coming out
        # of the conv blocks - TODO confirm against the dataset's input shape.
        self.gru = nn.GRU(
            input_size=512, hidden_size=128,
            batch_first=True, bidirectional=False, dropout=0.2)
        self.gru_bidirectional = nn.GRU(
            input_size=128, hidden_size=64,
            batch_first=True, bidirectional=True, dropout=0.2)

        self.maxpool1d = nn.MaxPool1d(kernel_size=5)
        self.dense_layer = nn.LazyLinear(128)
        self.output_layer = nn.LazyLinear(88)

    def forward(self, x):
        """Run the network on `x` and return the output of the last linear layer.

        The original author's note gives the input layout as
        [batch, channel, n_frames, n_mel] - TODO confirm against the dataset.
        """
        for conv_block in (self.conv2d_block1, self.conv2d_block2, self.conv2d_block3):
            x = conv_block(x)

        # Move axis 1 to the end, then flatten the two trailing axes:
        # [b, c, t, m] -> [b, t, m, c] -> [b, t, m * c].
        x = self.flatten_layer(x.permute(0, 2, 3, 1))

        x, _ = self.gru(x)
        x, _ = self.gru_bidirectional(x)

        # MaxPool1d pools over the last axis, so swap axes 1 and 2 around the
        # pooling call to pool along the sequence axis.
        x = self.maxpool1d(x.transpose(1, 2)).transpose(1, 2)

        return self.output_layer(self.dense_layer(x))
        

In [None]:
model = Model_1_512_2()

# Shape of one model input with a leading batch axis of 1, taken from the
# first element of training sample 0 (presumably the feature tensor).
dummy_in_shape = [1, *train_dataset[0][0].shape]
dummy_in = torch.randn(dummy_in_shape)

print(f'Input size: {tuple(dummy_in.shape)}')
# This forward pass also gives the LazyLinear layers their input sizes.
print(f'Output size: {tuple(model(dummy_in).shape)}')

# The random tensor is no longer needed; only the shape is reused later.
del dummy_in

In [None]:
# Display torchinfo's summary of the model for the dummy input shape.
summary(model, dummy_in_shape)

In [None]:
# Pick the compute device: CUDA if available, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
print(f"Using {DEVICE} device")

# create config file
p = dict(
    # dataset
    batch_size=8,
    # fit
    n_epochs=100,
    learning_rate=1e-3,
    # early stopping
    es_patience=10,
    es_verbose=True,
    es_dir_path='./checkpoints',
    # lr scheduler
    ls_patience=4,
    ls_factor=0.2,
    # misc
    device=DEVICE,
)

# Persist the settings, keeping the key order used above.
with open('./scripts/config/model_config.yml', 'w') as f:
    yaml.dump(p, f, sort_keys=False)

In [None]:
# create config file
# This cell redefines `p` and rewrites model_config.yml, so these values
# replace the ones from the previous configuration cell.
p = {
    # dataset
    'batch_size': 16,
    # fit
    'n_epochs': 100,
    'learning_rate': 1e-3,
    # early stopping
    'es_patience': 10,
    'es_verbose': True,
    'es_dir_path': './checkpoints',
    # lr scheduler
    'ls_patience': 6,
    'ls_factor': 0.2,
    # misc
    # Use the detected device. The quoted string 'DEVICE' is not a valid torch
    # device name and makes the later `.to(p['device'])` calls fail.
    'device': DEVICE,
}

with open('./scripts/config/model_config.yml', 'w') as f:
    yaml.dump(p,f,sort_keys=False)

In [None]:
# Training batches are reshuffled at every epoch.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=p['batch_size'],
    shuffle=True
)

# Validation only evaluates the model, so the samples are served in a fixed
# order: shuffling brings no benefit and makes runs harder to compare.
validation_dataloader = DataLoader(
    validation_dataset,
    batch_size=p['batch_size'],
    shuffle=False
)

In [None]:
# BCE loss doesn't work well.
loss_fn = nn.CrossEntropyLoss().to(p['device'])
# The model is moved to the target device before the optimizer is created.
model = model.to(p['device'])

optimizer = torch.optim.Adam(model.parameters(), lr=p['learning_rate'])

# Early stopping and learning-rate reduction both take their patience from
# the config dict `p`.
early_stopping = EarlyStopping(
    patience=p['es_patience'], verbose=p['es_verbose'], dir_path=p['es_dir_path'])
lr_scheduler = ReduceLROnPlateau(
    optimizer=optimizer, patience=p['ls_patience'], factor=p['ls_factor'], verbose=True)

In [None]:
# Train the model. `train_model` returns the model together with the average
# train and validation losses (presumably one value per epoch - confirm in
# `pitch_tracker.ml.train_model`).
model, avg_train_losses, avg_validation_losses = train_model(
    model=model,
    train_dataloader=train_dataloader,
    validation_dataloader=validation_dataloader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    n_epochs=p['n_epochs'],
    early_stopping=early_stopping,
    lr_scheduler=lr_scheduler,
    device=p['device'],
)

In [None]:
# Train model
# ! python scripts/model.py

In [None]:
# Load a previously saved model from Google Drive; this replaces the `model`
# trained above.
# NOTE(review): torch.load unpickles Python objects - only load checkpoints
# that come from a trusted source.
# map_location='cpu' lets a checkpoint saved on a GPU load on a machine
# without one, and keeps the model on the same device as the CPU dummy input
# used for the ONNX export.
model = torch.load(
    '/content/drive/MyDrive/School work/KLTN/2022-2023/models/mpt_v01_20231028-2010.pth',
    map_location='cpu',
)

In [None]:
import torch

# Random example input of shape (8, 1, 1050, 88) that the exporter feeds
# through the model.
dummy_in = torch.randn(8, 1, 1050, 88)

# Names given to the graph inputs and outputs.
# NOTE(review): the 16 "learned_*" names look copied from a generic export
# example - confirm they match this model.
in_names = ["actual_input_1"]
in_names.extend("learned_%d" % i for i in range(16))
out_names = ["output1"]

torch.onnx.export(
    model,
    dummy_in,
    f="model.onnx",
    input_names=in_names,
    output_names=out_names,
    opset_version=7,
    verbose=True,
)