In [1]:
# Standard library imports
import collections
import datetime
import glob
from itertools import product
import os

# Third party imports
from IPython import display
import numpy as np
import pandas as pd
import pretty_midi
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [23]:
_SAMPLING_RATE = 16000
midi_dir = 'maestro-v3.0.0'
npy_dir = 'maestro-numpy'
filenames = glob.glob(f'{midi_dir}/**/*.mid*')

sample_file = filenames[10]
pm = pretty_midi.PrettyMIDI(sample_file)

def display_audio(pm: pretty_midi.PrettyMIDI, seconds=30):
    waveform = pm.fluidsynth(fs=_SAMPLING_RATE)
    waveform_short = waveform[:seconds*_SAMPLING_RATE]
    return display.Audio(waveform_short, rate=_SAMPLING_RATE)

display_audio(pm)



# MIDI to NumPy
Extract sequences from the MIDI files and store them as NumPy arrays in npy files.

In [3]:
vocab_size = 128

def midi_to_notes(midi_file):
    pm = pretty_midi.PrettyMIDI(midi_file)
    instrument = pm.instruments[0]
    notes = collections.defaultdict(list)

    sorted_notes = sorted(instrument.notes, key=lambda note: note.start)
    prev_start = sorted_notes[0].start

    for note in sorted_notes:
        start = note.start
        end = note.end
        notes['pitch'].append(note.pitch)
        notes['start'].append(start)
        notes['end'].append(end)
        notes['step'].append(start - prev_start)
        notes['duration'].append(end - start)
        prev_start = start

    return pd.DataFrame({name: np.array(value) for name, value in notes.items()})

def get_sequences(seq_length):
    all_notes = []
    for f in filenames:
        notes = midi_to_notes(f)
        all_notes.append(notes)
    all_notes = pd.concat(all_notes)
    key_order = ['pitch', 'step', 'duration']
    data = np.stack([all_notes[key] for key in key_order], axis=1)
    data = torch.tensor(data)
    sequences = data.unfold(0, seq_length + 1, 1)

    inputs = sequences[:, :, :-1].numpy()
    labels = sequences[:, :, -1].numpy()
    return inputs, labels

def create_npy_datasets(seq_length):
    inputs_file = os.path.join(npy_dir, f'inputs.{seq_length}.npy')
    labels_file = os.path.join(npy_dir, f'labels.{seq_length}.npy')
    if os.path.exists(inputs_file) and os.path.exists(labels_file):
        print('npy datasets already exist, delete to re-generate... quitting')
        return
    if os.path.exists(inputs_file):
        os.unlink(inputs_file)
    if os.path.exists(labels_file):
        os.unlink(labels_file)
    if not os.path.exists(npy_dir):
        os.mkdir(npy_dir)
    inputs, labels = get_sequences(seq_length)
    with open(inputs_file, 'wb') as f:
        np.save(f, inputs)
    with open(labels_file, 'wb') as f:
        np.save(f, labels)

Create two datasets, for comparison.

In [4]:
create_npy_datasets(seq_length=16)
create_npy_datasets(seq_length=32)

npy datasets already exist, delete to re-generate... quitting
npy datasets already exist, delete to re-generate... quitting


# PyTorch Dataset and Dataloader
Loads the extract sequences.

In [4]:
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(device)

cuda:0


In [5]:
class MusicDataset(Dataset):
    def __init__(self, X, y, vocab_size, split='train'):
        train_end = val_start = round(X.shape[0] * 0.8)
        val_end = test_start = round(y.shape[0] * 0.9)
        if split == 'train':
            X = X[:train_end]
            y = y[:train_end]
        elif split == 'val':
            X = X[val_start:val_end]
            y = y[val_start:val_end]
        elif split == 'test':
            X = X[test_start:]
            y = y[test_start:]
        else:
            raise NotImplementedError()
            
        self.X = torch.tensor(X, device=device)
        self.y = torch.tensor(y, device=device)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        return x, y

In [7]:
# Load X, y out here instead of in the dataset class to save memory/time
with open(os.path.join(npy_dir, 'inputs.16.npy'), 'rb') as f:
    X = np.load(f)
with open(os.path.join(npy_dir, 'labels.16.npy'), 'rb') as f:
    y = np.load(f)

In [8]:
# Construct datasets and dataloaders
train_ds = MusicDataset(X, y, vocab_size, split='train')
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True, drop_last=True)
val_ds = MusicDataset(X, y, vocab_size, split='val')
val_dl = DataLoader(val_ds, batch_size=64, shuffle=True, drop_last=True)

# Free up memory
del X; del y

Convert tensor back to playable audio file.

In [6]:
def notes_to_midi(notes, out_file=None, instr_name='Acoustic Grand Piano', velocity=100):
    pm = pretty_midi.PrettyMIDI()
    instr = pretty_midi.Instrument(
        program=pretty_midi.instrument_name_to_program(instr_name)
    )
    prev_start = 0
    for i, note in notes.iterrows():
        start = float(prev_start + note['step'])
        end = float(start + note['duration'])
        note = pretty_midi.Note(
            velocity=velocity,
            pitch=int(note['pitch']),
            start=start,
            end=end
        )
        instr.notes.append(note)
        prev_start = start

    pm.instruments.append(instr)
    if out_file:
        pm.write(out_file)
    return pm

def tensor_to_midi(tensor):
    prev_start = 0
    notes = []
    for i in range(tensor.shape[1]):
        pitch, step, duration = tensor[:, i]
        pitch = pitch.item()
        step = step.item()
        duration = duration.item()
        start = prev_start + step
        end = start + duration
        note = (pitch, step, duration)
        notes.append((*note, start, end))
        prev_start = start
    notes = pd.DataFrame(notes, columns=('pitch', 'step', 'duration', 'start', 'end'))
    pm = notes_to_midi(notes)
    return pm

In [10]:
pm = tensor_to_midi(train_ds[1][0])
display_audio(pm)



# Transformer model

In [7]:
class MusicTransformer(nn.Module):
    def __init__(self, seq_length, vocab_size, 
                 num_decoder_layers, num_hidden_fc_layers, 
                 transformer_dropout, device):
        super().__init__()
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=3, nhead=3, dropout=transformer_dropout, 
            batch_first=True)
        self.decoder = nn.TransformerDecoder(
            decoder_layer, num_layers=num_decoder_layers)
        self.decoder = self.decoder.to(device=device).double()
        self.flatten = nn.Flatten()
        if num_hidden_fc_layers == 1:
            self.fcnn = nn.Sequential(
                nn.Linear(seq_length * 3, vocab_size + 2),
                nn.ReLU(),
                nn.Dropout(p=0.3),
                nn.Linear(vocab_size + 2, vocab_size + 2)
            )
        elif num_hidden_fc_layers == 2:
            self.fcnn = nn.Sequential(
                nn.Linear(seq_length * 3, vocab_size + 2),
                nn.ReLU(),
                nn.Dropout(p=0.3),
                nn.Linear(vocab_size + 2, vocab_size + 2),
                nn.ReLU(),
                nn.Dropout(p=0.3),
                nn.Linear(vocab_size + 2, vocab_size + 2)
            )
        else:
            raise NotImplementedError
        self.fcnn = self.fcnn.to(device=device).double()
        
    def forward(self, x, y):
        z = self.decoder(x, y)
        z = self.flatten(z)
        z = self.fcnn(z)
        return z

In [12]:
model = MusicTransformer(
    seq_length=16,
    vocab_size=vocab_size,
    num_decoder_layers=3,
    num_hidden_fc_layers=1,
    transformer_dropout=0.1,
    device=device
)

Sanity check.

In [13]:
for x, y in train_dl:
    break
x = x.moveaxis(-1, 1)
y = y.unsqueeze(1)
print(x.shape, y.shape)
with torch.no_grad():
    result = model(x, y)
print(result)
print(result.shape)

torch.Size([64, 16, 3]) torch.Size([64, 1, 3])
tensor([[-0.1291,  0.0723, -0.2718,  ..., -0.3469,  0.0498, -0.5381],
        [-0.1491, -0.0690, -0.0029,  ..., -0.1222,  0.1673, -0.6168],
        [-0.0047,  0.0458, -0.2076,  ..., -0.3636, -0.0101, -0.5937],
        ...,
        [-0.1722,  0.1734, -0.3336,  ..., -0.3354, -0.2484, -0.4472],
        [ 0.1827,  0.2892, -0.3723,  ..., -0.3800, -0.3705, -0.1495],
        [-0.1627,  0.0730, -0.3008,  ..., -0.2745,  0.0895, -0.6271]],
       device='cuda:0', dtype=torch.float64)
torch.Size([64, 130])


Functions to get a MIDI file from model outputs.

In [8]:
def predict_next_notes(x, y, model, temperature=0.1):
    x = x.unsqueeze(0).moveaxis(-1, 1)
    y = y.unsqueeze(0).unsqueeze(0)
    with torch.no_grad():
        preds = model(x, y)
    pitch_logits = preds[:, :vocab_size]
    if np.random.random() <= temperature:
        pitch = torch.randint(low=0, high=vocab_size+1, size=(1,))
    else:
        pitch = torch.argmax(F.softmax(pitch_logits, dim=0))
    step = preds[:, -2]
    duration = preds[:, -1]
    return pitch.item(), step.item(), duration.item()

def generate_sequence(num_preds, x, y, model, temperature):
    generated_notes = []
    prev_start = 0
    input_notes = x
    seq_length = input_notes.shape[1]
    for _ in range(num_preds):
        pitch, step, duration = predict_next_notes(x, y, model, temperature)
        start = prev_start + step
        end = start + duration
        input_note = (pitch, step, duration)
        generated_notes.append((*input_note, start, end))
        input_notes = input_notes[:, 1:]
        input_note = torch.tensor(input_note) / torch.tensor([seq_length, 1, 1])
        input_note = input_note.to(device=device).unsqueeze(1)
        input_notes = torch.cat((input_notes, input_note), dim=1)
        prev_start = start
    
    generated_notes = pd.DataFrame(generated_notes, columns=('pitch', 'step', 'duration', 'start', 'end'))
    pm = notes_to_midi(generated_notes)
    return pm

Sanity check.

In [15]:
pm = generate_sequence(120, *train_ds[0], model, temperature=0.1)

# Training the model

In [9]:
mse_criterion = nn.MSELoss()
ce_criterion = nn.CrossEntropyLoss()
loss_weights = {'ce': 0.5, 'mse': 0.5}

# Get current datetime
def current_time():
    return datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Custom loss function
def criterion(preds, true, weights):
    true = true.squeeze()
    ce_loss = ce_criterion(preds[:, :vocab_size], true[:, 0].long())
    ce_loss *= weights['ce']
    mse_loss = mse_criterion(preds[:, vocab_size:], true[:, 1:])
    mse_loss *= weights['mse']
    return ce_loss + mse_loss

# Gets loss over a validation/test dataloader
def evaluate_loss(model, test_dl, loss_weights):
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for i, (x, y) in enumerate(test_dl):
            x = x.moveaxis(-1, 1)
            y = y.unsqueeze(1)
            outputs = model(x, y)
            loss = criterion(outputs, y, loss_weights)
            test_loss += loss.item()
    test_loss /= (i + 1)
    model.train()
    return test_loss

# Training loop
def train(model, train_dl, valid_dl, loss_weights, num_epochs, learning_rate):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    best_val_loss = 9999
    stop = False
    model_path = ''
    model_dir = 'models'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    model.train()
    
    print('Started training')
    for epoch in range(num_epochs):
        if stop:
            break
        running_loss = 0.0
        
        for i, (x, y) in enumerate(train_dl):  
            optimizer.zero_grad()
            x = x.moveaxis(-1, 1)
            y = y.unsqueeze(1)
            outputs = model(x, y)
            loss = criterion(outputs, y, loss_weights)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] train loss: {running_loss / 2000:.3f}')
                running_loss = 0.0
                
            if i % 8000 == 7999:  # print every 8000 mini-batches
                val_loss = evaluate_loss(model, valid_dl, loss_weights)
                print(f'[{epoch + 1}, {i + 1:5d}] val loss: {val_loss:.3f}')
                timestamp = current_time()
                seq_length = x.shape[1]
                fname = f'music_transformer.{timestamp}.{seq_length}.pt'
                torch.save(model.state_dict(), os.path.join(model_dir, fname))
                print(f'Saved model to {os.path.join(model_dir, fname)}')
                if val_loss < best_val_loss:
                    print('new best!')
                    best_val_loss = val_loss
                    model_path = fname
                if val_loss > best_val_loss:
                    print('model performance decreasing... stopping early')
                    stop = True
                    break
                    
    print('Finished training')
    return model_path, best_val_loss

We try the following hyperparameter settings:
- Input sequence length: 16 vs. 32
- Number of transformer decoder layers: 3 vs. 4
- Number of hidden FC layers: 1 vs. 2
- Dropout in the transformer layers: 0.1 vs. 0.3
- Learning rate: $1^{-3}$ vs. $1^{-4}$

In [19]:
# Setup dataloaders
with open(os.path.join(npy_dir, 'inputs.16.npy'), 'rb') as f:
    X = np.load(f)
with open(os.path.join(npy_dir, 'labels.16.npy'), 'rb') as f:
    y = np.load(f)

train_ds = MusicDataset(X, y, vocab_size, split='train')
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True, drop_last=True)
val_ds = MusicDataset(X, y, vocab_size, split='val')
val_dl = DataLoader(val_ds, batch_size=64, shuffle=True, drop_last=True)

del X; del y

In [20]:
# Train the model with length-16 inputs
hparam_settings = []
learning_rates = [1e-3, 1e-4]
for tup in product([3,6],[1,2],[0.1,0.3]):
    setting = dict([
        ('num_decoder_layers', tup[0]),
        ('num_hidden_fc_layers', tup[1]),
        ('transformer_dropout', tup[2])
    ])
    hparam_settings.append(setting)

best16_val_loss = 9999
best16_model_path = ''
best16_hparams = {}
for lr in learning_rates:
    for hparams in hparam_settings:
        print(f'Learning rate = {lr}')
        print(hparams)
        model = MusicTransformer(    
            **hparams,
            seq_length=16,
            vocab_size=vocab_size,
            device=device
        )
        path, val_loss = train(
            model, 
            train_dl, 
            val_dl, 
            loss_weights, 
            num_epochs=1,
            learning_rate=lr
        )
        if val_loss < best16_val_loss:
            best16_model_path = path
            best16_val_loss = val_loss
            best16_hparams = hparams.copy()
            best16_hparams['lr'] = lr
        print()
            
print(f'Best validation loss: {best16_val_loss}')
print(f'Hyper parameter settings: {best16_hparams}')
print(f'Path to best model: {best16_model_path}')

Learning rate = 0.001
{'num_decoder_layers': 3, 'num_hidden_fc_layers': 1, 'transformer_dropout': 0.1}
Started training
[1,  2000] train loss: 1.517
[1,  4000] train loss: 1.014
[1,  6000] train loss: 0.957
[1,  8000] train loss: 0.917
[1,  8000] val loss: 18.627
Saved model to models/music_transformer.2022-03-08_12-07-14.16.pt
new best!
[1, 10000] train loss: 0.907
[1, 12000] train loss: 0.881
[1, 14000] train loss: 0.872
[1, 16000] train loss: 0.857
[1, 16000] val loss: 23.285
Saved model to models/music_transformer.2022-03-08_12-11-45.16.pt
model performance decreasing... stopping early
Finished training

Learning rate = 0.001
{'num_decoder_layers': 3, 'num_hidden_fc_layers': 1, 'transformer_dropout': 0.3}
Started training
[1,  2000] train loss: 2.092
[1,  4000] train loss: 2.080
[1,  6000] train loss: 2.077
[1,  8000] train loss: 2.080
[1,  8000] val loss: 2.076
Saved model to models/music_transformer.2022-03-08_12-16-16.16.pt
new best!
[1, 10000] train loss: 1.303
[1, 12000] train

[1, 22000] train loss: 2.018
[1, 24000] train loss: 1.639
[1, 24000] val loss: 2.472
Saved model to models/music_transformer.2022-03-08_14-56-59.16.pt
model performance decreasing... stopping early
Finished training

Learning rate = 0.0001
{'num_decoder_layers': 6, 'num_hidden_fc_layers': 1, 'transformer_dropout': 0.3}
Started training
[1,  2000] train loss: 2.140
[1,  4000] train loss: 2.086
[1,  6000] train loss: 2.080
[1,  8000] train loss: 2.081
[1,  8000] val loss: 2.089
Saved model to models/music_transformer.2022-03-08_15-05-51.16.pt
new best!
[1, 10000] train loss: 2.080
[1, 12000] train loss: 2.082
[1, 14000] train loss: 2.076
[1, 16000] train loss: 2.077
[1, 16000] val loss: 2.089
Saved model to models/music_transformer.2022-03-08_15-14-42.16.pt
model performance decreasing... stopping early
Finished training

Learning rate = 0.0001
{'num_decoder_layers': 6, 'num_hidden_fc_layers': 2, 'transformer_dropout': 0.1}
Started training
[1,  2000] train loss: 2.094
[1,  4000] train l

Now the length-32 sequences. Restart the notebook first, to clear out memory.

In [9]:
# Setup dataloaders
with open(os.path.join(npy_dir, 'inputs.32.npy'), 'rb') as f:
    X = np.load(f)
with open(os.path.join(npy_dir, 'labels.32.npy'), 'rb') as f:
    y = np.load(f)

train_ds = MusicDataset(X, y, vocab_size, split='train')
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True, drop_last=True)
val_ds = MusicDataset(X, y, vocab_size, split='val')
val_dl = DataLoader(val_ds, batch_size=64, shuffle=True, drop_last=True)

del X; del y

In [12]:
# Train the model with length-32 inputs
hparam_settings = []
learning_rates = [1e-3, 1e-4]
for tup in product([3,6],[1,2],[0.1,0.3]):
    setting = dict([
        ('num_decoder_layers', tup[0]),
        ('num_hidden_fc_layers', tup[1]),
        ('transformer_dropout', tup[2])
    ])
    hparam_settings.append(setting)

best32_val_loss = 9999
best32_model_path = ''
best32_hparams = {}
for lr in learning_rates:
    for hparams in hparam_settings:
        print(f'Learning rate = {lr}')
        print(hparams)
        model = MusicTransformer(    
            **hparams,
            seq_length=32,
            vocab_size=vocab_size,
            device=device
        )
        path, val_loss = train(
            model, 
            train_dl, 
            val_dl, 
            loss_weights, 
            num_epochs=1,
            learning_rate=lr
        )
        if val_loss < best32_val_loss:
            best32_model_path = path
            best32_val_loss = val_loss
            best32_hparams = hparams.copy()
            best32_hparams['lr'] = lr
        print()
            
print(f'Best validation loss: {best32_val_loss}')
print(f'Hyper parameter settings: {best32_hparams}')
print(f'Path to best model: {best32_model_path}')

Learning rate = 0.001
{'num_decoder_layers': 3, 'num_hidden_fc_layers': 1, 'transformer_dropout': 0.1}
Started training
[1,  2000] train loss: 1.186
[1,  4000] train loss: 0.848
[1,  6000] train loss: 0.775
[1,  8000] train loss: 0.750
[1,  8000] val loss: 25.160
Saved model to models/music_transformer.2022-03-08_16-03-48.32.pt
new best!
[1, 10000] train loss: 0.726
[1, 12000] train loss: 0.714
[1, 14000] train loss: 0.690
[1, 16000] train loss: 0.677
[1, 16000] val loss: 34.093
Saved model to models/music_transformer.2022-03-08_16-08-13.32.pt
model performance decreasing... stopping early
Finished training

Learning rate = 0.001
{'num_decoder_layers': 3, 'num_hidden_fc_layers': 1, 'transformer_dropout': 0.3}
Started training
[1,  2000] train loss: 1.648
[1,  4000] train loss: 1.156
[1,  6000] train loss: 1.040
[1,  8000] train loss: 0.985
[1,  8000] val loss: 15.476
Saved model to models/music_transformer.2022-03-08_16-12-39.32.pt
new best!
[1, 10000] train loss: 0.950
[1, 12000] trai

[1,  8000] train loss: 2.076
[1,  8000] val loss: 2.089
Saved model to models/music_transformer.2022-03-08_18-46-24.32.pt
new best!
[1, 10000] train loss: 2.078
[1, 12000] train loss: 2.079
[1, 14000] train loss: 2.009
[1, 16000] train loss: 1.799
[1, 16000] val loss: 4.745
Saved model to models/music_transformer.2022-03-08_18-55-08.32.pt
model performance decreasing... stopping early
Finished training

Learning rate = 0.0001
{'num_decoder_layers': 6, 'num_hidden_fc_layers': 2, 'transformer_dropout': 0.1}
Started training
[1,  2000] train loss: 1.806
[1,  4000] train loss: 1.255
[1,  6000] train loss: 1.114
[1,  8000] train loss: 1.027
[1,  8000] val loss: 16.925
Saved model to models/music_transformer.2022-03-08_19-03-54.32.pt
new best!
[1, 10000] train loss: 0.972
[1, 12000] train loss: 0.927
[1, 14000] train loss: 0.905
[1, 16000] train loss: 0.888
[1, 16000] val loss: 26.614
Saved model to models/music_transformer.2022-03-08_19-12-39.32.pt
model performance decreasing... stopping e

# Evaluate on test data
Check testing loss and generate some music. Again, restart the notebook to clear memory.

In [10]:
# Load length-16 testing dataset
with open(os.path.join(npy_dir, 'inputs.16.npy'), 'rb') as f:
    X = np.load(f)
with open(os.path.join(npy_dir, 'labels.16.npy'), 'rb') as f:
    y = np.load(f)
    
test_ds = MusicDataset(X, y, vocab_size=vocab_size, split='test')
test_dl = DataLoader(test_ds, batch_size=64, shuffle=True, drop_last=True)

del X; del y

Evaluate the model trained on length-16 sequences.

In [11]:
best16_hparams = {
    'num_decoder_layers': 3, 
    'num_hidden_fc_layers': 2,
    'transformer_dropout': 0.3
}
best16_model_path = 'models/music_transformer.2022-03-08_14-25-53.16.pt'

model = MusicTransformer(    
    **best16_hparams,
    seq_length=16,
    vocab_size=vocab_size,
    device=device
)
if os.path.exists(best16_model_path):
    model.load_state_dict(torch.load(best16_model_path))
model.eval()

MusicTransformer(
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=3, out_features=3, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=3, out_features=3, bias=True)
        )
        (linear1): Linear(in_features=3, out_features=2048, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=2048, out_features=3, bias=True)
        (norm1): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
        (dropout3): Dropout(p=0.3, inplace=False)
      )
      (1): TransformerDecoderLay

In [12]:
evaluate_loss(model, test_dl, loss_weights)

2.0609212708439624

In [13]:
pm = generate_sequence(120, *test_ds[0], model, temperature=0.3)
display_audio(pm)



Evaluate the model trained on length-32 sequences.

In [14]:
# Clear memory
del test_ds; del test_dl
del model

In [15]:
# Load length-32 testing dataset
with open(os.path.join(npy_dir, 'inputs.32.npy'), 'rb') as f:
    X = np.load(f)
with open(os.path.join(npy_dir, 'labels.32.npy'), 'rb') as f:
    y = np.load(f)
    
test_ds = MusicDataset(X, y, vocab_size=vocab_size, split='test')
test_dl = DataLoader(test_ds, batch_size=64, shuffle=True, drop_last=True)

del X; del y

In [16]:
best32_hparams = {
    'num_decoder_layers': 6, 
    'num_hidden_fc_layers': 2,
    'transformer_dropout': 0.3
}
best32_model_path = 'music_transformer.2022-03-08_19-21-25.32.pt'

model = MusicTransformer(    
    **best32_hparams,
    seq_length=32,
    vocab_size=vocab_size,
    device=device
)
if os.path.exists(best32_model_path):
    model.load_state_dict(torch.load(best32_model_path))
model.eval()

MusicTransformer(
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=3, out_features=3, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=3, out_features=3, bias=True)
        )
        (linear1): Linear(in_features=3, out_features=2048, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=2048, out_features=3, bias=True)
        (norm1): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
        (dropout3): Dropout(p=0.3, inplace=False)
      )
      (1): TransformerDecoderLay

In [17]:
evaluate_loss(model, test_dl, loss_weights)

2.5013409363677104

In [22]:
pm = generate_sequence(120, *test_ds[10000], model, temperature=0.3)
display_audio(pm)

