In [1]:
# Standard library imports
import collections
import datetime
import glob
import os

# Third party imports
from IPython import display
import numpy as np
import pandas as pd
import pretty_midi
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
_SAMPLING_RATE = 16000  # sampling rate passed to FluidSynth and to the audio widget
midi_dir = 'maestro-v3.0.0'
npy_dir = 'maestro-numpy'
# glob returns paths in arbitrary, file-system-dependent order. Sorting keeps the
# chosen sample file and the order in which files are concatenated into the
# dataset (and therefore its contiguous train/val/test split) reproducible.
filenames = sorted(glob.glob(f'{midi_dir}/**/*.mid*'))
if len(filenames) < 2:
    # Fail with an actionable message instead of a bare IndexError below.
    raise FileNotFoundError(
        f'expected at least 2 MIDI files under {midi_dir!r}, found {len(filenames)}'
    )

sample_file = filenames[1]
pm = pretty_midi.PrettyMIDI(sample_file)

def display_audio(pm: pretty_midi.PrettyMIDI, seconds=30):
    """Synthesize ``pm`` with FluidSynth and return an inline audio player.

    Args:
        pm: The MIDI object to render.
        seconds: How many seconds from the start of the rendering to keep.

    Returns:
        An ``IPython.display.Audio`` widget for the clipped waveform, played at
        ``_SAMPLING_RATE``.
    """
    rendered = pm.fluidsynth(fs=_SAMPLING_RATE)
    clip_end = seconds * _SAMPLING_RATE
    return display.Audio(rendered[:clip_end], rate=_SAMPLING_RATE)

# Listen to the start of the sample file (display_audio keeps 30 seconds by default).
display_audio(pm)



# MIDI to NumPy
Extract sequences from the MIDI files and store them as NumPy arrays in npy files.

In [3]:
# Number of pitch classes: the divisor used to scale pitch values in MusicDataset
# and the number of pitch logits produced by the model head.
vocab_size = 128

def midi_to_notes(midi_file):
    """Read the first instrument of a MIDI file into a per-note table.

    Notes are ordered by start time. ``step`` is the gap between a note's start
    and the previous note's start (0 for the first note) and ``duration`` is
    ``end - start``.

    Args:
        midi_file: MIDI file handed to ``pretty_midi.PrettyMIDI``.

    Returns:
        pd.DataFrame with columns ``pitch``, ``start``, ``end``, ``step`` and
        ``duration``, one row per note. A file without instruments or notes
        yields an empty frame with the same columns and numeric dtypes, so it
        can still be concatenated with the frames of other files.
    """
    pm = pretty_midi.PrettyMIDI(midi_file)
    # Previously a file with no instruments, or an empty first track, raised
    # IndexError; treat both as "no notes".
    raw_notes = pm.instruments[0].notes if pm.instruments else []
    sorted_notes = sorted(raw_notes, key=lambda note: note.start)

    pitch = np.array([note.pitch for note in sorted_notes], dtype=np.int64)
    start = np.array([note.start for note in sorted_notes], dtype=np.float64)
    end = np.array([note.end for note in sorted_notes], dtype=np.float64)
    # Prepending the first start time makes the first step exactly 0; for an
    # empty array the prepended slice is empty too, so the result stays empty.
    step = np.diff(start, prepend=start[:1])

    return pd.DataFrame({
        'pitch': pitch,
        'start': start,
        'end': end,
        'step': step,
        'duration': end - start,
    })

def get_sequences(seq_length):
    """Build sliding-window training examples from every file in ``filenames``.

    The per-file note tables are concatenated first and the windows are taken
    over the combined table with stride 1, so a window may span the boundary
    between two files.

    Args:
        seq_length: Number of notes in each input window.

    Returns:
        Tuple ``(inputs, labels)`` of NumPy arrays. ``inputs[k]`` holds the
        (pitch, step, duration) rows for ``seq_length`` consecutive notes and
        ``labels[k]`` holds the same three features for the note that follows.
    """
    feature_columns = ['pitch', 'step', 'duration']
    per_file_notes = [midi_to_notes(path) for path in filenames]
    corpus = pd.concat(per_file_notes)

    features = torch.tensor(
        np.stack([corpus[column] for column in feature_columns], axis=1)
    )
    # Each window carries seq_length inputs plus one trailing label note.
    windows = features.unfold(0, seq_length + 1, 1)

    return windows[:, :, :-1].numpy(), windows[:, :, -1].numpy()

def create_npy_datasets(seq_length):
    """Write the input/label arrays for ``seq_length`` into ``npy_dir``.

    Does nothing (apart from printing a notice) when both files already exist.
    Otherwise any lone leftover file is removed, ``npy_dir`` is created if it is
    missing, and both arrays are regenerated with ``get_sequences``.

    Args:
        seq_length: Window length; it is also embedded in the file names.
    """
    inputs_file = os.path.join(npy_dir, f'inputs.{seq_length}.npy')
    labels_file = os.path.join(npy_dir, f'labels.{seq_length}.npy')
    targets = (inputs_file, labels_file)

    already_there = [path for path in targets if os.path.exists(path)]
    if len(already_there) == len(targets):
        print('npy datasets already exist, delete to re-generate... quitting')
        return

    # A file without its partner is discarded so both are rewritten together.
    for leftover in already_there:
        os.unlink(leftover)
    if not os.path.exists(npy_dir):
        os.mkdir(npy_dir)

    arrays = get_sequences(seq_length)
    for path, array in zip(targets, arrays):
        with open(path, 'wb') as f:
            np.save(f, array)

Create two datasets, for comparison.

In [4]:
# Build (or reuse, if already on disk) the window arrays for both sequence lengths.
create_npy_datasets(seq_length=16)
create_npy_datasets(seq_length=32)

npy datasets already exist, delete to re-generate... quitting
npy datasets already exist, delete to re-generate... quitting


# PyTorch Dataset and Dataloader
Loads the extracted sequences.

In [5]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cuda:0


In [6]:
class MusicDataset(Dataset):
    """Contiguous train/val/test slice of the window arrays, held as tensors.

    The first 80% of the examples form the training split, the next 10% the
    validation split and the remainder the test split. Both arrays are copied
    to the module-level ``device`` on construction.

    Args:
        X: Input windows, indexed by example along axis 0.
        y: Labels aligned with ``X`` along axis 0.
        vocab_size: Divisor applied to the first feature row (pitch) of each
            input when an item is fetched; the other two rows are divided by 1.
        split: One of ``'train'``, ``'val'`` or ``'test'``.

    Raises:
        NotImplementedError: If ``split`` is not one of the names above.
    """

    def __init__(self, X, y, vocab_size, split='train'):
        first_cut = round(X.shape[0] * 0.8)
        second_cut = round(y.shape[0] * 0.9)
        bounds = {
            'train': slice(None, first_cut),
            'val': slice(first_cut, second_cut),
            'test': slice(second_cut, None),
        }
        if split not in bounds:
            raise NotImplementedError()
        window = bounds[split]

        self.X = torch.tensor(X[window], device=device)
        self.y = torch.tensor(y[window], device=device)
        # Column vector so it broadcasts across the sequence axis of one example.
        self.dividend = torch.tensor([vocab_size, 1., 1.], device=device).reshape((3, 1))

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        # Only the inputs are rescaled; labels are returned as stored.
        return self.X[idx] / self.dividend, self.y[idx]

In [7]:
# Read the length-16 arrays once here, rather than inside the dataset class,
# so that every split built below reuses the same in-memory copy.
X = np.load(os.path.join(npy_dir, 'inputs.16.npy'))
y = np.load(os.path.join(npy_dir, 'labels.16.npy'))

In [8]:
# Training and validation splits of the length-16 windows ...
train_ds = MusicDataset(X, y, vocab_size, split='train')
val_ds = MusicDataset(X, y, vocab_size, split='val')
# ... each served in shuffled batches of 64, dropping the final partial batch.
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True, drop_last=True)
val_dl = DataLoader(val_ds, batch_size=64, shuffle=True, drop_last=True)

# The datasets keep their own tensor copies, so the NumPy arrays can be released.
del X, y

Convert a tensor back into a playable audio file.

In [9]:
def notes_to_midi(notes, out_file=None, instr_name='Acoustic Grand Piano', velocity=100):
    """Turn a note table into a single-instrument ``pretty_midi.PrettyMIDI``.

    Start times are rebuilt by accumulating the ``step`` column from 0; each
    note ends ``duration`` after its start.

    Args:
        notes: DataFrame with ``pitch``, ``step`` and ``duration`` columns.
        out_file: If truthy, the MIDI data is also written to this path.
        instr_name: General MIDI instrument name used to pick the program.
        velocity: Velocity assigned to every note.

    Returns:
        The assembled ``pretty_midi.PrettyMIDI`` object.
    """
    program = pretty_midi.instrument_name_to_program(instr_name)
    track = pretty_midi.Instrument(program=program)

    onset = 0
    for pitch, step, duration in zip(notes['pitch'], notes['step'], notes['duration']):
        onset = float(onset + step)
        track.notes.append(
            pretty_midi.Note(
                velocity=velocity,
                pitch=int(pitch),
                start=onset,
                end=float(onset + duration),
            )
        )

    midi = pretty_midi.PrettyMIDI()
    midi.instruments.append(track)
    if out_file:
        midi.write(out_file)
    return midi

def tensor_to_midi(tensor, vocab_size=None):
    """Convert a ``(3, n_notes)`` array of (pitch, step, duration) rows to MIDI.

    Args:
        tensor: Two-dimensional tensor whose columns are individual notes.
        vocab_size: If truthy, pitches are multiplied by it before conversion
            (undoing a division by the same value).

    Returns:
        The ``pretty_midi.PrettyMIDI`` object built by ``notes_to_midi``.
    """
    rows = []
    onset = 0
    for column_index in range(tensor.shape[1]):
        pitch, step, duration = (value.item() for value in tensor[:, column_index])
        if vocab_size:
            pitch *= vocab_size
        onset = onset + step
        rows.append((pitch, step, duration, onset, onset + duration))

    frame = pd.DataFrame(rows, columns=('pitch', 'step', 'duration', 'start', 'end'))
    return notes_to_midi(frame)

In [10]:
# Round-trip check: turn the first training input back into MIDI (rescaling its
# pitches by vocab_size) and listen to it.
display_audio(tensor_to_midi(train_ds[0][0], vocab_size=vocab_size))



# Transformer model

In [23]:
class MusicTransformer(nn.Module):
    """Six-layer transformer decoder followed by a two-layer dense head.

    Every parameter lives on ``device`` in double precision.

    Args:
        seq_length: Number of notes per input window; fixes the width of the
            flattened decoder output fed to the dense head.
        vocab_size: Number of pitch logits. The head emits ``vocab_size + 2``
            values per example.
        device: Device the sub-modules are moved to.
    """

    def __init__(self, seq_length, vocab_size, device):
        super().__init__()
        num_features = 3  # width of one note vector (d_model)
        num_outputs = vocab_size + 2

        layer = nn.TransformerDecoderLayer(d_model=num_features, nhead=3, batch_first=True)
        self.decoder = nn.TransformerDecoder(layer, num_layers=6).to(device=device).double()
        self.flatten = nn.Flatten()
        self.fcnn = nn.Sequential(
            nn.Linear(seq_length * num_features, num_outputs),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(num_outputs, num_outputs),
        ).to(device=device).double()

    def forward(self, x, y):
        """Run the decoder with ``x`` as target and ``y`` as memory, then the head.

        Returns:
            Tensor with ``vocab_size + 2`` values per example.
        """
        return self.fcnn(self.flatten(self.decoder(x, y)))

In [24]:
# Instantiate an untrained model sized for the length-16 windows.
model = MusicTransformer(seq_length=16, vocab_size=vocab_size, device=device)

Sanity check.

In [25]:
# Pull one batch and push it through the model to confirm the shapes line up.
x, y = next(iter(train_dl))
x = x.moveaxis(-1, 1)  # put the sequence axis before the feature axis
y = y.unsqueeze(1)     # give the label a length-1 sequence axis
print(x.shape, y.shape)
with torch.no_grad():
    result = model(x, y)
print(result)
print(result.shape)

torch.Size([64, 16, 3]) torch.Size([64, 1, 3])
tensor([[-0.1412,  0.0871, -0.2941,  ...,  0.1089, -0.0416,  0.0232],
        [ 0.0202, -0.0767,  0.0965,  ...,  0.1827, -0.0832, -0.2043],
        [ 0.0184,  0.1316, -0.1203,  ...,  0.2069,  0.2923,  0.2618],
        ...,
        [-0.0763,  0.2151, -0.5328,  ...,  0.0254, -0.1217, -0.1594],
        [-0.2847,  0.3787, -0.1465,  ..., -0.0097,  0.1293, -0.0987],
        [ 0.0398,  0.2627, -0.3908,  ..., -0.0651,  0.1615,  0.0134]],
       device='cuda:0', dtype=torch.float64)
torch.Size([64, 130])


Functions to get a MIDI file from model outputs.

In [26]:
def predict_next_notes(x, y, model, temperature=0.1):
    """Predict one (pitch, step, duration) triple from a single example.

    Args:
        x: One input window shaped ``(3, seq_length)``.
        y: One note vector shaped ``(3,)``, forwarded as the model's second
            argument.
        model: Callable returning one row of pitch logits followed by two
            regression outputs (step, duration).
        temperature: Probability of ignoring the logits and drawing a uniformly
            random pitch instead of the most likely one.

    Returns:
        Tuple ``(pitch, step, duration)`` of Python numbers. ``pitch`` is an
        index into the pitch logits; ``step`` and ``duration`` are clamped to be
        non-negative so they always describe a valid note.
    """
    x = x.unsqueeze(0).moveaxis(-1, 1)
    y = y.unsqueeze(0).unsqueeze(0)
    with torch.no_grad():
        preds = model(x, y)

    # Everything except the last two outputs is a pitch logit. Indexing row 0
    # removes the size-1 batch axis: the previous softmax over that axis turned
    # every logit into 1.0, so argmax always returned pitch 0.
    pitch_logits = preds[0, :-2]
    if np.random.random() <= temperature:
        # randint's upper bound is exclusive, so this covers exactly the valid
        # pitch indices (the old `vocab_size + 1` bound could yield one too many).
        pitch = torch.randint(low=0, high=pitch_logits.shape[0], size=(1,))
    else:
        # softmax is monotonic, so argmax over the raw logits is equivalent.
        pitch = torch.argmax(pitch_logits)

    # The regression outputs are unconstrained; a negative step or duration
    # would place a note before its predecessor or end it before it starts.
    step = max(0.0, preds[0, -2].item())
    duration = max(0.0, preds[0, -1].item())
    return pitch.item(), step, duration

def generate_sequence(num_preds, x, y, model, temperature):
    """Autoregressively generate ``num_preds`` notes and return them as MIDI.

    Args:
        num_preds: Number of notes to generate.
        x: Seed input window shaped ``(3, seq_length)``, scaled like the
            dataset inputs (pitch divided by ``vocab_size``).
        y: Note vector shaped ``(3,)`` passed unchanged to every prediction.
        model: Model handed to ``predict_next_notes``.
        temperature: Random-pitch probability handed to ``predict_next_notes``.

    Returns:
        ``pretty_midi.PrettyMIDI`` built from the generated notes.
    """
    generated_notes = []
    prev_start = 0
    input_notes = x
    # Predicted pitches are raw indices and must be rescaled the same way
    # MusicDataset scales its inputs: pitch / vocab_size, step and duration
    # untouched. (The old code divided by the sequence length instead.)
    scale = torch.tensor(
        [vocab_size, 1, 1], dtype=input_notes.dtype, device=input_notes.device
    )
    for _ in range(num_preds):
        # Feed the *sliding* window; passing the original `x` here made every
        # prediction see the same seed and ignore the notes generated so far.
        pitch, step, duration = predict_next_notes(input_notes, y, model, temperature)
        start = prev_start + step
        end = start + duration
        generated_notes.append((pitch, step, duration, start, end))

        # Drop the oldest note and append the new one, keeping dtype and device
        # aligned with the window so the concatenation needs no conversion.
        next_note = torch.tensor(
            [pitch, step, duration], dtype=input_notes.dtype, device=input_notes.device
        ) / scale
        input_notes = torch.cat((input_notes[:, 1:], next_note.unsqueeze(1)), dim=1)
        prev_start = start

    # NOTE(review): `y` stays fixed for the whole generation while the window
    # moves on -- confirm this is the intended conditioning.
    generated_notes = pd.DataFrame(generated_notes, columns=('pitch', 'step', 'duration', 'start', 'end'))
    pm = notes_to_midi(generated_notes)
    return pm

Sanity check.

In [27]:
# Generate 120 notes seeded with the first training example, purely to check
# that the generation pipeline runs end to end.
pm = generate_sequence(120, *train_ds[0], model, temperature=0.1)

# Training the model
First we try the length-16 sequences, then the length-32 sequences.

In [28]:
# Loss components: cross-entropy for the pitch logits and mean-squared error
# for the remaining regression outputs.
ce_criterion = nn.CrossEntropyLoss()
mse_criterion = nn.MSELoss()
# Relative weight of each component in the combined loss.
loss_weights = dict(ce=0.3, mse=0.7)

def current_time():
    """Return the current local time formatted as ``YYYY-MM-DD_HH-MM-SS``."""
    now = datetime.datetime.now()
    return format(now, '%Y-%m-%d_%H-%M-%S')

# Custom loss function
def criterion(preds, true, weights):
    """Weighted sum of pitch cross-entropy and step/duration MSE.

    Args:
        preds: Model output, one row per example: ``vocab_size`` pitch logits
            followed by the regression outputs.
        true: Target notes whose last axis is (pitch, step, duration); any
            leading axes (e.g. a length-1 sequence axis) are flattened into the
            batch axis.
        weights: Mapping with the keys ``'ce'`` and ``'mse'``.

    Returns:
        Scalar tensor ``weights['ce'] * CE + weights['mse'] * MSE``.
    """
    # A bare squeeze() also removed the batch axis when the batch held a single
    # example, which broke the column indexing below. Reshaping to
    # (-1, n_features) keeps the batch axis whatever the batch size.
    true = true.reshape(-1, true.shape[-1])
    ce_loss = ce_criterion(preds[:, :vocab_size], true[:, 0].long())
    mse_loss = mse_criterion(preds[:, vocab_size:], true[:, 1:])
    # The unused `positive_pressure` term was dropped: it never entered the
    # returned loss, so removing it does not change the result.
    return weights['ce'] * ce_loss + weights['mse'] * mse_loss

# Gets loss over a validation/test dataloader
def evaluate_loss(model, test_dl, loss_weights):
    """Return the mean per-batch loss of ``model`` over ``test_dl``.

    The model is evaluated in eval mode with gradients disabled and is put back
    into whichever mode (train or eval) it was in when the function was called.

    Args:
        model: Module called as ``model(x, y)``.
        test_dl: Iterable of ``(x, y)`` batches.
        loss_weights: Weights forwarded to ``criterion``.

    Returns:
        Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If ``test_dl`` yields no batches.
    """
    # Remember the caller's mode: unconditionally switching back to train()
    # silently re-enabled dropout on models that had been put in eval mode.
    was_training = model.training
    model.eval()
    total_loss = 0.0
    num_batches = 0
    try:
        with torch.no_grad():
            for x, y in test_dl:
                x = x.moveaxis(-1, 1)
                y = y.unsqueeze(1)
                outputs = model(x, y)
                total_loss += criterion(outputs, y, loss_weights).item()
                num_batches += 1
    finally:
        model.train(was_training)

    if num_batches == 0:
        # Previously this surfaced as an unbound loop variable.
        raise ValueError('test_dl yielded no batches; cannot compute a mean loss')
    return total_loss / num_batches

# Training loop
def train(model, train_dl, valid_dl, loss_weights, num_epochs,
          lr=1e-3, log_every=2000, val_every=8000):
    """Train ``model`` with Adam, checkpointing and early-stopping on validation loss.

    Every ``val_every`` mini-batches the model is validated and its weights are
    saved to ``music_transformer.<timestamp>.<seq_length>.pt`` in the working
    directory. Training stops at the first validation loss that is worse than
    the best one seen so far.

    Args:
        model: Module called as ``model(x, y)``.
        train_dl: Iterable of training ``(x, y)`` batches.
        valid_dl: Iterable of validation batches, passed to ``evaluate_loss``.
        loss_weights: Weights forwarded to ``criterion``.
        num_epochs: Maximum number of passes over ``train_dl``.
        lr: Adam learning rate.
        log_every: Print the running training loss every this many mini-batches.
        val_every: Validate and checkpoint every this many mini-batches.

    Returns:
        Path of the checkpoint with the lowest validation loss, or ``''`` if no
        validation step was reached.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Infinity guarantees the first validation result is always accepted as the
    # best, however large it is (a finite sentinel such as 9999 does not).
    best_val_loss = float('inf')
    model_path = ''
    stop = False
    model.train()

    print('Started training')
    for epoch in range(num_epochs):
        running_loss = 0.0

        for i, (x, y) in enumerate(train_dl):
            optimizer.zero_grad()
            x = x.moveaxis(-1, 1)
            y = y.unsqueeze(1)
            outputs = model(x, y)
            loss = criterion(outputs, y, loss_weights)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if (i + 1) % log_every == 0:  # report the mean loss of the last log_every batches
                print(f'[{epoch + 1}, {i + 1:5d}] train loss: {running_loss / log_every:.3f}')
                running_loss = 0.0

            if (i + 1) % val_every == 0:  # validate, checkpoint and maybe stop early
                val_loss = evaluate_loss(model, valid_dl, loss_weights)
                print(f'[{epoch + 1}, {i + 1:5d}] val loss: {val_loss:.3f}')
                timestamp = current_time()
                seq_length = x.shape[1]
                fname = f'music_transformer.{timestamp}.{seq_length}.pt'
                torch.save(model.state_dict(), fname)
                print(f'Saved model to {fname}')
                if val_loss < best_val_loss:
                    print('new best!')
                    best_val_loss = val_loss
                    model_path = fname
                elif val_loss > best_val_loss:
                    print('model performance decreasing... stopping early')
                    stop = True
                    break

        if stop:
            break

    print('Finished training')
    return model_path

In [29]:
# Train a fresh length-16 model for one epoch; `best16` receives the checkpoint
# path returned by train().
model = MusicTransformer(seq_length=16, vocab_size=vocab_size, device=device)
best16 = train(model, train_dl, val_dl, loss_weights, num_epochs=1)

Started training
[1,  2000] train loss: 1.070
[1,  4000] train loss: 0.712
[1,  6000] train loss: 0.614
[1,  8000] train loss: 0.559
[1,  8000] val loss: 19.104
Saved model to music_transformer.2022-03-05_12-01-45.16.pt
new best!
[1, 10000] train loss: 0.527
[1, 12000] train loss: 0.512
[1, 14000] train loss: 0.504
[1, 16000] train loss: 0.490
[1, 16000] val loss: 25.042
Saved model to music_transformer.2022-03-05_12-10-41.16.pt
model performance decreasing... stopping early
Finished training


Now the length-32 sequences.

In [30]:
# Swap in the length-32 windows and rebuild the train/validation loaders.
X = np.load(os.path.join(npy_dir, 'inputs.32.npy'))
y = np.load(os.path.join(npy_dir, 'labels.32.npy'))

train_ds = MusicDataset(X, y, vocab_size, split='train')
val_ds = MusicDataset(X, y, vocab_size, split='val')
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True, drop_last=True)
val_dl = DataLoader(val_ds, batch_size=64, shuffle=True, drop_last=True)

# The datasets keep their own tensor copies, so the NumPy arrays can be released.
del X, y

In [31]:
# Fresh model whose dense head is sized for length-32 windows.
model = MusicTransformer(seq_length=32, vocab_size=vocab_size, device=device)

In [32]:
# Train for one epoch; `best32` receives the checkpoint path returned by train().
best32 = train(model, train_dl, val_dl, loss_weights, num_epochs=1)

Started training
[1,  2000] train loss: 0.862
[1,  4000] train loss: 0.502
[1,  6000] train loss: 0.475
[1,  8000] train loss: 0.445
[1,  8000] val loss: 14.005
Saved model to music_transformer.2022-03-05_12-20-17.32.pt
new best!
[1, 10000] train loss: 0.428
[1, 12000] train loss: 0.411
[1, 14000] train loss: 0.399
[1, 16000] train loss: 0.384
[1, 16000] val loss: 17.316
Saved model to music_transformer.2022-03-05_12-29-04.32.pt
model performance decreasing... stopping early
Finished training


# Evaluate on test data
Check testing loss and generate some music.

In [33]:
# Held-out test split of the length-16 windows.
X = np.load(os.path.join(npy_dir, 'inputs.16.npy'))
y = np.load(os.path.join(npy_dir, 'labels.16.npy'))

test_ds = MusicDataset(X, y, vocab_size=vocab_size, split='test')
test_dl = DataLoader(test_ds, batch_size=64, shuffle=True, drop_last=True)

# The dataset keeps its own tensor copies, so the NumPy arrays can be released.
del X, y

Evaluate the model trained on length-16 sequences.

In [34]:
model = MusicTransformer(seq_length=16, vocab_size=vocab_size, device=device)
if os.path.exists(best16):
    # map_location keeps the load working even when the checkpoint was written
    # from a different device than the one in use now.
    model.load_state_dict(torch.load(best16, map_location=device))
else:
    # Without a checkpoint the numbers below describe randomly initialised
    # weights; say so instead of silently evaluating an untrained model.
    print(f'WARNING: checkpoint {best16!r} not found; evaluating an untrained model')
model.eval()

MusicTransformer(
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=3, out_features=3, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=3, out_features=3, bias=True)
        )
        (linear1): Linear(in_features=3, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=3, bias=True)
        (norm1): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (dropout3): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerDecoderLay

In [35]:
# Mean per-batch weighted loss of the length-16 model on the test split.
evaluate_loss(model, test_dl, loss_weights)

19.22366668011664

In [41]:
# evaluate_loss() may leave the model in training mode; switch back to eval mode
# so dropout is disabled while generating.
model.eval()
pm = generate_sequence(120, *test_ds[0], model, temperature=0.1)
display_audio(pm)



Evaluate the model trained on length-32 sequences.

In [37]:
# Held-out test split of the length-32 windows.
X = np.load(os.path.join(npy_dir, 'inputs.32.npy'))
y = np.load(os.path.join(npy_dir, 'labels.32.npy'))

test_ds = MusicDataset(X, y, vocab_size=vocab_size, split='test')
test_dl = DataLoader(test_ds, batch_size=64, shuffle=True, drop_last=True)

# The dataset keeps its own tensor copies, so the NumPy arrays can be released.
del X, y

In [38]:
model = MusicTransformer(seq_length=32, vocab_size=vocab_size, device=device)
if os.path.exists(best32):
    # map_location keeps the load working even when the checkpoint was written
    # from a different device than the one in use now.
    model.load_state_dict(torch.load(best32, map_location=device))
else:
    # Without a checkpoint the numbers below describe randomly initialised
    # weights; say so instead of silently evaluating an untrained model.
    print(f'WARNING: checkpoint {best32!r} not found; evaluating an untrained model')
model.eval()

MusicTransformer(
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=3, out_features=3, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=3, out_features=3, bias=True)
        )
        (linear1): Linear(in_features=3, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=3, bias=True)
        (norm1): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (dropout3): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerDecoderLay

In [40]:
# Mean per-batch weighted loss of the length-32 model on the test split.
evaluate_loss(model, test_dl, loss_weights)

14.092101079217596

In [42]:
# evaluate_loss() may leave the model in training mode; switch back to eval mode
# so dropout is disabled while generating.
model.eval()
pm = generate_sequence(120, *test_ds[100], model, temperature=0.1)
display_audio(pm)

