In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
print('Installing dependencies...')
!apt-get update -qq && apt-get install -qq libfluidsynth1 fluid-soundfont-gm build-essential libasound2-dev libjack-dev
!pip install -qU pyfluidsynth pretty_midi

# Hack to allow python to pick up the newly-installed fluidsynth lib. 
# This is only needed for the hosted Colab environment.
import ctypes.util
orig_ctypes_util_find_library = ctypes.util.find_library
def proxy_find_library(lib):
  if lib == 'fluidsynth':
    return 'libfluidsynth.so.1'
  else:
    return orig_ctypes_util_find_library(lib)
ctypes.util.find_library = proxy_find_library

Installing dependencies...
Selecting previously unselected package fluid-soundfont-gm.
(Reading database ... 155320 files and directories currently installed.)
Preparing to unpack .../fluid-soundfont-gm_3.1-5.1_all.deb ...
Unpacking fluid-soundfont-gm (3.1-5.1) ...
Selecting previously unselected package libfluidsynth1:amd64.
Preparing to unpack .../libfluidsynth1_1.1.9-1_amd64.deb ...
Unpacking libfluidsynth1:amd64 (1.1.9-1) ...
Setting up fluid-soundfont-gm (3.1-5.1) ...
Setting up libfluidsynth1:amd64 (1.1.9-1) ...
Processing triggers for libc-bin (2.27-3ubuntu1.3) ...
/sbin/ldconfig.real: /usr/local/lib/python3.7/dist-packages/ideep4py/lib/libmkldnn.so.0 is not a symbolic link

[K     |████████████████████████████████| 5.6 MB 5.3 MB/s 
[K     |████████████████████████████████| 51 kB 8.2 MB/s 
[?25h  Building wheel for pretty-midi (setup.py) ... [?25l[?25hdone


In [19]:
import pandas as pd
import collections
import numpy as np
import pretty_midi
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython import display
from typing import Tuple
from torch.utils.data import Dataset, DataLoader

In [4]:
# constants
seq_length = 25
vocab_size = 128
NUM_EPOCHS = 5
_SAMPLING_RATE = 16000
PATH = '/content/drive/MyDrive/UCLA/ECE 147/Project/LSTM_weights'
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [5]:
class MusicDataset(Dataset):
    def __init__(self, X, y, split='train'):
        train_end = val_start = round(X.shape[0] * 0.8)
        val_end = test_start = round(y.shape[0] * 0.9)
        if split == 'train':
            X = X[:train_end]
            y = y[:train_end]
        elif split == 'val':
            X = X[val_start:val_end]
            y = y[val_start:val_end]
        elif split == 'test':
            X = X[test_start:]
            y = y[test_start:]
        else:
            raise NotImplementedError()
            
        self.X = torch.tensor(X, device=device)
        self.y = torch.tensor(y, device=device)
        self.dividend = torch.tensor([vocab_size, 1., 1.], device=device)
        self.dividend = self.dividend.reshape((3, 1))

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        x = self.X[idx] / self.dividend
        y = self.y[idx]
        return x, y

In [6]:
root = '/content/drive/MyDrive/UCLA/ECE 147/Project'
npy_dir = os.path.join(root, 'maestro-numpy')

# Load X, y out here instead of in each dataset class to save memory
with open(os.path.join(npy_dir, 'inputs.npy'), 'rb') as f:
    X = np.load(f)
with open(os.path.join(npy_dir, 'labels.npy'), 'rb') as f:
    y = np.load(f)

# Construct datasets and dataloaders
train_ds = MusicDataset(X, y, split='train')
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
val_ds = MusicDataset(X, y, split='val')
val_dl = DataLoader(val_ds, batch_size=64, shuffle=True)
test_ds = MusicDataset(X, y, split='test')
test_dl = DataLoader(test_ds, batch_size=64, shuffle=True)

# free up memory
del X; del y

In [7]:
from torch import nn

class DenseLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, lstm_layers=1, dense=False):
        super(DenseLSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.layers = lstm_layers
        self.dense = dense
        # define the LSTM layer
        self.lstm = nn.LSTM(input_size=self.input_dim, 
                            hidden_size=self.hidden_dim, 
                            num_layers=self.layers,
                            batch_first=True)
        self.act1 = nn.ReLU()
        # change linear layer inputs depending on if lstm is bidrectional
        self.linear = nn.Linear(self.hidden_dim, vocab_size + 2)


    def forward(self, inputs, labels=None):
        #out = inputs.unsqueeze(1)
        out, h = self.lstm(inputs)
        out = self.act1(out)
        out = self.linear(out)
        return out

In [8]:
x, y = train_ds[0]
x.shape, y.shape

(torch.Size([3, 25]), torch.Size([3]))

Input: (batch, source sequence length, feature number)
Output: (batch, target sequence length, feature number)

x.shape should be (batch size, 25, 3)
y.shap

In [9]:
# Custom loss function
def criterion(preds, true, weights={'ce': 0.2, 'mse': 0.8}):
  mse_criterion = nn.MSELoss()
  ce_criterion = nn.CrossEntropyLoss()
  true = true.squeeze()
  ce_loss = ce_criterion(preds[:, :vocab_size], true[:, 0].long())
  ce_loss *= weights['ce']
  mse_loss = mse_criterion(preds[:, vocab_size:], true[:, 1:])
  mse_loss *= weights['mse']
  return ce_loss + mse_loss

In [None]:
# # Custom loss function
# def criterion2(preds, true, step, weights={'ce': 0.2, 'mse': 0.8}):
#   mse_criterion = nn.MSELoss()
#   ce_criterion = nn.CrossEntropyLoss()
#   true = true.squeeze()
#   ce_loss = ce_criterion(preds[:, :vocab_size], true[:, 0].long())
#   ce_loss *= weights['ce']
#   mse_loss = mse_criterion(preds[:, vocab_size:], true[:, 1:])
#   mse_loss *= weights['mse']
#   custom_loss = preds[:, -2] - true[:, 1]
#   return ce_loss + mse_loss

In [10]:
def train_one_epoch(epoch_index, model, optimizer, verbose=False):
    running_loss = 0.
    total_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(train_dl):
        # Every data instance is an input + label pair
        inputs, labels = data
        inputs, labels = inputs.float(), labels.float()
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        inputs = torch.permute(inputs, (0, 2, 1)).float()
        outputs = model(inputs)

        # Compute the loss and its gradients
        last_output = outputs[:, -1, :].squeeze()
        loss = criterion(last_output, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            if verbose:
              print('  batch {} loss: {}'.format(i + 1, last_loss))
            total_loss += last_loss
            running_loss = 0.

    return total_loss

In [11]:
# PyTorch LSTM

lstm = DenseLSTM(input_dim=3, hidden_dim=50, lstm_layers=seq_length)
lstm.to(device)

DenseLSTM(
  (lstm): LSTM(3, 50, num_layers=25, batch_first=True)
  (act1): ReLU()
  (linear): Linear(in_features=50, out_features=130, bias=True)
)

In [12]:
adam = torch.optim.Adam(lstm.parameters(), lr=1e-3)

In [13]:
# train model 1
print('------------- Model 1 -------------')
for epoch in range(NUM_EPOCHS):
  epoch_loss = train_one_epoch(epoch, lstm, adam)
  print(f'Epoch: {epoch+1}  Total loss: {epoch_loss}')

# save model params
torch.save(lstm.state_dict(), os.path.join(PATH, 'my_model.pt'))

------------- Model 1 -------------
Epoch: 1  Total loss: 7.558472519353495
Epoch: 2  Total loss: 7.493615530719982
Epoch: 3  Total loss: 7.4938076740428805
Epoch: 4  Total loss: 7.494018625515513
Epoch: 5  Total loss: 7.493919484625567


# Generate Music

In [12]:
lstm.load_state_dict(torch.load(os.path.join(PATH, 'my_model.pt')))
lstm.eval()

DenseLSTM(
  (lstm): LSTM(3, 50, num_layers=25, batch_first=True)
  (act1): ReLU()
  (linear): Linear(in_features=50, out_features=130, bias=True)
)

In [13]:
def predict_next_notes(notes, model, temperature=0.3):    
    x = notes.T.unsqueeze(0).float()
    y = torch.zeros((1, 1, 3), device=device).float()
    with torch.no_grad():
        preds = model(x, y)
    pitch_logits = preds[:, :vocab_size]
    if np.random.random() <= temperature:
        pitch = torch.randint(low=0, high=vocab_size+1, size=(1,))
    else:
        pitch = torch.argmax(F.softmax(pitch_logits, dim=0))
    step = preds[:, -1, -2]
    duration = preds[:, -1, -1]
    return pitch.item(), step.item(), duration.item()

In [14]:
# Make room in memory
del train_ds, train_dl, val_ds, val_dl

In [15]:
# Load X, y out here instead of in the dataset class to save memory/time
with open(os.path.join(npy_dir, 'inputs.npy'), 'rb') as f:
    X = np.load(f)
with open(os.path.join(npy_dir, 'labels.npy'), 'rb') as f:
    y = np.load(f)

# Load testing dataset
test_ds = MusicDataset(X, y, split='test')
test_dl = DataLoader(test_ds, batch_size=64, shuffle=True)

del X; del y

In [16]:
def notes_to_midi(notes, out_file=None, instr_name='Acoustic Grand Piano', velocity=100):
    pm = pretty_midi.PrettyMIDI()
    instr = pretty_midi.Instrument(
        program=pretty_midi.instrument_name_to_program(instr_name)
    )
    prev_start = 0
    for i, note in notes.iterrows():
        start = float(prev_start + note['step'])
        end = float(start + note['duration'])
        note = pretty_midi.Note(
            velocity=velocity,
            pitch=int(note['pitch']),
            start=start,
            end=end
        )
        instr.notes.append(note)
        prev_start = start

    pm.instruments.append(instr)
    if out_file:
        pm.write(out_file)
    return pm

In [17]:
def display_audio(pm: pretty_midi.PrettyMIDI, seconds=30):
    waveform = pm.fluidsynth(fs=_SAMPLING_RATE)
    waveform_short = waveform[:seconds*_SAMPLING_RATE]
    return display.Audio(waveform_short, rate=_SAMPLING_RATE)

In [23]:
generated_notes = []
num_preds = 120
prev_start = 0
input_notes, _ = test_ds[1]

for _ in range(num_preds):
    pitch, step, duration = predict_next_notes(input_notes, lstm)
    start = prev_start + step
    end = start + duration
    input_note = (pitch, step, duration)
    generated_notes.append((*input_note, start, end))
    input_notes = input_notes[:, 1:]
    input_note = torch.tensor(input_note) / torch.tensor([seq_length, 1, 1])
    input_note = input_note.to(device=device).unsqueeze(1)
    input_notes = torch.cat((input_notes, input_note), dim=1)
    prev_start = start

generated_notes = pd.DataFrame(generated_notes, columns=('pitch', 'step', 'duration', 'start', 'end'))
pm = notes_to_midi(generated_notes)
display_audio(pm)