# Attentive Music

I plan to use a Transformer architecture to generate musical MIDI sequences.

In [48]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [75]:
from music21 import *
import os, sys
import numpy as np
from tqdm import tqdm_notebook as tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torchsample.modules import ModuleTrainer
import pickle
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from fastai.basic_data import DataBunch, DataLoader, DeviceDataLoader
from fastai.metrics import accuracy
import ipdb
from inspect import getsource
from train import *

device = torch.device("cuda:0")

## Data

I've found a [dataset](https://github.com/jukedeck/nottingham-dataset) of MIDI files.

In [2]:
PATH="../nottingham-dataset/MIDI"
files = [f for f in os.listdir(PATH) if os.path.isfile(PATH+'/'+f)]
files[:10]

['waltzes7.mid',
 'reelsa-c79.mid',
 'reelsr-t57.mid',
 'jigs211.mid',
 'morris29.mid',
 'reelsu-z8.mid',
 'jigs156.mid',
 'ashover5.mid',
 'reelsa-c32.mid',
 'morris10.mid']

From [this](https://www.hackerearth.com/blog/machine-learning/jazz-music-using-deep-learning/) tutorial for parsing MIDI.

In [3]:
def get_notes(file_list, PATH):  
    notes = []  
    for file in tqdm(file_list):  
    # converting .mid file to stream object
        midi = converter.parse(PATH + '/' + file)  
        notes_to_parse = [] 
        try:  
            # Given a single stream, partition into a part for each unique instrument  
            parts = instrument.partitionByInstrument(midi)  
        except:  
            pass  
        if parts: # if parts has instrument parts   
            notes_to_parse = parts.parts[0].recurse()  
        else:  
            notes_to_parse = midi.flat.notes  
        for element in notes_to_parse:   
            if isinstance(element, note.Note):  
                # if element is a note, extract pitch   
                notes.append(str(element.pitch))  
            elif(isinstance(element, chord.Chord)):  
                # if element is a chord, append the normal form of the   
                # chord (a list of integers) to the list of notes.   
                notes.append('.'.join(str(n) for n in element.normalOrder)) 
    
    with open('data/notes', 'wb') as filepath:  
        pickle.dump(notes, filepath)  
    return notes

In [4]:
# Create notes again
# notes = get_notes(files, PATH)

# Load from previously saved version
if os.path.getsize('data/notes') > 0:
    with open('data/notes', 'rb') as f:
        unpickler = pickle.Unpickler(f)
        notes = unpickler.load()

In [5]:
pitchnames = sorted(set(item for item in notes))
note_to_int = dict((note, number) for number, note in enumerate(pitchnames))
print('Vocab size:',len(pitchnames))

Vocab size: 120


In [6]:
int_notes = [note_to_int[x] for x in notes]; int_notes[:10]

[88, 111, 34, 108, 103, 88, 34, 110, 88, 94]

In [7]:
bs = 8

In [8]:
xs = np.array([np.array(int_notes[i*bs:(i+1)*bs]) for i in range(len(int_notes)//bs)])
# ys = np.array([int_notes[(i+1)*bs] for i in range(len(int_notes)//bs)])
ys = np.array([np.array(int_notes[i*bs+1:(i+1)*bs+1]) for i in range(len(int_notes)//bs)])

In [9]:
xs[:10]

array([[ 88, 111,  34, 108, 103,  88,  34, 110],
       [ 88,  94,  67, 118,  94,  88,  34, 110],
       [ 88, 111,  34, 108, 103,  88,  34, 110],
       [ 88,  94,  44, 108,  97,  83, 103,  34],
       ...,
       [ 88, 111,  34, 108, 103,  88,  34, 110],
       [ 88,  94,  44, 108,  97,  83, 103,  34],
       [ 88, 111,  34, 108, 103,  88,  34, 110],
       [ 88,  94,  67, 118,  94,  88,  34, 110]])

These are the next notes in the sequence for each sequence in `xs`.

In [10]:
ys[:10]

array([[111,  34, 108, 103,  88,  34, 110,  88],
       [ 94,  67, 118,  94,  88,  34, 110,  88],
       [111,  34, 108, 103,  88,  34, 110,  88],
       [ 94,  44, 108,  97,  83, 103,  34,  88],
       ...,
       [111,  34, 108, 103,  88,  34, 110,  88],
       [ 94,  44, 108,  97,  83, 103,  34,  88],
       [111,  34, 108, 103,  88,  34, 110,  88],
       [ 94,  67, 118,  94,  88,  34, 110,  88]])

But our y data may need to be one-hot encoded for our training to work.

In [11]:
def one_hot(batch,vocab_size):
    ones = torch.eye(vocab_size)
    return ones.index_select(0,batch)

In [12]:
xs.shape

(30727, 8)

In [13]:
x_tr, x_val, y_tr, y_val = train_test_split(xs[:30720], ys[:30720], test_size=0.25)

In [14]:
x_tr, x_val, y_tr, y_val = train_test_split(xs[:1280], ys[:1280], test_size=0.25)

In [15]:
def tensor(from_int):
    return torch.from_numpy(np.array(from_int)).long()

We need to create a class for our dataset.

In [16]:
class MusicData(Dataset):

    def __init__(self, x_data, y_data):
        self.len = len(x_data)
        self.x_data = tensor(x_data)
        self.y_data = tensor(y_data)
            
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len

In [86]:
tr_data = MusicData(x_tr, y_tr)
val_data = MusicData(x_val, y_val)

tr_loader = DataLoader(dataset=tr_data,
                        batch_size=32,
                        shuffle=True,
                        num_workers=1,
                        pin_memory=True)
val_loader = DataLoader(dataset=val_data,
                        batch_size=32,
                        shuffle=False,
                        num_workers=1,
                        pin_memory=True)

## LSTM

Let's first try an LSTM as a simple example.

In [29]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [18]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_layers, batch_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.vocab_size = vocab_size

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_size, dropout=0.5)
        self.hidden2tag = nn.Linear(hidden_dim, vocab_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(self.batch_size, self.num_layers, self.hidden_dim).cuda(),
                torch.zeros(self.batch_size, self.num_layers, self.hidden_dim).cuda())

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(sentence.size(0),sentence.size(1), -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = F.log_softmax(tag_space, dim=-1).view(self.batch_size, self.vocab_size, -1)
        return tag_scores

A helper function to return the accuracy from the log_softmax and the labels.

In [68]:
def acc(pred, targ): return (pred.max(1)[1] == targ).sum().item()/(pred.max(1)[1] == targ).view(-1).size()[0]

In [20]:
model = LSTMTagger(embedding_dim=50,hidden_dim=128,vocab_size=len(note_to_int), num_layers=8, batch_size=32).cuda()
loss_function = nn.CrossEntropyLoss().cuda()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in tqdm(range(4), desc='Epoch'):  # again, normally you would NOT do 4 epochs, it is toy data
    for i, (inputs, labels) in enumerate(tqdm(tr_loader, desc='Batch')):
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        optimizer.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda()

        # Step 3. Run our forward pass.
        tag_scores = model(inputs)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, labels)
        loss.backward()
        sys.stdout.write('\r'+str(loss))
        optimizer.step()
    
    # Validation
    val_acc = 0
    val_loss = 0
    for i, (inputs, labels) in enumerate(tqdm(val_loader, desc='Val')):
        inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda()
        val_tag_scores = model(inputs)
        val_loss += loss_function(val_tag_scores, labels)
        val_acc += acc(val_tag_scores, labels)
        
    val_acc, val_loss = val_acc/(i+1), val_loss/(i+1)
    print('val_acc: {}, val_loss:{}'.format(val_acc, val_loss))
    

KeyboardInterrupt: 

In [55]:
print(getsource(Learner))

class Learner():
    "Trainer for `model` using `data` to minimize `loss_func` with optimizer `opt_func`."
    data:DataBunch
    model:nn.Module
    opt_func:Callable=AdamW
    loss_func:Callable=None
    metrics:Collection[Callable]=None
    true_wd:bool=True
    bn_wd:bool=True
    wd:Floats=defaults.wd
    train_bn:bool=True
    path:str = None
    model_dir:str = 'models'
    callback_fns:Collection[Callable]=None
    callbacks:Collection[Callback]=field(default_factory=list)
    layer_groups:Collection[nn.Module]=None
    def __post_init__(self)->None:
        "Setup path,metrics, callbacks and ensure model directory exists."
        self.path = Path(ifnone(self.path, self.data.path))
        (self.path/self.model_dir).mkdir(parents=True, exist_ok=True)
        self.model = self.model.to(self.data.device)
        self.loss_func = ifnone(self.loss_func, self.data.loss_func)
        self.metrics=listify(self.metrics)
        if not self.layer_groups: self.layer_groups = [nn.Sequent

In [87]:
data = DataBunch(tr_loader, val_loader)

In [88]:
model = LSTMTagger(embedding_dim=50,hidden_dim=128,vocab_size=len(note_to_int), num_layers=8, batch_size=32).cuda()

In [95]:
learn = Learner(data, model, metrics=[acc])

In [96]:
learn.lr_find()

LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

Had to modify `fastai.train` (found in `train.py`) to work with my particular NN.

In [70]:
learn.fit(1)

epoch,train_loss,valid_loss,acc


AttributeError: 'float' object has no attribute 'detach'

In [180]:
torch.save(model.state_dict(), 'models/lstm_model')

In [37]:
model = LSTMTagger(embedding_dim=50,hidden_dim=128,vocab_size=len(note_to_int), num_layers=8, batch_size=32).cuda()
model.load_state_dict(torch.load('models/lstm_model'))

### Test

In [38]:
next(iter(tr_loader))[0][:10]

tensor([[ 94, 116,  75, 111, 108, 103,  97,  83],
        [110,  34,  88,  88, 118,  67,  94,  94],
        [ 34, 118, 110, 107,  12,  94,  44, 118],
        [110,  88,  25, 103, 108, 111,  75, 108],
        [ 94,  94, 115,  94,  97,  83,  88,  97],
        [ 34, 119,  89, 111, 103, 111, 108,  83],
        [ 97,  88,  94, 100, 103,  67, 119,  94],
        [ 81,  88,  88,  57, 102, 102, 118,  67],
        [118, 110, 107,  12, 103,  88,  34,  88],
        [ 88, 110,  88, 118,  67, 100,  94, 118]])

In [39]:
tags = next(iter(tr_loader))[1]

In [40]:
tags

tensor([[119,  44, 111, 119,  89,  81,  89,  57],
        [108,   5, 108, 111, 119, 108, 103,  67],
        [100,  94,  34,  88,  94,  67, 118,  88],
        [106, 108,  57, 103, 108,  57, 103,  97],
        [ 97, 108, 111,  34, 108, 103,  97,  94],
        [107,  88,  81,  94, 100,  88, 107, 107],
        [100,  94, 100, 103, 100,  88, 118, 100],
        [119, 113, 108, 113,  31, 108, 103, 103],
        [ 34, 108, 111, 119,  89,  34, 116,  89],
        [100,  94,  88,  57, 118, 110, 118,  67],
        [ 89,  88,  97,  94,  47,  88, 115,  94],
        [ 89, 119, 111, 108,  83,  97,  88,  97],
        [ 34,  97, 103,  94,  88,  12, 110, 107],
        [118,  88,  94, 118, 100,  81,  88,  94],
        [100, 103,  94, 118,  94,  88,  81,  88],
        [ 67, 103, 103, 100,  94,  88, 108,   5],
        [ 94,  67, 103, 103,  88,  34, 103, 103],
        [103,  97,  12,  88,  94,  97, 103,  34],
        [ 94, 118,  88,  94,  97, 103,  34, 102],
        [103, 108, 111, 119,  89, 119, 108, 103],


In [41]:
preds = model(next(iter(tr_loader))[0].cuda()).cpu().detach().numpy()

In [42]:
preds.shape

(32, 120, 8)

In [43]:
np.argmax(preds, axis=1)

array([[ 13,  13,  13,  13,  13,  13,  13,  13],
       [ 13,  13,  13,  13,  13,  13,  13,  13],
       [ 13,  13,  13,  13,  13,  13,  13,  13],
       [ 13,  13,  13,  28,  13,  13,  13,  13],
       [ 13,  13,  13,  13,  13,  13,  13,  13],
       [ 13,  13,  13,  13,  13,  13,  13,  13],
       [ 13,  13,  28,  13,  13,  28,  28,  13],
       [ 28,  28,  28,  28,  28,  28,  28,  28],
       [ 58,  58,  58,  58,  58,  58,  58,  58],
       [ 13,  13,  43,  13,  43,  13,  13,  13],
       [ 13,  13,  28,  13,  28,  13,  13,  13],
       [ 13,  13,  13,  28,  13,  13,  13,  43],
       [ 58,  58,  58,  58,  58,  58,  58,  58],
       [ 13,  13,  13,  13,  13,  13,  13,  13],
       [ 13,  13,  43,  13,  13,  13,  13,  13],
       [ 13,  13,  13,  13,  28,  13,  13,  13],
       [ 13,  13,  13,  13,  13,  13,  43,  13],
       [ 28,  13,  28,  13,  13,  13,  13,  13],
       [ 13,  13,  28,  13,  13,  13,  28,  13],
       [ 43,  13,  13,  58,  13,  13,  13,  13],
       [ 13, 103,  1