# Attentive Music

I plan to use a Transformer architecture to generate musical MIDI sequences.

In [153]:
from music21 import *
import os
import numpy as np
from tqdm import tqdm_notebook as tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torchsample.modules import ModuleTrainer
import pickle
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

## Data

I've found a [dataset](https://github.com/jukedeck/nottingham-dataset) of MIDI files.

In [121]:
PATH="../nottingham-dataset/MIDI"
files = [f for f in os.listdir(PATH) if os.path.isfile(PATH+'/'+f)]
files[:10]

['waltzes7.mid',
 'reelsa-c79.mid',
 'reelsr-t57.mid',
 'jigs211.mid',
 'morris29.mid',
 'reelsu-z8.mid',
 'jigs156.mid',
 'ashover5.mid',
 'reelsa-c32.mid',
 'morris10.mid']

From [this](https://www.hackerearth.com/blog/machine-learning/jazz-music-using-deep-learning/) tutorial for parsing MIDI.

In [122]:
def get_notes(file_list, PATH):  
    notes = []  
    for file in tqdm(file_list):  
    # converting .mid file to stream object
        midi = converter.parse(PATH + '/' + file)  
        notes_to_parse = [] 
        try:  
            # Given a single stream, partition into a part for each unique instrument  
            parts = instrument.partitionByInstrument(midi)  
        except:  
            pass  
        if parts: # if parts has instrument parts   
            notes_to_parse = parts.parts[0].recurse()  
        else:  
            notes_to_parse = midi.flat.notes  
        for element in notes_to_parse:   
            if isinstance(element, note.Note):  
                # if element is a note, extract pitch   
                notes.append(str(element.pitch))  
            elif(isinstance(element, chord.Chord)):  
                # if element is a chord, append the normal form of the   
                # chord (a list of integers) to the list of notes.   
                notes.append('.'.join(str(n) for n in element.normalOrder)) 
    
    with open('data/notes', 'wb') as filepath:  
        pickle.dump(notes, filepath)  
    return notes

In [123]:
# Create notes again
# notes = get_notes(files, PATH)

# Load from previously saved version
if os.path.getsize('data/notes') > 0:
    with open('data/notes', 'rb') as f:
        unpickler = pickle.Unpickler(f)
        notes = unpickler.load()

In [124]:
pitchnames = sorted(set(item for item in notes))
note_to_int = dict((note, number) for number, note in enumerate(pitchnames))

In [125]:
int_notes = [note_to_int[x] for x in notes]; int_notes[:10]

[88, 111, 34, 108, 103, 88, 34, 110, 88, 94]

In [126]:
bs = 8

In [127]:
xs = [np.array(int_notes[i*bs:(i+1)*bs]) for i in range(len(int_notes)//bs)]
ys = [np.array(int_notes[i*bs+1:(i+1)*bs+1]) for i in range(len(int_notes)//bs)]

In [128]:
xs[:10]

[array([ 88, 111,  34, 108, 103,  88,  34, 110]),
 array([ 88,  94,  67, 118,  94,  88,  34, 110]),
 array([ 88, 111,  34, 108, 103,  88,  34, 110]),
 array([ 88,  94,  44, 108,  97,  83, 103,  34]),
 array([ 88, 111,  34, 108, 103,  88,  34, 110]),
 array([ 88,  94,  67, 118,  94,  88,  34, 110]),
 array([ 88, 111,  34, 108, 103,  88,  34, 110]),
 array([ 88,  94,  44, 108,  97,  83, 103,  34]),
 array([ 88, 111,  34, 108, 103,  88,  34, 110]),
 array([ 88,  94,  67, 118,  94,  88,  34, 110])]

These are the next notes in the sequence for every note in `xs`.

In [129]:
ys[:10]

[array([111,  34, 108, 103,  88,  34, 110,  88]),
 array([ 94,  67, 118,  94,  88,  34, 110,  88]),
 array([111,  34, 108, 103,  88,  34, 110,  88]),
 array([ 94,  44, 108,  97,  83, 103,  34,  88]),
 array([111,  34, 108, 103,  88,  34, 110,  88]),
 array([ 94,  67, 118,  94,  88,  34, 110,  88]),
 array([111,  34, 108, 103,  88,  34, 110,  88]),
 array([ 94,  44, 108,  97,  83, 103,  34,  88]),
 array([111,  34, 108, 103,  88,  34, 110,  88]),
 array([ 94,  67, 118,  94,  88,  34, 110,  88])]

But our y data will need to be one-hot encoded for our training to work.

In [213]:
def one_hot(batch,vocab_size):
    ones = torch.sparse.torch.eye(vocab_size)
    return ones.index_select(0,batch)

In [130]:
x_tr, x_val, y_tr, y_val = train_test_split(xs, ys, test_size=0.25)

In [154]:
def tensor(from_int):
    return torch.from_numpy(np.array(from_int)).long()

We need to create a class for our dataset.

In [215]:
class MusicData(Dataset):

    def __init__(self, x_data, y_data):
        self.len = len(x_data)
        self.x_data = tensor(x_data)
        self.y_data = tensor(y_data)
            
    def __getitem__(self, index):
        return self.x_data[index], one_hot(self.y_data[index],vocab_size=120)
    
    def __len__(self):
        return self.len

In [216]:
tr_data = MusicData(x_tr, y_tr)
val_data = MusicData(x_val, y_val)

tr_loader = DataLoader(dataset=tr_data,
                      batch_size=1,
                      shuffle=True,
                      num_workers=2)
val_loader = DataLoader(dataset=val_data,
                      batch_size=1,
                      shuffle=True,
                      num_workers=2)

## LSTM

Let's first try an LSTM as a simple example.

In [161]:
class LSTMTagger(nn.Module):
    def __init__(self, vocab_size, n_hidden, n_fac, bs, nl):
        super().__init__()
        self.n_hidden = n_hidden
        self.vocab_size,self.nl = vocab_size,nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
#         self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs):
        self.h = (Variable(torch.zeros(self.nl, bs, self.n_hidden)),
                  Variable(torch.zeros(self.nl, bs, self.n_hidden)))

In [208]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [218]:
model = LSTMTagger(embedding_dim=50,hidden_dim=256,vocab_size=len(note_to_int),tagset_size=8)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(4):  # again, normally you would NOT do 4 epochs, it is toy data
    for i, data in enumerate(tr_loader):
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.h = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        inputs, labels = data
        inputs, labels = Variable(inputs), Variable(labels)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# # See what the scores are after training
# with torch.no_grad():
#     inputs = prepare_sequence(training_data[0][0], word_to_ix)
#     tag_scores = model(inputs)

#     # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#     # for word i. The predicted tag is the maximum scoring tag.
#     # Here, we can see the predicted sequence below is 0 1 2 0 1
#     # since 0 is index of the maximum value of row 1,
#     # 1 is the index of maximum value of row 2, etc.
#     # Which is DET NOUN VERB DET NOUN, the correct sequence!
#     print(tag_scores)

RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed.  at /opt/conda/conda-bld/pytorch-nightly-cpu_1543565205389/work/aten/src/THNN/generic/ClassNLLCriterion.c:93

In [162]:
batch_size = 1
use_cuda = False
criterion = nn.CrossEntropyLoss()
model = LSTMTagger(n_fac=50,n_hidden=256,vocab_size=len(note_to_int),bs=batch_size,nl=8)
if(use_cuda):
    model.cuda()
    criterion.cuda()
trainer = ModuleTrainer(model)
trainer.set_optimizer(optim.Adam, lr=1e-3)
trainer.set_loss(criterion)

# Bug in torchsample?
trainer._has_multiple_loss_fns = False

model

LSTMTagger(
  (e): Embedding(120, 50)
  (rnn): LSTM(50, 256, num_layers=8, dropout=0.5)
  (l_out): Linear(in_features=256, out_features=120, bias=True)
)

In [111]:
trainer.fit(tensor(x_tr), tensor(y_tr), num_epoch=4, batch_size=batch_size, shuffle=False)

Epoch 1/4:   0%|          | 1/721 [00:00<01:19,  9.09 batches/s]


ValueError: Expected input batch_size (256) to match target batch_size (32).

In [None]:
trainer.fit(each_tensor(xs), each_tensor(ys), num_epoch=4, batch_size=batch_size, shuffle=False)