# Attentive Music

I plan to use a Transformer architecture to generate musical MIDI sequences.

In [19]:
from music21 import *
import os, sys
import numpy as np
from tqdm import tqdm_notebook as tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torchsample.modules import ModuleTrainer
import pickle
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda:0")

## Data

I've found a [dataset](https://github.com/jukedeck/nottingham-dataset) of MIDI files.

In [2]:
PATH="../nottingham-dataset/MIDI"
files = [f for f in os.listdir(PATH) if os.path.isfile(PATH+'/'+f)]
files[:10]

['waltzes7.mid',
 'reelsa-c79.mid',
 'reelsr-t57.mid',
 'jigs211.mid',
 'morris29.mid',
 'reelsu-z8.mid',
 'jigs156.mid',
 'ashover5.mid',
 'reelsa-c32.mid',
 'morris10.mid']

From [this](https://www.hackerearth.com/blog/machine-learning/jazz-music-using-deep-learning/) tutorial for parsing MIDI.

In [3]:
def get_notes(file_list, PATH):  
    notes = []  
    for file in tqdm(file_list):  
    # converting .mid file to stream object
        midi = converter.parse(PATH + '/' + file)  
        notes_to_parse = [] 
        try:  
            # Given a single stream, partition into a part for each unique instrument  
            parts = instrument.partitionByInstrument(midi)  
        except:  
            pass  
        if parts: # if parts has instrument parts   
            notes_to_parse = parts.parts[0].recurse()  
        else:  
            notes_to_parse = midi.flat.notes  
        for element in notes_to_parse:   
            if isinstance(element, note.Note):  
                # if element is a note, extract pitch   
                notes.append(str(element.pitch))  
            elif(isinstance(element, chord.Chord)):  
                # if element is a chord, append the normal form of the   
                # chord (a list of integers) to the list of notes.   
                notes.append('.'.join(str(n) for n in element.normalOrder)) 
    
    with open('data/notes', 'wb') as filepath:  
        pickle.dump(notes, filepath)  
    return notes

In [4]:
# Create notes again
# notes = get_notes(files, PATH)

# Load from previously saved version
if os.path.getsize('data/notes') > 0:
    with open('data/notes', 'rb') as f:
        unpickler = pickle.Unpickler(f)
        notes = unpickler.load()

In [5]:
pitchnames = sorted(set(item for item in notes))
note_to_int = dict((note, number) for number, note in enumerate(pitchnames))

In [6]:
int_notes = [note_to_int[x] for x in notes]; int_notes[:10]

[88, 111, 34, 108, 103, 88, 34, 110, 88, 94]

In [7]:
bs = 8

In [162]:
xs = np.array([np.array(int_notes[i*bs:(i+1)*bs]) for i in range(len(int_notes)//bs)])
# ys = np.array([int_notes[(i+1)*bs] for i in range(len(int_notes)//bs)])
ys = np.array([np.array(int_notes[i*bs+1:(i+1)*bs+1]) for i in range(len(int_notes)//bs)])

In [163]:
xs[:10]

array([[ 88, 111,  34, 108, 103,  88,  34, 110],
       [ 88,  94,  67, 118,  94,  88,  34, 110],
       [ 88, 111,  34, 108, 103,  88,  34, 110],
       [ 88,  94,  44, 108,  97,  83, 103,  34],
       [ 88, 111,  34, 108, 103,  88,  34, 110],
       [ 88,  94,  67, 118,  94,  88,  34, 110],
       [ 88, 111,  34, 108, 103,  88,  34, 110],
       [ 88,  94,  44, 108,  97,  83, 103,  34],
       [ 88, 111,  34, 108, 103,  88,  34, 110],
       [ 88,  94,  67, 118,  94,  88,  34, 110]])

These are the next notes in the sequence for each sequence in `xs`.

In [164]:
ys[:10]

array([[111,  34, 108, 103,  88,  34, 110,  88],
       [ 94,  67, 118,  94,  88,  34, 110,  88],
       [111,  34, 108, 103,  88,  34, 110,  88],
       [ 94,  44, 108,  97,  83, 103,  34,  88],
       [111,  34, 108, 103,  88,  34, 110,  88],
       [ 94,  67, 118,  94,  88,  34, 110,  88],
       [111,  34, 108, 103,  88,  34, 110,  88],
       [ 94,  44, 108,  97,  83, 103,  34,  88],
       [111,  34, 108, 103,  88,  34, 110,  88],
       [ 94,  67, 118,  94,  88,  34, 110,  88]])

But our y data may need to be one-hot encoded for our training to work.

In [165]:
def one_hot(batch,vocab_size):
    ones = torch.eye(vocab_size)
    return ones.index_select(0,batch)

In [166]:
xs.shape

(30727, 8)

In [167]:
x_tr, x_val, y_tr, y_val = train_test_split(xs[:30720], ys[:30720], test_size=0.25)

In [168]:
def tensor(from_int):
    return torch.from_numpy(np.array(from_int)).long()

We need to create a class for our dataset.

In [169]:
class MusicData(Dataset):

    def __init__(self, x_data, y_data):
        self.len = len(x_data)
        self.x_data = tensor(x_data)
        self.y_data = tensor(y_data)
            
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len

In [170]:
tr_data = MusicData(x_tr, y_tr)
val_data = MusicData(x_val, y_val)

tr_loader = DataLoader(dataset=tr_data,
                       batch_size=32,
                       shuffle=True,
                       num_workers=1,
                       pin_memory=True)
val_loader = DataLoader(dataset=val_data,
                        batch_size=32,
                        shuffle=True,
                        num_workers=1,
                        pin_memory=True)

## LSTM

Let's first try an LSTM as a simple example.

In [177]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_layers, batch_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.vocab_size = vocab_size

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_size)
        self.hidden2tag = nn.Linear(hidden_dim, vocab_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(self.batch_size, self.num_layers, self.hidden_dim).cuda(),
                torch.zeros(self.batch_size, self.num_layers, self.hidden_dim).cuda())

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(sentence.size(0),sentence.size(1), -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(sentence.size(0),sentence.size(1), -1))
        tag_scores = F.log_softmax(tag_space, dim=-1).view(self.batch_size, self.vocab_size, -1)
#         tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

In [178]:
model = LSTMTagger(embedding_dim=50,hidden_dim=128,vocab_size=len(note_to_int), num_layers=8, batch_size=32).cuda()
loss_function = nn.CrossEntropyLoss().cuda()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in tqdm(range(4), desc='Epoch'):  # again, normally you would NOT do 4 epochs, it is toy data
    for i, (inputs, labels) in enumerate(tqdm(tr_loader, desc='Batch')):
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        optimizer.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        inputs, labels = Variable(inputs), Variable(labels)

        # Step 3. Run our forward pass.
        tag_scores = model(inputs.cuda())

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores.cuda(), labels.cuda())
        loss.backward()
        sys.stdout.write('\r'+str(loss))
        optimizer.step()

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Batch', max=720, style=ProgressStyle(description_width='initi…

tensor(4.4640, device='cuda:0', grad_fn=<NllLoss2DBackward>)

HBox(children=(IntProgress(value=0, description='Batch', max=720, style=ProgressStyle(description_width='initi…

tensor(4.3858, device='cuda:0', grad_fn=<NllLoss2DBackward>)

HBox(children=(IntProgress(value=0, description='Batch', max=720, style=ProgressStyle(description_width='initi…

tensor(4.3572, device='cuda:0', grad_fn=<NllLoss2DBackward>)

HBox(children=(IntProgress(value=0, description='Batch', max=720, style=ProgressStyle(description_width='initi…

tensor(4.4246, device='cuda:0', grad_fn=<NllLoss2DBackward>)

In [180]:
torch.save(model.state_dict(), 'models/lstm_model')

In [192]:
model = LSTMTagger(embedding_dim=50,hidden_dim=128,vocab_size=len(note_to_int), num_layers=8, batch_size=32).cuda()
model.load_state_dict(torch.load('models/lstm_model'))

### Test

In [193]:
next(iter(tr_loader))[0][:10]

tensor([[ 89,  88, 116,  34,  88, 111,  88, 108],
        [ 94, 103,  94,  88, 118,  94,  44, 107],
        [ 88,  94,  97, 103,  34,  88,  94,  67],
        [113, 119,   5, 108, 100, 100, 100, 103],
        [103,  94,  88,  88, 118, 107, 118,  88],
        [ 34, 111, 108,  83, 108, 103,  75, 103],
        [108,  81,  88,  88,  94, 100,  57, 103],
        [108,  94,  97,  88, 103,  34, 103,  97],
        [103, 100,  94, 100, 103,  67, 103,  94],
        [103, 108,  12, 103, 108, 111,  34, 119]])

In [194]:
tags = next(iter(tr_loader))[1]

In [195]:
tags

tensor([[102,  57, 118,  67,  88,  94,  57,  88],
        [103,  88,  94,  44, 108, 108, 103, 108],
        [ 88, 110,  88, 103,  34, 103,  34, 103],
        [118,  88,  83,  97,  83,  94,  88, 108],
        [ 97, 103, 108, 103, 111, 108, 103,  34],
        [103,  25, 100, 103, 110, 118,  12, 110],
        [100,  91, 103, 100,  43,  91,  88, 118],
        [ 88,  88,  88, 103,  34,  88, 118, 110],
        [ 88, 118,  67,  94, 118, 119,  67, 103],
        [118, 110, 118,  88,  57, 103, 103,  67],
        [ 97, 103, 108, 111,  12, 119, 108, 111],
        [111, 108,  97,  12,  97,  97,  94,  88],
        [111, 108,  44,  94,  94,  94,  97,  12],
        [ 94,  94,  94,  94,  12, 107, 110, 118],
        [ 44, 111, 119,  89,  81, 119, 111,  57],
        [103,  94,  75, 108, 103,  97,  12,  88],
        [ 88, 118,  88,  37, 118,  88,  94,  44],
        [ 67,  94, 100,  94, 118,  94,  94,  94],
        [100, 103,  57, 108, 111, 119,  67, 103],
        [119, 119, 111, 108, 103,  34, 103, 110],


In [200]:
preds = model(next(iter(tr_loader))[0].cuda()).cpu().detach().numpy()

In [201]:
preds.shape

(32, 120, 8)

In [207]:
np.argmax(preds, axis=1)

array([[13, 13, 13, 13, 28, 13, 13, 73],
       [28, 13, 13, 28, 28, 73, 13, 13],
       [13, 13, 13, 43, 13, 58, 13, 73],
       [13, 13, 13, 28, 28, 13, 13, 13],
       [13, 13, 13, 28, 28, 58, 13, 13],
       [28, 13, 13, 43, 28, 28, 13, 13],
       [28, 13, 13, 43, 28, 28, 13, 13],
       [28, 13, 13, 43, 28, 13, 13, 13],
       [28, 13, 13, 43, 28, 13, 13, 13],
       [28, 13, 13, 43, 28, 13, 13, 13],
       [28, 13, 13, 28, 28, 28, 13, 13],
       [28, 13, 13, 43, 28, 28, 13, 13],
       [13, 13, 13, 43, 28, 28, 13, 13],
       [28, 13, 13, 43, 28, 28, 13, 13],
       [28, 13, 13, 13, 28, 28, 13, 13],
       [28, 13, 13, 43, 28, 28, 13, 13],
       [13, 13, 13, 43, 13, 28, 13, 13],
       [13, 13, 13, 43, 28, 28, 13, 13],
       [28, 13, 13, 43, 28, 28, 13, 13],
       [28, 13, 13, 43, 28, 13, 13, 13],
       [13, 13, 13, 43, 28, 28, 13, 13],
       [28, 13, 13, 43, 28, 28, 13, 13],
       [28, 13, 13, 13, 28, 28, 13, 13],
       [28, 28, 13, 43, 28, 28, 13, 13],
       [28, 13, 