In [1]:
import os
import random
import string

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# if using google colab - set up path properly
if 'google.colab' in str(get_ipython()):
    !pip install mido

    from google.colab import drive
    drive.mount('/content/drive/')

    import sys
    cwd = '/content/drive/My Drive/school/stat4984/final_proj/'
    sys.path.append(cwd)

else:
    cwd = os.getcwd()+'/'


import mido
from midi_ndarrays import *

# set up directories
midi_data_dir = cwd+'midi_data/'
csv_data_dir  = cwd+'csv_data/'

if not os.path.exists(midi_data_dir):
    os.makedirs(midi_data_dir)

if not os.path.exists(midi_data_dir):
    os.makedirs(csv_data_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
def dir_idx(dir_name, n):
    """
    Return the path of the nth entry in a directory.

    Entries are taken in sorted (lexicographic) name order. os.listdir
    returns names in arbitrary order, so without sorting the same n could
    refer to different files from one call to the next.

    Parameters
    ----------
    dir_name : str
        Directory to look in, with or without a trailing separator.
    n : int
        Zero-based position in the sorted listing (negative values index
        from the end, as with any list).

    Returns
    -------
    str
        dir_name joined with the selected entry name.

    Raises
    ------
    IndexError
        If the directory holds fewer than n + 1 entries.
    """

    entries = sorted(os.listdir(dir_name))
    # os.path.join only inserts a separator when dir_name lacks one, so
    # callers that pass 'some/dir/' get exactly the same path as before
    return os.path.join(dir_name, entries[n])

# Data Preprocessing

In [3]:
# Fetch the MIDI files only on a completely fresh setup: nothing downloaded
# yet (empty midi dir) and nothing converted yet (empty csv dir).
if not os.listdir(midi_data_dir) and not os.listdir(csv_data_dir):
    from midi_utils import download_midis
    download_midis(midi_data_dir)


def load_array(midi_filename):
    """
    Read a MIDI file and return it as a binary uint8 ndarray framed by
    start and end tokens.

    Every non-zero entry produced by mid2array becomes 1 (played) and
    everything else 0 (not played). The array is then padded with one line
    of zeros on each side of each axis; the top-left corner of the padded
    array is set to 1 as the start token and the bottom-right corner is set
    to 1 as the end token.
    """
    parsed = mido.MidiFile(midi_filename, clip=True)
    notes = mid2array(parsed)

    # collapse whatever non-zero values mid2array produced into on/off flags
    played = np.where(notes != 0, 1, 0).astype('uint8')

    # one extra zero row/column on every side makes room for the tokens
    framed = np.pad(played, 1)
    framed[0, 0] = 1      # start token: first row, first column
    framed[-1, -1] = 1    # end token: last row, last column

    return framed

# If no csv files exist yet, convert every MIDI file to an array and save it
# as a csv inside csv_data_dir.
#
# The csv is written to csv_data_dir (not next to the MIDI file) so that this
# guard and the later csv loading actually see it. The MIDI directory is
# listed once, up front, so files created during the loop cannot shift the
# iteration.
if len(os.listdir(csv_data_dir)) == 0:
    for midi_name in sorted(os.listdir(midi_data_dir)):
        stem, ext = os.path.splitext(midi_name)

        # skip anything that is not a MIDI file (e.g. stray csv files left
        # in the MIDI directory by earlier runs)
        if ext.lower() not in ('.mid', '.midi'):
            continue

        midi_filename = os.path.join(midi_data_dir, midi_name)
        midi_array = load_array(midi_filename)
        np.savetxt(os.path.join(csv_data_dir, stem + ".csv"), midi_array,
                   fmt="%d", delimiter=",")

# The Model

In [4]:
class EncoderRNN(nn.Module):
    """
    Encoder half of the seq2seq model.

    Looks up an embedding for every index in the input, applies dropout and
    runs the result through a stacked LSTM. Only the LSTM's final hidden and
    cell states are returned; the per-step outputs are discarded.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout_p):
        """
        input_size  -- number of distinct indices the embedding accepts
        hidden_size -- width of both the embedding and the LSTM state
        num_layers  -- number of stacked LSTM layers
        dropout_p   -- dropout probability, used after the embedding and
                       between LSTM layers
        """
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers  = num_layers

        self.dropout   = nn.Dropout(p=dropout_p)
        self.embedding = nn.Embedding(num_embeddings=input_size,
                                      embedding_dim=hidden_size)
        self.rnn       = nn.LSTM(input_size=hidden_size,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers,
                                 dropout=dropout_p)

    def forward(self, x):
        """
        Encode x and return the LSTM's final (hidden, cell) states.

        x is a tensor of integer indices; the notebook passes one song row
        of shape [1, 90]. The LSTM is built without batch_first, so the
        first axis of x is the one it steps through.
        """
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        # keep only the final states; per-step outputs are not needed
        _, final_states = self.rnn(embedded)
        hidden, cell = final_states
        return hidden, cell

In [25]:
class DecoderRNN(nn.Module):
    """
    Decoder half of the seq2seq model.

    Performs a single decoding step: embeds the input indices, runs one LSTM
    step starting from the supplied states, projects the LSTM output with a
    linear layer and squashes the result through a sigmoid.
    """

    def __init__(self, input_size, hidden_size, output_size,
                 num_layers, dropout_p):
        """
        input_size  -- number of distinct indices the embedding accepts
        hidden_size -- width of both the embedding and the LSTM state
        output_size -- number of values the linear layer emits per element
        num_layers  -- number of stacked LSTM layers
        dropout_p   -- dropout probability, used after the embedding and
                       between LSTM layers
        """
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers  = num_layers

        self.dropout   = nn.Dropout(p=dropout_p)
        self.embedding = nn.Embedding(num_embeddings=input_size,
                                      embedding_dim=hidden_size)

        self.rnn = nn.LSTM(input_size=hidden_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           dropout=dropout_p)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """
        Run one decoding step.

        x is a 1-D tensor of integer indices; hidden and cell are the LSTM
        states to continue from. Returns (predictions, hidden, cell), where
        predictions lie in (0, 1).
        """
        # add a leading axis of length 1 so the LSTM sees a single time step
        step = x.unsqueeze(0)

        embedded = self.dropout(self.embedding(step))
        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # drop the time axis again, then project
        scores = self.fc(rnn_out.squeeze(0))

        # squeeze(1) removes the output axis when output_size == 1 and is a
        # no-op otherwise; sigmoid because we're predicting multiple notes
        predictions = torch.sigmoid(scores.squeeze(1))

        return predictions, hidden, cell

In [6]:
class Seq2Seq(nn.Module):
    """
    Encapsulates the encoder and decoder. Pass a song matrix to the forward method, and it
    will encode it, then output the decoder's prediction of it
    """
    def __init__(self, encoder, decoder):
        """
        encoder -- module called as encoder(src) -> (hidden, cell)
        decoder -- module called as decoder(input, hidden, cell)
                   -> (predictions, hidden, cell)
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """
        Encode src, then decode step by step along the first axis of trg.

        Parameters
        ----------
        src : tensor
            Source sequence, passed unchanged to the encoder.
        trg : tensor
            Target sequence; trg[0] is the start-of-sequence row fed to the
            decoder first, and trg[t] is the ground truth for step t.
        teacher_forcing_ratio : float
            Probability, at each step, of feeding the true row trg[t] as the
            next decoder input instead of the decoder's own prediction.

        Returns
        -------
        tensor
            Shape (trg_len, *decoder_output_shape). Row 0 is left as zeros
            because no prediction is made for the start-of-sequence step.

        Notes
        -----
        When not teacher forcing, the prediction is thresholded at 0.5 and
        cast to trg's dtype to form the next input. This assumes the decoder
        emits one probability per input element (output_size == 1, as the
        notebook constructs it) -- TODO confirm if output_size changes.
        """
        trg_len = trg.shape[0]

        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)

        #first input to the decoder is the <sos> tokens
        decoder_input = trg[0, :]

        #tensor to store decoder outputs; allocated after the first step so
        #its shape, dtype and device follow the decoder's actual output
        outputs = None

        for t in range(1, trg_len):

            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)

            if outputs is None:
                outputs = torch.zeros((trg_len, *output.shape),
                                      dtype=output.dtype, device=output.device)

            #place predictions in a tensor holding predictions for each token
            outputs[t] = output

            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            #binarise the probabilities into played / not-played tokens
            predicted = (output > 0.5).to(trg.dtype)

            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            decoder_input = trg[t] if teacher_force else predicted

        if outputs is None:
            #trg has at most one row, so the decoder never ran
            outputs = torch.zeros((trg_len, *trg.shape[1:]), device=trg.device)

        return outputs

# Train

In [7]:
# --- data ---
# one column for each of the 88 piano notes, plus one column each for the
# start and end tokens
midi_dim = 88 + 2

# --- model hyperparameters ---
hidden_size = 64
num_layers = 2
drop_p = 0.5

# --- training hyperparameters ---
num_epochs = 1

In [26]:
# Build the encoder/decoder pair and wrap them in the seq2seq model. The
# decoder is created with output_size=1.
encoder = EncoderRNN(input_size=midi_dim, hidden_size=hidden_size,
                     num_layers=num_layers, dropout_p=drop_p)
decoder = DecoderRNN(input_size=midi_dim, hidden_size=hidden_size, output_size=1,
                     num_layers=num_layers, dropout_p=drop_p)
model = Seq2Seq(encoder=encoder, decoder=decoder).to(device)

# Adam with its default settings; binary cross-entropy as the loss
optimizer = optim.Adam(params=model.parameters())
criterion = nn.BCELoss()

# TODO: training loop is not implemented yet. Planned shape: iterate over the
# song files (for i in range(len(os.listdir()))), pass each to the seq2seq
# model, compute the loss, optimize.
for epoch in range(num_epochs):
    pass

## WIP

Test passing input through the encoder and decoder, using 10 rows of an example MIDI file.

In [9]:
# Read one converted song back from disk (entry 0 of the csv directory) as
# an integer array, then move it onto the compute device as an int tensor.
csv_data_dir = cwd + "csv_data/"
midi_array   = np.genfromtxt(fname=dir_idx(csv_data_dir, 0), dtype='int', delimiter=',')

midi_tensor = torch.IntTensor(midi_array).to(device=device)

In [10]:
# Encode the first rows of the example song (at most 10, fewer if the song
# is shorter) in a single encoder call.
#
# The encoder does not take a previous state, so calling it once per row
# simply overwrote (h, c) each time and only the last row had any effect.
# Passing the rows together lets the LSTM step through all of them and
# return the states reached after the final row.
num_rows = min(10, midi_tensor.shape[0])

h, c = encoder(midi_tensor[:num_rows, :])

In [27]:
# One decoder step: feed the first row of the song (the row carrying the
# start token) together with the encoder's final states.
pred, h, c = decoder(midi_tensor[0], h, c)

In [30]:
# TODO this is used in music generation step
# Copy the predictions to host memory, then binarise them: anything above
# 0.5 counts as a played note (1), everything else as not played (0).
pred_cpu = pred.cpu()
np.where(pred_cpu > 0.5, 1, 0)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])