In [1]:
import os
import string
import numpy as np
import matplotlib.pyplot as plt
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# if using google colab - set up path properly
if 'google.colab' in str(get_ipython()):
    !pip install mido

    from google.colab import drive
    drive.mount('/content/drive/')

    import sys
    cwd = '/content/drive/My Drive/school/stat4984/final_proj/'
    sys.path.append(cwd)

else:
    cwd = os.getcwd()+'/'

# set up directories
midi_data_dir = cwd+'midi_data/'
csv_data_dir  = cwd+'csv_data/'

if not os.path.exists(midi_data_dir):
    os.makedirs(midi_data_dir)

if not os.path.exists(midi_data_dir):
    os.makedirs(csv_data_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Collecting mido
  Downloading mido-1.2.10-py2.py3-none-any.whl (51 kB)
[?25l[K     |██████▍                         | 10 kB 18.0 MB/s eta 0:00:01[K     |████████████▉                   | 20 kB 20.1 MB/s eta 0:00:01[K     |███████████████████▎            | 30 kB 11.3 MB/s eta 0:00:01[K     |█████████████████████████▋      | 40 kB 9.0 MB/s eta 0:00:01[K     |████████████████████████████████| 51 kB 3.0 MB/s 
[?25hInstalling collected packages: mido
Successfully installed mido-1.2.10
Mounted at /content/drive/


In [2]:
def dir_idx(dir_name, n):
    """
    return the name of the nth file in a directory
    """
    
    return dir_name+os.listdir(dir_name)[n]

# Data Preprocessing

In [3]:
# only download midis if the batch dir doesn't exist and midi dir is empty
if len(os.listdir(midi_data_dir)) == 0 and len(os.listdir(csv_data_dir)) == 0:
    import mido
    from midi_ndarrays import *
    from midi_utils import download_midis
    download_midis(midi_data_dir)


def load_array(midi_filename):
    """
    return midi_filename as a ndarray with start and end tokens
    Replace all notes with 1 - played, or 0 - not played
    """
    midi_tracks = mido.MidiFile(midi_filename, clip=True)
    midi_array = mid2array(midi_tracks)
    
    # set all values to 1 or 0
    midi_array = np.where(midi_array != 0, 1, 0).astype('uint8')
    
    # add padding and encode start token (first column)
    # and end token (last column)
    midi_array = np.pad(midi_array, 1)
    midi_array[0, 0]   = 1
    midi_array[-1, -1] = 1
    
    return midi_array

# if there aren't any csv files, convert all midis arrays and save in csv
if len(os.listdir(csv_data_dir)) == 0:
    for i in range(len(os.listdir(midi_data_dir))):
        midi_filename = dir_idx(midi_data_dir, i)
        midi_array = load_array(midi_filename)
        np.savetxt(midi_filename[:-4]+".csv", midi_array, fmt="%d", delimiter=",")

# The Model

In [4]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout_p):
        super().__init__()
        
        self.input_size  = input_size
        self.hidden_size = hidden_size
        self.num_layers  = num_layers
        
        self.dropout   = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn       = nn.LSTM(hidden_size, hidden_size, num_layers, dropout=dropout_p)
        
    def forward(self, x):
        # input dimension [1, 90]
        embedding = self.dropout(self.embedding(x))
        _, (h, c) = self.rnn(embedding)
        return h, c

In [5]:
class DecoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, 
                num_layers, dropout_p):
    
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers  = num_layers

        self.dropout   = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, hidden_size)
        
        self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers, dropout=dropout_p)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden, cell):
        x = x.unsqueeze(0)
        embedding = self.dropout(self.embedding(x))

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        predictions = self.fc(outputs.squeeze(0))

        # sigmoid because we're predicting multiple notes
        predictions = torch.sigmoid(predictions.squeeze(1))

        # update all values over 0.5 to 1, else 0
        predictions = (predictions > 0.5).int()
        return predictions, hidden, cell

In [7]:
class Seq2Seq(nn.Module):
    """
    Encapsulates the encoder and decoder. Pass a song matrix to the forward method, and it
    will encode it, then output the decoder's prediction of it
    """
    def __init__(self, encoder, decoder):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
  
    def forward(self, x, teacher_forcing_ratio=0.5):
        """
        pre: x is the song on host not device - the gpu runs out of memory
             with the whole song
        """
        # x: the song with shape [len, 90]

        #tensor to store decoder outputs
        outputs = torch.zeros(x.shape)
        
        # encoder input song
        len_song = x.shape[0]

        #h,c = 0,0
        # pass each row to the encoder one-by-one
        # for i in range(len_song):
        #     current_row = x[i,:].to(device)
        #     h,c = self.encoder(current_row.unsqueeze(0))
        
        print("pre encoder:", torch.cuda.memory_allocated() / torch.cuda.max_memory_allocated())
        h,c = self.encoder(x.unsqueeze(0).to(device))
        print("post encoder:", torch.cuda.memory_allocated() / torch.cuda.max_memory_allocated())
        #first input to the decoder is the <sos> tokens
        input = x[0,:].to(device)

        # decode the length of the song and return prediction
        for t in range(1, len_song):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, h, c = self.decoder(input, h, c)

            #place predictions in a tensor holding predictions for each token
            outputs[t] = output.to("cpu")
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = x[t,:].to(device) if teacher_force else output

        memory_used = torch.cuda.memory_allocated() / torch.cuda.max_memory_allocated()

        print(memory_used)
        
        return outputs

    def generate_song(self, max_length):
        """
        Generate a song based on the parameters of the encoder and decoder
        Pre: encoder and decoder should be optimized
        Return: The song as an song_length x 88 size numpy array
        """

        num_notes = self.encoder.input_size

        song_matrix = torch.zeros(max_length, num_notes, dtype=torch.int)

        # encode start token
        song_matrix[0, 0] = 1

        # initial state is based on encoding the start token
        input = song_matrix[0, :].to(device)

        h, c = self.encoder(input.unsqueeze(0))

        # Predict the notes of this song following inputting the start token
        for t in range(1, max_length):
            # input is the predicted set of notes, and the input to the next
            # prediction
            input, h, c = self.decoder(input, h, c)

            song_matrix[t] = input.to("cpu")

            # if end token is predicted: end
            if input[-1] == 0:
                print("end token")
                break

        print("t:", t)
        # the song generated until the end token (or max_length)
        song_matrix = song_matrix[:t+1,:]
        
        # convert to np array
        song_matrix = np.array(song_matrix)

        # trim outer padding for start and end tokens
        return song_matrix[1:-1, 1:-1]

# Train

In [8]:
drop_p = 0.5

# data has 88 columns for each piano note, plus two for start and end tokens
midi_dim = 90

# model hyperparameters
hidden_size = 64
num_layers = 2

# training hyperparemeters
num_epochs = 1
num_songs  = 1

# the number of rows of the song matrix to pass to seq2seq --
# this is what I've found is the max you can use in colab without
# the free GPUs they provide running out of memory in decoding
batch_size = 20000

save_model  = True
load_model  = False
train_model = True
model_filename = "model.pt"

In [9]:
encoder = EncoderRNN(midi_dim, hidden_size, num_layers, drop_p)
decoder = DecoderRNN(midi_dim, hidden_size, 1, num_layers, drop_p)
model = Seq2Seq(encoder, decoder).to(device)

optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()

In [10]:
midi_tensor = np.genfromtxt(dir_idx(csv_data_dir, 0), delimiter=',', dtype='int')
        
# convert to tensor
midi_tensor = torch.IntTensor(midi_tensor)

In [11]:
num_songs = min(num_songs, len(os.listdir(csv_data_dir)))

def train(model, num_songs):
    """
    Pass num_songs to the model and optimize at each
    """
    model.train()

    epoch_loss = 0

    for i in range(num_songs):
        optimizer.zero_grad()

        # load song as numpy array
        # midi_tensor = np.genfromtxt(dir_idx(csv_data_dir, i), delimiter=',', dtype='int')
        
        # # convert to tensor
        # midi_tensor = torch.IntTensor(midi_tensor)
        
        # decode prediction
        predicted_song = model(midi_tensor)
        
        predicted_song.requires_grad = True

        # optimize
        loss = criterion(predicted_song.float())

        loss.backward()
        optimizer.step()

model_loaded = False

# load the model
if load_model and model_filename in os.listdir():
    model.load_state_dict(torch.load(model_filename))
    model_loaded = True

# train the model
if train_model:
    for epoch in range(num_epochs):
        train(model, num_songs)

# save the model
if save_model and not model_loaded:
    torch.save(model.state_dict(), cwd+model_filename)

ZeroDivisionError: ignored

In [70]:
# Passing data by batch
len_song = 100_001

batch_size = 20_000
import math
num_batches = math.ceil(len_song / batch_size)

song = np.arange(0, len_song)

for i in range(num_batches):
    # last batch
    if i == num_batches-1:
        batch = song[i*batch_size:]
    else:
        batch = song[i*batch_size:(i+1)*batch_size]

    print(batch)

[    0     1     2 ... 19997 19998 19999]
[20000 20001 20002 ... 39997 39998 39999]
[40000 40001 40002 ... 59997 59998 59999]
[60000 60001 60002 ... 79997 79998 79999]
[80000 80001 80002 ... 99997 99998 99999]
[100000]


In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.memory_allocated() / torch.cuda.max_memory_allocated()