In [1]:
#########################
# Imports 
#########################
import sys, os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms, datasets, models
from torch.utils.data import Dataset, DataLoader
import pypianoroll
from pypianoroll import Multitrack, Track
from matplotlib import pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
has_gpu = torch.cuda.is_available()
has_gpu

True

In [2]:
#########################
# Constant 
#########################
RESOLUTION = 24
PITCH = 128
TS = 4
BAR = RESOLUTION
MEASURE = BAR * TS

In [3]:
#########################
# Bar Collection
#########################
"""
    Bar Collection:
        A collection of multiple training points on the granularity of 
        a music bar.
        
        For simplicity we assume that all Midi input files share the
        following properties
        
        Resolution: 24 (per beat)
        Tempo: 120 bpm
        Time Signature: 4/4
        Note Pitch: [0-127] (128 possibilities)
        # Tracks: 1 (Single-Track Midi)
            - If Midi contains multiple tracks, use only 1st track
        
"""
def midi_to_array(path):
    """
        midi_to_array: Returns binarized midi represention of input file
            as numpy.ndarry
        
        Args:
            path(str): Path to target midi file
        
        Returns:
            data (np.ndarry): Matrix representation of midi file
    """
    # Import midi data to pypianoroll Multitrack
    data = pypianoroll.read(path)

    # Export Multitrack to numpy ndarray
    data = data.stack() #(N x T x P)
    
    # Select 1st track if other tracks present
    data = np.expand_dims(data[0,:,:], axis=0)
    
    # Set all velocity values to zero to binarize data
    data[data >= 1] = 1
    
    return data

def parse_data(datadir):
    """
        parse_data: Reads all midi files from a directory to produce a
            bar collection
        
        Args:
            datadir(str): Directory to import data from
        
        Return:
            bar_collection (np.ndarry: N x T x P): Resulting collection
                from file directory
        
    """
    midi_list = []
    
    for root, dirs, filenames in os.walk(datadir):
        for filename in filenames:
            if filename.endswith(".mid") or filename.endswith(".midi"):
                path = os.path.join(root, filename)
                midi_data = midi_to_array(path)
                midi_list.append(midi_data)
    
    print("Loaded {} files from directory: {}".format(len(midi_list), datadir))
    
    # Concatenate arrays allong time (T) dimension 
    bar_concat = np.concatenate(midi_list, axis=1) # (1 x T x H)
    num_bars = bar_concat.shape[1] / RESOLUTION
    
    print("Resulting Collection has a total of {} bars".format(int(num_bars)))
    
    # Process collection for training 
    bar_concat = np.transpose(bar_concat, axes=(0,2,1)) # (1 x H x T)
    bars = np.array_split(bar_concat, num_bars, axis=2) # List (N): (H x W)
    bars = [np.expand_dims(bar, axis=0) for bar in bars] # List (N): (1 x H x W)
    bar_collection = np.concatenate(bars, axis=0) # (N, 1, H, W)
    
    return bar_collection


In [5]:
#########################
# Dataset
#########################
class BarDataset(Dataset):
    def __init__(self, collection, step_size=BAR):
        self.data, self.data_len = self.extract_data(collection)
        self.step_size = step_size
        
    def __len__(self):
        return self.data_len
    
    @property
    def shape(self):
        return self.data.shape
    
    def __getitem__(self, idx): 
        X = self.data[idx]
        X_prev = torch.zeros(X.shape).cuda() if idx == 0 else self.data[idx-1]
        
        return X, X_prev
    
    def extract_data(self, collection):
        data_len = collection.shape[0]
        
        # Transform data to appropriate format
        data = torch.Tensor(collection)
        data = data.cuda()
        
        return data, data_len
        


In [6]:
#########################
# Dataloader
#########################
train_data = parse_data("../dataset")
train_dataset = BarDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, 
                        num_workers=0)

Loaded 2 files from directory: ../dataset
Resulting Collection has a total of 542 bars


In [7]:
###########################
# Model Defintion 
###########################

##########################
# Model Helpers
##########################

def conv_prev_concat(x, y):
        """Concatenate conditioning vector on feature map axis."""
        x_shapes = x.shape
        y_shapes = y.shape
        if x_shapes[2:] == y_shapes[2:]:
            y2 = y.expand(x_shapes[0],y_shapes[1],x_shapes[2],x_shapes[3])

            return torch.cat((x, y2),1)

        else:
            print(x_shapes[2:])
            print(y_shapes[2:])


##########################
# Model Subunits
##########################
            
class LConv2d(nn.Module):
    def __init__(self, c_in, c_out, k, s, p):
        super(LConv2d, self).__init__()
        self.conv = nn.Conv2d(c_in, c_out, k, s, p, bias=False)
        self.bn = nn.BatchNorm2d(c_out)
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=False)
        
    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.lrelu(x)
        return x
    
class ConvTranspose2d(nn.Module):
    def __init__(self, c_in, c_out, k, s, p):
        super(ConvTranspose2d, self).__init__()
        self.conv = nn.ConvTranspose2d(c_in, c_out, k, s, p, bias=False)
        self.bn = nn.BatchNorm2d(c_out)
        self.relu = nn.ReLU(inplace=False)
        
    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        return x
        
           
##########################
# Model 
##########################        

class Generator(nn.Module):
    def __init__(self, gf_dim=64, nz=100, pitch_range=PITCH, bar_length=BAR):
        super(Generator, self).__init__()
        
        # Define class properties
        self.gf_dim = gf_dim 
        self.nz = nz # length of input vector 'z' (noise signal)
        self.pitch_range = pitch_range
        self.num_filters = 256
        
        # Noise Projection Layer
        self.h0_prev = LConv2d(c_in=1, c_out=self.num_filters, k=(1,pitch_range), s=(1,2), p=0)
        self.h1_prev = LConv2d(c_in=self.num_filters, c_out=self.num_filters, k=(2,1), s=(2,2), p=0)
        self.h2_prev = LConv2d(c_in=self.num_filters, c_out=self.num_filters, k=(2,1), s=(2,2), p=0)
        self.h3_prev = LConv2d(c_in=self.num_filters, c_out=self.num_filters, k=(2,1), s=(2,2), p=0)
        
        # Conditions Layer
        self.h1 = ConvTranspose2d(c_in=384, c_out=pitch_range, k=(2,1), s=(2,2), p=0)
        self.h2 = ConvTranspose2d(c_in=384, c_out=pitch_range, k=(2,1), s=(2,2), p=0)
        self.h3 = ConvTranspose2d(c_in=384, c_out=pitch_range, k=(2,1), s=(2,2), p=0)
        self.h4 = ConvTranspose2d(c_in=384, c_out=1, k=(1,pitch_range), s=(1,2), p=0)
        
        # Linear Transformation layer
        self.linear1 = nn.Linear(self.nz, 1024)
        self.linear2 = nn.Linear(1024, self.gf_dim*3*2*1)
        
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, z, prev_x):
        
        prev_x = prev_x.permute(0,1,3,2)
        b_size = prev_x.shape[0]
        
        # TODO: comment on shape of output
        h0_prev = self.h0_prev(prev_x)
        h1_prev = self.h1_prev(h0_prev)
        h2_prev = self.h2_prev(h1_prev)
        h3_prev = self.h3_prev(h2_prev)
    
        
        # TODO: comment on shape of output
        h0 = self.linear1(z)
        
        h1 = self.linear2(h0)
        h1 = h1.view(b_size, self.gf_dim * 2, 3, 1)
        h1 = conv_prev_concat(h1, h3_prev)
        
        h2 = self.h1(h1)
        h2 = conv_prev_concat(h2, h2_prev)
        
        h3 = self.h2(h2)
        h3 = conv_prev_concat(h3, h1_prev)
        
        h4 = self.h3(h3)
        h4 = conv_prev_concat(h4, h0_prev)
        
        g_x = self.sigmoid(self.h4(h4))
        
        return g_x
    
class Discriminator(nn.Module):
    def __init__(self, df_dim=64, dfc_dim=1024, pitch_range=PITCH, bar_length=BAR):
        super(Discriminator, self).__init__()
        
        self.df_dim = df_dim
        self.dfc_dim = dfc_dim
        self.pitch_range = pitch_range
        self.linear_in = self.df_dim * 40 * 15 # (conv kernel output (H,W))
        
        self.h0 = LConv2d(c_in=1, c_out=64, k=(4,89), s=1, p=0)
        self.h1 = LConv2d(c_in=64, c_out=64, k=(4,1), s=1, p=0)
        self.h2 = LConv2d(c_in=64, c_out=64, k=(4,1), s=1, p=0)
        
        self.linear = nn.Linear(self.linear_in, 1)
        self.sigmoid = nn.Sigmoid()
        

    def forward(self, x):
        #x = x.permute(0,1,3,2)
        b_size = x.shape[0]
        
        h0 = self.h0(x)
        h1 = self.h1(h0)
        h2 = self.h2(h1)
        h2 = h2.reshape(b_size, self.linear_in)
        h3 = self.linear(h2)
        
        h3_sigmoid = self.sigmoid(h3)
        
        return h3_sigmoid, h3     

In [10]:
##########################
# Training Functions 
##########################
lr = 1e-3
epochs = 20
nz = 100

# Model instantiation
torch.cuda.empty_cache()
modelG = Generator(nz=nz)
modelD = Discriminator()
modelG.to(device)
modelD.to(device)

# Model optimizers
optG = torch.optim.Adam(modelG.parameters(), lr=lr)
optD = torch.optim.Adam(modelD.parameters(), lr=lr)

# Model Criterion
criterion = nn.BCELoss()

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Loss accumulators
average_lossD = 0
average_lossG = 0
average_D_x   = 0
average_D_G_z = 0

lossD_list =  []
lossD_list_all = []
lossG_list =  []
lossG_list_all = []
D_x_list = []
D_G_z_list = []

with torch.autograd.set_detect_anomaly(True):
    for epoch in range(epochs):
        sum_lossG = 0
        sum_lossD = 0
        sum_D_x = 0
        sum_D_G_z = 0

        for i, (X, X_prev) in enumerate(train_loader):
            ############################
            # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
            ###########################

            # train with real samples
            modelD.zero_grad()
            X = X.to(device)
            X_prev = X_prev.to(device)

            # Format batch
            b_size = X.size(0)
            label = torch.full((b_size,), real_label, dtype=torch.float, device=device) # Create real labels

            # Forward pass real batch through D
            X = X.permute(0,1,3,2) # Permutate tensor to produce correct shape
            out, out_logits = modelD(X)

            # Calculate loss on all-real batch
            d_loss_real = criterion(out, label)

            # Calculate gradients for D in backward pass
            d_loss_real.backward(retain_graph=True)
            D_x = out.mean().item()
            sum_D_x += D_x 

            ## Train with all-fake batch
            # Generate batch of latent vectors
            noise = torch.randn(b_size, nz, device=device)

            # Generate fake image batch with G
            fake = modelG(noise, X_prev)
            label.fill_(fake_label)

            # Classify all fake batch with D
            out, out_logits = modelD(fake.detach())

            # Calculate D's loss on the all-fake batch
            d_loss_fake = criterion(out, label)

            # Calculate the gradients for this batch
            d_loss_fake.backward(retain_graph=True)
            D_G_z1 = out.mean().item()

            # Add the gradients from the all-real and all-fake batches
            errD = d_loss_real + d_loss_fake
            errD = errD.item()

            # Update D
            sum_lossD += errD
            optD.step()

            ############################
            # (2) Update G network: maximize log(D(G(z)))
            ###########################
            modelG.zero_grad()
            label.fill_(real_label) # fake labels are real for generator cost

            # Since we just updated D, perform another forward pass of all-fake batch through D
            out, out_logits = modelD(fake)

            # Calculate G's loss based on this output
            errG = criterion(out, label)
            sum_lossG += errG

            # Calculate gradients for G
            errG.backward(retain_graph=True)

            D_G_z2 = out.mean().item()
            sum_D_G_z += D_G_z2
            # Update G
            optG.step()

            ############################
            # (3) Update G network again: maximize log(D(G(z)))
            # Done to mitigate strength of Discriminator model 
            ###########################
            #modelG.zero_grad()
            #label.fill_(real_label) 

            # Since we just updated D, perform another forward pass of all-fake batch through D
            #out, out_logits = modelD(fake)

            # Calculate G's loss based on this output
            #errG = criterion(out, label)

            # Calculate gradients for G
            #errG.backward(retain_graph=True)

            #D_G_z2 = out.mean().item()
            #sum_D_G_z += D_G_z2
            # Update G
            #optG.step()

            if epoch % 5 == 0:
                print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
                      % (epoch, epochs, i, len(train_loader),
                         errD, errG, D_x, D_G_z1, D_G_z2))
            
            del X
            del X_prev
            del out 
            del out_logits
            del label
            del fake
            del noise
            del errD
            del errG
            
            torch.cuda.empty_cache()

        average_lossD = (sum_lossD / len(train_loader.dataset))
        average_lossG = (sum_lossG / len(train_loader.dataset))
        average_D_x = (sum_D_x / len(train_loader.dataset))
        average_D_G_z = (sum_D_G_z / len(train_loader.dataset))

        lossD_list.append(average_lossD)
        lossG_list.append(average_lossG)            
        D_x_list.append(average_D_x)
        D_G_z_list.append(average_D_G_z)

        print('==> Epoch: {} Average lossD: {:.10f} average_lossG: {:.10f},average D(x): {:.10f},average D(G(z)): {:.10f} '.format(epoch, average_lossD,average_lossG,average_D_x, average_D_G_z)) 



[0/20][0/34] Loss_D: 1.2515 Loss_G: 7.4963 D(x): 0.5284 D(G(z)): 0.4490 / 0.0017
[0/20][1/34] Loss_D: 0.2590 Loss_G: 6.7503 D(x): 0.8906 D(G(z)): 0.1184 / 0.0028
[0/20][2/34] Loss_D: 0.7201 Loss_G: 6.1514 D(x): 0.7848 D(G(z)): 0.2937 / 0.0108
[0/20][3/34] Loss_D: 1.3591 Loss_G: 7.2561 D(x): 0.7714 D(G(z)): 0.5076 / 0.0032
[0/20][4/34] Loss_D: 2.3735 Loss_G: 6.2123 D(x): 0.4065 D(G(z)): 0.4733 / 0.0098
[0/20][5/34] Loss_D: 3.7816 Loss_G: 4.9362 D(x): 0.1537 D(G(z)): 0.3078 / 0.0405
[0/20][6/34] Loss_D: 2.4275 Loss_G: 4.7245 D(x): 0.4326 D(G(z)): 0.4979 / 0.0220
[0/20][7/34] Loss_D: 2.7327 Loss_G: 5.0519 D(x): 0.7549 D(G(z)): 0.6809 / 0.0231
[0/20][8/34] Loss_D: 0.8601 Loss_G: 9.1218 D(x): 0.8116 D(G(z)): 0.3035 / 0.0029
[0/20][9/34] Loss_D: 1.8182 Loss_G: 8.6732 D(x): 0.5877 D(G(z)): 0.2120 / 0.0270
[0/20][10/34] Loss_D: 2.0812 Loss_G: 8.0428 D(x): 0.6532 D(G(z)): 0.0551 / 0.0074
[0/20][11/34] Loss_D: 1.1480 Loss_G: 7.8436 D(x): 0.8431 D(G(z)): 0.3504 / 0.0137
[0/20][12/34] Loss_D: 0.49

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


==> Epoch: 0 Average lossD: 0.0572566458 average_lossG: 0.6869149804,average D(x): 0.0506629230,average D(G(z)): 0.0003557366 
==> Epoch: 1 Average lossD: 0.0085725662 average_lossG: 1.3508708477,average D(x): 0.0611702494,average D(G(z)): 0.0003877791 
==> Epoch: 2 Average lossD: 0.0035469340 average_lossG: 1.4223216772,average D(x): 0.0617936902,average D(G(z)): 0.0003178745 
==> Epoch: 3 Average lossD: 0.0007312413 average_lossG: 1.0186474323,average D(x): 0.0626616772,average D(G(z)): 0.0002058314 
==> Epoch: 4 Average lossD: 0.0007346997 average_lossG: 1.2449527979,average D(x): 0.0624997813,average D(G(z)): 0.0001418847 
[5/20][0/34] Loss_D: 0.0010 Loss_G: 15.6672 D(x): 0.9999 D(G(z)): 0.0009 / 0.0009
[5/20][1/34] Loss_D: 0.0003 Loss_G: 17.1600 D(x): 1.0000 D(G(z)): 0.0003 / 0.0003
[5/20][2/34] Loss_D: 0.0005 Loss_G: 16.3178 D(x): 0.9999 D(G(z)): 0.0004 / 0.0004
[5/20][3/34] Loss_D: 0.0026 Loss_G: 14.0006 D(x): 1.0000 D(G(z)): 0.0026 / 0.0026
[5/20][4/34] Loss_D: 0.0003 Loss_G: 1

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 8.00 GiB total capacity; 6.32 GiB already allocated; 16.75 MiB free; 6.58 GiB reserved in total by PyTorch)

In [None]:
#############################
# scratch pad
#############################

gen = Generator()
gen.to(device)
batch_size = 1
z = torch.randn(batch_size, 100, device=device)

prev_x = torch.unsqueeze(train[0][0], dim=0)
prev_x.shape

t1 = gen.forward(z, prev_x, batch_size)
print(t1.shape)

dis = Discriminator()
dis.to(device)

dis.forward(t1, batch_size)