In [2]:
%matplotlib inline
import IPython.display as ipd
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import pylab
import utils
import os.path
from PIL import Image

from torch import nn
from torch.autograd import Variable
from torchvision import models, transforms
import torch
import scipy.misc

In [3]:
# Load metadata and features.
tracks = utils.load('data/fma_metadata/tracks.csv')
genres = utils.load('data/fma_metadata/genres.csv')

('track', 'tags')
('album', 'tags')
('artist', 'tags')
('track', 'genres')
('track', 'genres_all')


  'category', categories=SUBSETS, ordered=True)


In [5]:

# return the first genreId of a track (a track can have multiple genres)
# rock genre id is 12, hiphop is 21 and pop is 10
def getGenreId(trackId):
    if len(tracks['track','genres'][trackId]) >= 1:
        return tracks['track','genres'][trackId][0]
    else:
        # if the track does not have a genre
        return None
    
# Define a global transformer to appropriately scale images and subsequently convert them to a Tensor.
array_size = 224
audioDirectory = "data/fma_small/"


# https://stackoverflow.com/questions/7525214/how-to-scale-a-numpy-array
def cropArray(x, newSize):
    midRow = x.shape[0]//2
    midCol = x.shape[1]//2
    x = x[midRow - newSize//2:midRow + newSize//2, midCol - newSize//2:midCol + newSize//2]
    return x


def trackExists(trackIdNum):
    trackId = str(trackIdNum)
    while(len(trackId) < 6):
        trackId = "0" + trackId
    filename = trackId[0:3]+"/"+trackId
    audioFilename = audioDirectory + filename + ".mp3"
    if os.path.isfile(audioFilename):
        return True
    return False

# return processed spectrogram of a track
def loadSpectro(trackIdNum):
    """
    Simple function to load and preprocess the image.

    1. Open the image.
    2. Scale/crop it and convert it to a float tensor.
    3. Convert it to a variable (all inputs to PyTorch models must be variables).
    4. Add another dimension to the start of the Tensor (b/c VGG expects a batch).
    5. Move the variable onto the GPU.
    """
    trackId = str(trackIdNum)
    while(len(trackId) < 6):
        trackId = "0" + trackId
    filename = trackId[0:3]+"/"+trackId
    audioFilename = audioDirectory + filename + ".mp3"
    if trackExists(trackIdNum):
        x, sr = librosa.load(audioFilename, sr=None, mono=True)
        #Convert audio to a complex valued spectrogram
        spectro = librosa.core.stft(x)

        #Separate out amplitude and phase from complex valued spectrogram
        mag, phase = librosa.core.magphase(spectro)
        #print ("mag", mag)
        #print ("phase",phase)

        #Get the decibal version from power spectrogram
        #This is the value that should be stored for training
        powerToDb = librosa.power_to_db(mag, ref=np.max)
        scaledSpectro = cropArray(powerToDb, array_size)
        spectroTensor = torch.from_numpy(scaledSpectro).type(torch.DoubleTensor)
        spectroVar = Variable(spectroTensor).unsqueeze(0)
        return spectroVar
    else:
        print("Audio does not exist")
        return None
genres
trackId = 2
print(genres['title'][getGenreId(trackId)])
loadSpectro(trackId)
    

Hip-Hop


Variable containing:
( 0 ,.,.) = 
 -32.9089 -32.9568 -29.7125  ...  -35.9170 -33.7426 -38.4198
 -31.5367 -30.3374 -27.5425  ...  -32.6929 -33.3952 -34.4451
 -36.5218 -30.7120 -27.8699  ...  -34.9015 -33.7515 -35.1991
            ...               ⋱              ...            
 -38.5109 -36.4188 -30.5375  ...  -41.9084 -42.2655 -40.9039
 -35.6341 -34.0811 -29.7336  ...  -39.4294 -36.0721 -36.9820
 -32.6071 -36.9618 -30.5371  ...  -36.3189 -34.3709 -34.7984
[torch.DoubleTensor of size 1x224x224]

In [6]:
#https://github.com/srviest/char-cnn-pytorch/blob/master/model_CharCNN2D.py
class SpectroCNN(nn.Module):
    def __init__(self, output_size):
        super(SpectroCNN, self).__init__()    
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1)),
            nn.ReLU()
        ).double()

        self.maxpool1 = nn.MaxPool2d(kernel_size=(4, 4), stride=(1, 1))
        
        self.conv2 = nn.Sequential(
            nn.Conv2d(222, 222, kernel_size=(2, 2), stride=(1,1)),
            nn.ReLU()
        ).double()
        self.maxpool2 = nn.MaxPool2d(kernel_size=(200, 200), stride=(4, 4))
        
        self.fc1 = nn.Sequential(
            nn.Linear(1800, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.5)
        ).double()
        self.fc2 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.5)
        ).double()
        self.fc3 =nn.Linear(1024, output_size).double()
        self.softmax = nn.LogSoftmax()
            # nn.LogSoftmax()

        # self.inference_log_softmax = InferenceBatchLogSoftmax()

    def forward(self, x):
        debug=False
        x = x.unsqueeze(1)
        if debug:
            print('x.size()', x.size())

        x = self.conv1(x)
        if debug:
            print('x after conv1', x.size())

        x = x.transpose(1,3)
        if debug:
            print('x after transpose', x.size())

        x = self.maxpool1(x)
        if debug:
            print('x after maxpool1', x.size())

        x = self.conv2(x)
        if debug:
            print('x after conv2', x.size())

        x = x.transpose(1,3)
        if debug:
            print('x after transpose', x.size())

        x = self.maxpool2(x)
        if debug:
            print('x after maxpool2', x.size())

        x = x.view(x.size(0), -1)
        if debug:
            print('Collapse x:, ', x.size())

        x = self.fc1(x).type(torch.DoubleTensor)
        if debug:
            print('FC1: ', x.size())

        x = self.fc2(x)
        if debug:
            print('FC2: ', x.size())

        x = self.fc3(x)
        if debug:
            print('x: ', x.size())

        x = self.softmax(x)
        # x = self.inference_log_softmax(x)

        return x



In [7]:
train_ids = [2,5,10,140,141,148,182,190,193,194]
tempCat = {'Pop':0, 'Rock':1, 'Hip-Hop':2}

def create_training(start, end):
    training_input = [loadSpectro(train_id).squeeze() for train_id in train_ids[start:end]
                      if trackExists(train_id)]

    # The output data is prepared by representing each output as a binary vector of categories
    training_output = []
    for i in range(start,min(len(train_ids),end)):
        if not(trackExists(train_ids[i])):
            continue
        genre = genres['title'][getGenreId(trackId)]
        training_vector = np.zeros(len(tempCat))
        training_vector[tempCat[genre]] = 1
        training_output.append(training_vector)
    training_output = Variable(torch.FloatTensor(training_output))
    
    return training_input, training_output

def train(model, learning_rate=0.0001, batch_size=1, epochs=1):
    """
    Training function which takes as input a model, a learning rate and a batch size.
  
    After completing a full pass over the data, the function exists, and the input model will be trained.
    """
    # Define the criterion and optimizer.
    criterion = nn.MultiLabelSoftMarginLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Keep track of the losses, for the purposes of plotting.
    train_losses = []

    # Determine number of minibatches
    num_iter = epochs * len(train_ids)//batch_size 
    for i in range(num_iter):
        print("Starting iteration: ", i)
        
        start_idx = i * batch_size % len(train_ids)
        
        training_input, training_output = create_training(start_idx, start_idx + batch_size)

        # Retrieve the next batch of training data.
        x = torch.stack(training_input)
        y = training_output.type(torch.DoubleTensor)

        # Forward pass
        y_pred = model(x)

        # Compute and print loss
        loss = criterion(y_pred, y)

        # Zero gradients, perform backwards pass and update model weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()  
        
        del x,y,y_pred

        if i % 1 == 0:
            train_losses.append(loss.data[0])
            print(i, train_losses[-1])
        
    return train_losses 

# Finally train the model
numClasses = 3
model = SpectroCNN(numClasses)
model.train()
train(model)

Starting iteration:  0




0 5.423885960115509
Starting iteration:  1
1 1.8480728067984742
Starting iteration:  2
2 0.23472917934248896
Starting iteration:  3
3 0.23104908751567466
Starting iteration:  4
4 0.2310490601853169
Starting iteration:  5
5 0.23104906018531735
Starting iteration:  6
6 0.23104906018531504
Starting iteration:  7
7 0.23104908994478238
Starting iteration:  8
8 0.23104906018531504
Starting iteration:  9
9 0.23104906018531504


[5.423885960115509,
 1.8480728067984742,
 0.23472917934248896,
 0.23104908751567466,
 0.2310490601853169,
 0.23104906018531735,
 0.23104906018531504,
 0.23104908994478238,
 0.23104906018531504,
 0.23104906018531504]