In [1]:
import os
import librosa
import pandas as pd
import numpy as np
import csv
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from PIL import Image

root = './'

In [2]:
def get_data():
    data = pd.read_csv(root + 'train.csv')
    temp = np.squeeze(data.values)
    audio_name = temp[ : , 0]
    audio_label = temp[ : , 3].astype('float32')
    
    return audio_name, audio_label

In [3]:
def get_feature(audio_path):
    y, sr = librosa.load(audio_path)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.99)
    feature = np.vstack((mfcc))
    
    return feature

In [4]:
class myDataset(Dataset):
    def __init__(self, root):
        self.root = root
        self.audio_name, self.audio_label = get_data()
        print("> Found %d audios..." % (len(self.audio_name)))

    def __getitem__(self, idx):
        audio_path = self.root + 'clips/' + self.audio_name[idx]
        self.feature = get_feature(audio_path)
        self.label = self.audio_label[idx]
        
        return self.feature, self.label

    def __len__(self):
        return len(self.audio_name)

In [5]:
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTM, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.num_layers = num_layers

        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, device=x.device).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, device=x.device).requires_grad_()

        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        # Index hidden state of last time step
        # out.size() --> 100, 32, 100
        # out[:, -1, :] --> 100, 100 --> just want last time step hidden states! 
        out = self.fc(out[:, -1, :]) 
        # out.size() --> 100, 10
        return out

In [6]:
def train_model(net, criterion, optimizer, scheduler, num_epochs):
    #train
    for epoch in range(num_epochs):
        print('\nEpoch: %d' % (epoch + 1))
        net.train()
        sum_loss = 0.0
        correct = 0.0
        total = 0.0
        for i, data in enumerate(trainLoader, 0):
            #prepare dataset
            length = len(trainLoader)
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            #forward & backward
            inputs = inputs.float()
            print(inputs.size())
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            #print ac & loss in each batch
            sum_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += predicted.eq(labels.data).cpu().sum()
            print('[epoch:%d, iter:%d] Loss: %.03f' 
                  % (epoch + 1, (i + 1 + epoch * length), sum_loss / (i + 1)))
        
        '''
        #get the ac with testdataset in each epoch
        print('Waiting Test...')
        with torch.no_grad():
            correct = 0
            total = 0
            for data in testLoader:
                net.eval()
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()
            print('Test\'s ac is: %.3f%%' % (100 * correct / total))'''

    print('Train has finished, total epoch is %d' % EPOCH)
    return net

In [7]:
def OutputTest(root, net):
    data = pd.read_csv(root + 'test.csv')
    temp = np.squeeze(data.values)
    audio_name = temp[:, 0]
    with open('submission.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['track', 'score'])
        for i in audio_name:
            audio_path = root + 'clips/' + i
            inputs = get_feature(audio_path)
            inputs  = torch.tensor(inputs, dtype=torch.float32)
            inputs = inputs.unsqueeze(0).to(device)
            print(inputs.size())
            prediction = net(inputs).cpu().detach().numpy()
            print(i + ': ' + str(prediction[0][0]))
            writer.writerow([i, prediction[0][0]])

In [8]:
trainData = myDataset(root=root)
trainLoader = DataLoader(dataset=trainData, batch_size=10, shuffle=True, num_workers=0)

> Found 220 audios...


In [9]:
#check gpu
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

#set hyperparameter
EPOCH = 3
LR = 0.01

#resnet18 = models.resnet18(pretrained=True)
#net = resnet18.to(device)
net = LSTM(216, 20, 2, 1).to(device)

criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9, weight_decay=5e-4)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

net  = train_model(net=net, criterion=criterion, optimizer=optimizer,scheduler=exp_lr_scheduler, num_epochs=EPOCH)

cuda:0

Epoch: 1
torch.Size([10, 20, 216])


  return F.mse_loss(input, target, reduction=self.reduction)


[epoch:1, iter:1] Loss: 0.429
torch.Size([10, 20, 216])
[epoch:1, iter:2] Loss: 0.377
torch.Size([10, 20, 216])
[epoch:1, iter:3] Loss: 0.374
torch.Size([10, 20, 216])
[epoch:1, iter:4] Loss: 0.336
torch.Size([10, 20, 216])
[epoch:1, iter:5] Loss: 0.315
torch.Size([10, 20, 216])
[epoch:1, iter:6] Loss: 0.290
torch.Size([10, 20, 216])
[epoch:1, iter:7] Loss: 0.271
torch.Size([10, 20, 216])
[epoch:1, iter:8] Loss: 0.258
torch.Size([10, 20, 216])
[epoch:1, iter:9] Loss: 0.234
torch.Size([10, 20, 216])
[epoch:1, iter:10] Loss: 0.216
torch.Size([10, 20, 216])
[epoch:1, iter:11] Loss: 0.198
torch.Size([10, 20, 216])
[epoch:1, iter:12] Loss: 0.186
torch.Size([10, 20, 216])
[epoch:1, iter:13] Loss: 0.176
torch.Size([10, 20, 216])
[epoch:1, iter:14] Loss: 0.166
torch.Size([10, 20, 216])
[epoch:1, iter:15] Loss: 0.158
torch.Size([10, 20, 216])
[epoch:1, iter:16] Loss: 0.150
torch.Size([10, 20, 216])
[epoch:1, iter:17] Loss: 0.146
torch.Size([10, 20, 216])
[epoch:1, iter:18] Loss: 0.142
torch.Siz

In [10]:
OutputTest(root=root, net=net)

torch.Size([1, 20, 216])
normalize_5s_intro_0EVVKs6DQLo.wav: 0.57825065
torch.Size([1, 20, 216])
normalize_5s_intro_d7to9URtLZ4.wav: 0.63570565
torch.Size([1, 20, 216])
normalize_5s_intro_TzhhbYS9EO4.wav: 0.6036368
torch.Size([1, 20, 216])
normalize_5s_intro_nn5nypm7GG8.wav: 0.70313174
torch.Size([1, 20, 216])
normalize_5s_intro_hed6HkYNA7g.wav: 0.68021864
torch.Size([1, 20, 216])
normalize_5s_intro_rWznOAwxM1g.wav: 0.6275977
torch.Size([1, 20, 216])
normalize_5s_intro_zyQkFh-E4Ak.wav: 0.67731875
torch.Size([1, 20, 216])
normalize_5s_intro_agKkcRXN2iE.wav: 0.6743226
torch.Size([1, 20, 216])
normalize_5s_intro_SZaZU_qi6Xc.wav: 0.622683
torch.Size([1, 20, 216])
normalize_5s_intro_ZpDQJnI4OhU.wav: 0.64592403
torch.Size([1, 20, 216])
normalize_5s_intro_D4nWzd63jV4.wav: 0.66848975
torch.Size([1, 20, 216])
normalize_5s_intro_9odM1BRqop4.wav: 0.6617746
torch.Size([1, 20, 216])
normalize_5s_intro_F64yFFnZfkI.wav: 0.66003776
torch.Size([1, 20, 216])
normalize_5s_intro_Js2JQH_kt0I.wav: 0.6118697

In [11]:
torch.save(net, 'LSTM_3ep.pth')