In [20]:
import librosa
import torch
import os
import pandas as pd
import torch.nn as nn
import torch.optim as optim

Dataset for loading data in training and testing

In [21]:
class Dataset(torch.utils.data.Dataset):
    """Sliding-window dataset over per-track MFCC feature rows.

    Every track listed in the label CSV is loaded with librosa, reduced to a
    single MFCC coefficient sequence (``n_mfcc=1``), normalised with
    ``librosa.util.normalize`` and stored as one row of ``mfcc_samples``.
    The first ``input_len`` rows are left as zeros, so track ``i`` lives in
    row ``i + input_len``.

    Args:
        data_path: Directory that the ``track`` file names are relative to.
        label_path: CSV file with a ``track`` column and, in ``'train'``
            mode, a ``score`` column.
        input_len: Number of consecutive rows returned per item.
        mode: ``'train'`` returns ``(window, target)`` pairs; any other
            value returns only the window.
        sr: Accepted for interface compatibility but currently unused;
            audio is always loaded at 32000 Hz.
        input_dim: Width of one feature row. It must equal the number of
            MFCC frames produced for each clip, otherwise the row
            assignment fails.

    Raises:
        ValueError: If ``label_path`` is not given.
    """

    def __init__(self, data_path, label_path=None, input_len=16, mode='train', sr=32000, input_dim=63):
        if label_path is None:
            # pd.read_csv(None) fails with a far less helpful message.
            raise ValueError("label_path is required: it lists the tracks to load")

        self.input_len = input_len
        self.mode = mode
        gt = pd.read_csv(label_path)
        # input_len leading zero rows act as padding before the first track.
        self.mfcc_samples = torch.zeros((gt.shape[0] + input_len, input_dim), dtype=torch.float)

        for i, path in enumerate(gt['track']):
            # NOTE(review): the `sr` argument is deliberately not used here.
            # The load rate stays fixed at 32000 Hz because changing it would
            # change the number of MFCC frames and therefore the required
            # input_dim. Confirm whether `sr` should be honoured.
            y, load_sr = librosa.load(os.path.join(data_path, path), sr=32000)
            mfcc = librosa.feature.mfcc(y=y, sr=load_sr, n_mfcc=1).squeeze()
            mfcc_norm = librosa.util.normalize(mfcc)
            self.mfcc_samples[i + input_len] = torch.from_numpy(mfcc_norm).float()

        if mode == 'train':
            # One float tensor for all scores; also works for an empty CSV and
            # for integer-valued score columns.
            self.target = torch.tensor(gt['score'].to_numpy(), dtype=torch.float)

    def __getitem__(self, index):
        """Return the ``input_len``-row window for ``index`` (plus target in train mode).

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
                This also lets plain ``for item in dataset`` loops terminate.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError(f"index {index} is out of range for dataset of size {size}")

        # NOTE(review): rows index .. index+input_len-1 are returned, i.e. the
        # window stops one row short of row index+input_len, where this
        # track's own features are stored. Confirm that this is intended.
        output = self.mfcc_samples[index:index + self.input_len]
        if self.mode == 'train':
            return output, self.target[index]
        return output

    def __len__(self):
        """Number of windows, which equals the number of tracks loaded."""
        return len(self.mfcc_samples) - self.input_len

GRU regression model

In [22]:
class GRU_regression(nn.Module):
    """GRU encoder followed by a linear head that emits one value per time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, _x):
        """Map a ``(seq, batch, input_size)`` tensor to ``(seq, batch, 1)``."""
        seq_out, _ = self.gru(_x)
        steps, batch, hidden = seq_out.shape
        # Flatten time and batch so the linear head sees a plain 2-D matrix,
        # then restore the (seq, batch, 1) layout.
        flat = seq_out.reshape(steps * batch, hidden)
        return self.fc(flat).reshape(steps, batch, 1)

hyper-parameters

In [23]:
# Optimisation schedule
epochs = 1000      # number of full passes over the training loader

# Audio
sr = 16000         # forwarded to Dataset, which as written loads audio at a fixed 32000 Hz

# Tensor shapes
input_len = 32     # rows per window fed to the GRU (sequence length)
input_dim = 313    # width of one feature row; must match the MFCC frame count per clip
hidden_dim = 32    # GRU hidden-state size

training

In [24]:
# Build the training set and a loader that yields one window at a time, in CSV order.
dataset_train = Dataset('audios/clips', 'train.csv', input_len=input_len,
                        mode='train', sr=sr, input_dim=input_dim)

train_loader = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=1,
    shuffle=False,
    num_workers=0,
)
loss_function = nn.MSELoss()
model = GRU_regression(input_dim, hidden_size=hidden_dim, num_layers=1)

opt = optim.SGD(model.parameters(), lr=0.2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

model.train()

for epoch in range(epochs):
    losses = 0
    for x, t in train_loader:
        opt.zero_grad()
        # The loader yields (batch, input_len, input_dim); the GRU expects
        # (seq, batch, feature). Transposing works for any batch size.
        x = x.transpose(0, 1).contiguous().to(device)

        # out[-1] has shape (batch, 1). Give the target the same shape so
        # MSELoss does not broadcast (and warn about) mismatched sizes.
        t = t.reshape(-1, 1).float().to(device)
        out = model(x)
        # Only the last time step is scored; sigmoid squashes it into (0, 1).
        loss = loss_function(out[-1].sigmoid(), t)
        loss.backward()
        opt.step()
        losses += loss.item()
    # Sum of per-sample losses for this epoch.
    print(losses)

torch.save(model.state_dict(), 'model_final.pth')

  return F.mse_loss(input, target, reduction=self.reduction)


8.004354401041695


testing

In [25]:
# Switch to inference mode and score every window of the test set in CSV order.
model = model.eval()
dataset_test = Dataset('audios/clips', 'test.csv', input_len=input_len, mode='test', sr=sr, input_dim=input_dim)
test_loader = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=1,
    shuffle=False,
    num_workers=0,
)
# No gradients are needed for inference; skipping autograd saves memory and time.
with torch.no_grad():
    for x in test_loader:
        # (batch, input_len, input_dim) -> (seq, batch, feature) as the GRU expects.
        x = x.transpose(0, 1).contiguous().to(device)
        out = model(x)
        # Prediction is the sigmoid of the last time step's output.
        out = out[-1].sigmoid()
        print(out.item())

0.4982593059539795
0.6349506974220276
0.6489795446395874
0.6753785610198975
0.700973629951477
0.7097406387329102
0.7189907431602478
0.7095629572868347
0.7145441174507141
0.7021209597587585
0.708186149597168
0.7187498211860657
0.7108915448188782
0.6995230913162231
0.6997758150100708
