In [1]:
import os
import sys
import numpy as np
import argparse
import h5py
import math
import time
import logging
import matplotlib.pyplot as plt
import torchvision
from sklearn import metrics
import _pickle as cPickle
import shutil

import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import Linear, NLLLoss, LogSoftmax, Sequential
from torch.optim import Adam

from models import Cnn14_scatter
from data_generator import monoDataset

from stft import Spectrogram, LogmelFilterBank

In [2]:
DATA_ROOT = '/home/laura/monophonic/data'

train_npz_path=DATA_ROOT+'/train.npz'
test_npz_path=DATA_ROOT+'/test.npz'

train_dataset = monoDataset(npz_path=train_npz_path, audio_length=220500, classes_num=7)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=32,
    num_workers=8,
    shuffle=True
)

test_dataset = monoDataset(npz_path=test_npz_path, audio_length=220500, classes_num=7)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=32,
    num_workers=8,
    shuffle=True
)

In [3]:
T = 110250
J = 6
Q = 8
num_classes=7
log_eps = 1e-6

In [4]:
sample_rate=44100
window_size=400
hop_size=160
mel_bins=64
fmin=50
fmax=14000
window = 'hann'
center = True
pad_mode = 'reflect'
ref = 1.0
amin = 1e-10
top_db = None

spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, 
    win_length=window_size, window=window, center=center, pad_mode=pad_mode, 
    freeze_parameters=True)

logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, 
    n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, 
    freeze_parameters=True)

spectrogram_extractor.cuda()
logmel_extractor.cuda()

LogmelFilterBank()

In [5]:
# Number of epochs.
num_epochs = 50
# Learning rate for Adam.
lr = 1e-4

In [6]:
first = True

for e in range(num_epochs):
    for batch_data_dict in train_loader:

        batch_input = batch_data_dict['waveform'].cuda()
        batch_target = batch_data_dict['target'].cuda()
        
        y_tr = torch.argmax(batch_target,axis=1)
        y_tr = y_tr.long()

        Sx_tr = spectrogram_extractor(batch_input)
        Sx_tr = logmel_extractor(Sx_tr)
        Sx_tr = torch.reshape(Sx_tr, (Sx_tr.shape[0], Sx_tr.shape[2], Sx_tr.shape[3]))
        
        Sx_tr = torch.mean(Sx_tr, dim=1)

        # This part is very wrong!!!
        mu_tr = Sx_tr.mean(dim=0)
        std_tr = Sx_tr.std(dim=0)
        Sx_tr = (Sx_tr - mu_tr) / std_tr

        if (first):
            num_input = Sx_tr.shape[-1]
            model = Sequential(Linear(num_input, num_classes), LogSoftmax(dim=1))
            optimizer = Adam(model.parameters())
            criterion = NLLLoss()
            model.cuda()
            criterion.cuda()
            
            first = False
            
        model.zero_grad()
        resp = model.forward(Sx_tr)
        loss = criterion(resp, y_tr)
        loss.backward()
        optimizer.step()
        
    # Calculate the response of the training data at the end of this epoch and
    # the average loss.
    resp = model.forward(Sx_tr)
    avg_loss = criterion(resp, y_tr)

    # Try predicting the classes of the signals in the training set and compute
    # the accuracy.
    y_hat = resp.argmax(dim=1)
    accuracy = (y_tr == y_hat).float().mean()

    print('Epoch {}, average loss = {:1.3f}, accuracy = {:1.3f}'.format(
        e, avg_loss, accuracy))

Epoch 0, average loss = 1.542, accuracy = 0.406
Epoch 1, average loss = 1.297, accuracy = 0.500
Epoch 2, average loss = 1.080, accuracy = 0.625
Epoch 3, average loss = 0.887, accuracy = 0.750
Epoch 4, average loss = 0.828, accuracy = 0.781
Epoch 5, average loss = 0.757, accuracy = 0.906
Epoch 6, average loss = 0.894, accuracy = 0.750
Epoch 7, average loss = 0.884, accuracy = 0.719
Epoch 8, average loss = 0.666, accuracy = 0.750
Epoch 9, average loss = 0.716, accuracy = 0.781
Epoch 10, average loss = 0.767, accuracy = 0.812
Epoch 11, average loss = 0.724, accuracy = 0.750
Epoch 12, average loss = 0.630, accuracy = 0.875
Epoch 13, average loss = 0.593, accuracy = 0.844
Epoch 14, average loss = 0.753, accuracy = 0.781
Epoch 15, average loss = 0.601, accuracy = 0.875
Epoch 16, average loss = 0.612, accuracy = 0.812
Epoch 17, average loss = 0.556, accuracy = 0.875
Epoch 18, average loss = 0.767, accuracy = 0.656
Epoch 19, average loss = 0.609, accuracy = 0.875
Epoch 20, average loss = 0.647

In [7]:
avg_acc = 0
avg_loss_total = 0
iteration = 0

for batch_data_dict in test_loader:

        batch_input = batch_data_dict['waveform'].cuda()
        batch_target = batch_data_dict['target'].cuda()
        
        y_ts = torch.argmax(batch_target,axis=1)
        y_ts = y_ts.long()

        Sx_ts = spectrogram_extractor(batch_input)
        Sx_ts = logmel_extractor(Sx_ts)
        Sx_ts = torch.reshape(Sx_ts, (Sx_ts.shape[0], Sx_ts.shape[2], Sx_ts.shape[3]))
        
        Sx_ts = torch.mean(Sx_ts, dim=1)

        Sx_ts = (Sx_ts - mu_tr) / std_tr

        resp = model.forward(Sx_ts)
        avg_loss = criterion(resp, y_ts)

        # Try predicting the labels of the signals in the test data and compute the
        # accuracy.
        y_hat = resp.argmax(dim=1)
        accu = (y_ts == y_hat).float().mean()
        
        avg_loss_total += avg_loss
        avg_acc += accu
        
        iteration +=1

print('TEST, average loss = {:1.3f}, accuracy = {:1.3f}'.format(
        avg_loss_total/iteration, avg_acc/iteration))

TEST, average loss = 0.482, accuracy = 0.844
