In [1]:
from pathlib import Path
from scipy.io import wavfile

In [2]:
import numpy as np

In [3]:
import torch

In [36]:
from torch import nn, optim

In [5]:
from torch.utils import data

In [6]:
from  sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [7]:
from collections import OrderedDict

In [8]:
import utils

In [9]:
import visualize

In [10]:
import json

In [11]:
import time
import sys

In [12]:
from loguru import logger

In [13]:
logger.add(sys.stdout, level='INFO', colorize=True, format="<green>{time}</green> <level>{message}</level>")

1

In [14]:
class Config(object):
    """

    """
    __slots__ = ('root', 'train_fname', 'epoch', 'batch_size', 'input_length', 
                 'save_interval', 'val_interval', 'log_interval', 'lr', 'lr_decay', 
                 'lr_decay_period', 'lr_decay_epoch', 'wd')

    def __init__(self, dct):
        """

        :param dct:
        """
        for k in self.__slots__:
            v = dct.get(k, None)
            if k in {"train_root", "val_root", "trainval_root"}:
                v = Path(v)
            setattr(self, k, v)

In [15]:
def parse_config(config_path):
    """

    :param config_path:
    :return:
    """
    with open(config_path, 'r') as fr:
        dct = json.load(fr)
    return Config(dct)

In [16]:
class NamedLayer(nn.Sequential):
    def __init__(self, name, start=None, end=None, bn=False):
        super().__init__()
        self.name = name
        self.start = start
        self.end = end
        self.bn = bn

In [17]:
def get_convolutional_layer(index, in_channels, out_channels, kernel_size=3, stride=1, padding=True, batch_normalize=True, activation='leaky'):
    
    if batch_normalize:
        bias = False
    else:
        bias = True
        
    if padding:
        padding = (kernel_size - 1) // 2
    else:
        padding = 0

    model = NamedLayer('convolutional', bn=batch_normalize)
    conv_layer = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, bias=bias)
    model.add_module(f'conv_{index}: ', conv_layer)
    if batch_normalize:
        bn_layer = nn.BatchNorm1d(out_channels)
        model.add_module(f'bn_{index}', bn_layer)
        
    if activation == 'leaky':
        activation_layer = nn.LeakyReLU(0.1, inplace=True)
        model.add_module(f'activation_{index}', activation_layer)
    elif activation == 'relu':
        activation_layer = nn.ReLU(inplace=True)
        model.add_module(f'activation_{index}', activation_layer)
    return model

In [18]:
def get_block(index, in_channels, out_channels, kernel_size, num=2, batch_normalize=True):
    """
    """
    model = NamedLayer(f'convolutional_{index}', bn=batch_normalize)
    for i in range(num):
        if i == 0:
            channels = in_channels
        conv = get_convolutional_layer(index, channels, out_channels, kernel_size, stride=1)
        channels = out_channels
        model.add_module(f'conv_{i}', conv)
    
    model.add_module(f'downsample_{index}', nn.Conv1d(channels, out_channels, kernel_size, stride=2, padding=1))
    return model

In [19]:
def get_linear_layer(in_channels, out_channels, num=2):
    """
    """
    model = NamedLayer('linear')
    for i in range(num):
        if i == 0:
            channels = in_channels
        linear_layer = nn.Linear(channels, out_channels)
        channels = out_channels
        model.add_module(f'linear_{i}', linear_layer)
        activation_layer = nn.ReLU(inplace=True)
        model.add_module(f'activation_{i}', activation_layer)
    return model

In [20]:
def get_softmax_layer(in_channels, num_class):
    """
    """
    model = NamedLayer('softmax')
    linear_layer = nn.Linear(in_channels, num_class)
    model.add_module(f'linear', linear_layer)
    softmax_layer = nn.Softmax(dim=1)
    model.add_module(f'softmax', softmax_layer)
    return model

In [21]:
def get_model():
    model = nn.Sequential()
    layer_0 = get_block(0, in_channels=1, out_channels=16, kernel_size=3)
    model.add_module('layer_0', layer_0)
    layer_1 = get_block(1, in_channels=16, out_channels=32, kernel_size=3)
    model.add_module('layer_1', layer_1)
    layer_2 = get_block(2, in_channels=32, out_channels=64, kernel_size=3)
    model.add_module('layer_2', layer_2)
    layer_3 = get_block(3, in_channels=64, out_channels=128, kernel_size=3)
    model.add_module('layer_3', layer_3)
    return model

In [22]:
class AudioNet(nn.Module):
    """
    """

    def __init__(self,
                 input_length,
                 name_tuple,
                 use_cuda=torch.cuda.is_available()):
        super().__init__()
        self.input_length = input_length
        self.name_tuple = name_tuple
        self.num_class = len(name_tuple)
        self.use_cuda=use_cuda
        self._init()
        
    def _init(self):
        conv_block_0 = get_block(0, in_channels=1, out_channels=16, kernel_size=3)
        self.add_module('conv_block_0',conv_block_0)
        conv_block_1 = get_block(1, in_channels=16, out_channels=32, kernel_size=3)
        self.add_module('conv_block_1', conv_block_1)
        conv_block_2 = get_block(2, in_channels=32, out_channels=64, kernel_size=3)
        self.add_module('conv_block_2', conv_block_2)
        conv_block_3 = get_block(3, in_channels=64, out_channels=64, kernel_size=3)
        self.add_module('conv_block_3', conv_block_3)
        conv_block_4 = get_block(4, in_channels=64, out_channels=32, kernel_size=3)
        self.add_module('conv_block_4', conv_block_4)
        conv_block_5 = get_block(5, in_channels=32, out_channels=16, kernel_size=3)
        self.add_module('conv_block_5', conv_block_5)
        linear_block = get_linear_layer(22064, 1024)
        self.add_module('linear_block',linear_block)
        softmax_block = get_softmax_layer(1024, self.num_class)
        self.add_module('softmax_block', softmax_block)
        
    def forward(self, x):
        """
        """
        x = torch.unsqueeze(x, dim=1)
        for module in self.children():
            if module.name == 'linear':
                b, c, l = x.shape
                size = c * l
                x = x.reshape((b, size, ))
            x = module(x)
        return x
    
    def loss(self, prediction, target):
        """
        """
        loss = nn.CrossEntropyLoss()
        return loss(prediction, target)

In [23]:
root = Path('/data/FSDKaggle2018/FSDKaggle2018.audio_train')
train_fname = Path('/data/FSDKaggle2018/FSDKaggle2018.meta/train_post_competition.csv')

In [24]:
labels, label_dct = utils.get_dataset_meta(train_fname)

In [25]:
full_df = utils.get_test_data(train_fname)

2019-06-21 13:59:06.704 | INFO     | utils:get_test_data:59 - Reading from /data/FSDKaggle2018/FSDKaggle2018.meta/train_validate.csv


[32m2019-06-21T13:59:06.704983+0800[0m [1mReading from /data/FSDKaggle2018/FSDKaggle2018.meta/train_validate.csv[0m


In [26]:
train_df = full_df[full_df.test == False]
train_df.index = range(len(train_df))

In [27]:
test_df = full_df[full_df.test == True]
test_df.index = range(len(test_df))

In [28]:
config = parse_config('config.json')

In [42]:
input_length = config.input_length

In [43]:
net = AudioNet(input_length=input_length, name_tuple=labels)

In [44]:
num_class = len(labels)

In [45]:
learning_rate = config.lr
weight_decay = config.wd

val_interval = config.val_interval
save_interval = config.save_interval
log_interval = config.log_interval

lr_decay_epoch = set(config.lr_decay_epoch)
lr_decay = config.lr_decay

In [46]:
epoch = config.epoch
batch_size = config.batch_size

In [47]:
use_cuda = net.use_cuda
if use_cuda:
    net = net.cuda()

In [48]:
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

In [49]:
visualizer = visualize.Visualizer('AudioNet')



In [50]:
def validate(net,
             val_data,
             batch_size,
             epoch_num):
    net.eval()

    num_class = val_data.num_class

    val_data_iter = data.DataLoader(val_data, batch_size=batch_size, shuffle=False)
    use_cuda = net.use_cuda

    total_num = 0
    true_detect_num = 0
    with torch.no_grad():
        for i, (val_data, label) in enumerate(val_data_iter):
            if use_cuda:
                val_data = val_data.cuda()
                label = label.cuda()
            prediction = net(val_data)
            _, prediction_label = torch.max(prediction, dim=1)
            
            total_num += label.shape[0]
            true_detect_num += (prediction_label == label).sum().item()
            
    val_mean_precision = true_detect_num / total_num

    validation_res_dct = {'precision': val_mean_precision}
    return validation_res_dct

In [None]:
for e in range(epoch):
    need_save = False
    start_time = time.time()
    net.train()
    train_dataset = utils.Dataset(root, train_df, input_length, num_class=num_class, label_dct=label_dct)
    test_dataset =  utils.Dataset(root, test_df, input_length, num_class=num_class, label_dct=label_dct)
    
    train_data_iter = data.DataLoader(train_dataset, batch_size=batch_size)
    running_loss = 0.0
    for i, (train_data, label) in enumerate(train_data_iter):
        if use_cuda:
            train_data = train_data.cuda()
            label = label.cuda()
        optimizer.zero_grad()
        prediction = net(train_data)
        loss = net.loss(prediction, label)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    valid_train_res = validate(net, train_dataset, batch_size, epoch)
    loss_dct = {'loss': running_loss}
    precision = valid_train_res['precision']
    logger.info(f'epoch num: {e}, loss: {running_loss}, precision: {precision}, time: {time.time() - start_time}')
    visualizer.plot(loss_dct)
    visualizer.plot(valid_train_res)

2019-06-21 14:07:16.279 | INFO     | __main__:<module>:24 - epoch num: 0, loss: 497.5976984500885, precision: 0.053738317757009345, time: 100.32557511329651


[32m2019-06-21T14:07:16.279680+0800[0m [1mepoch num: 0, loss: 497.5976984500885, precision: 0.053738317757009345, time: 100.32557511329651[0m


2019-06-21 14:08:57.472 | INFO     | __main__:<module>:24 - epoch num: 1, loss: 497.34141659736633, precision: 0.04848130841121495, time: 101.17948293685913


[32m2019-06-21T14:08:57.472002+0800[0m [1mepoch num: 1, loss: 497.34141659736633, precision: 0.04848130841121495, time: 101.17948293685913[0m


2019-06-21 14:10:38.602 | INFO     | __main__:<module>:24 - epoch num: 2, loss: 493.8310194015503, precision: 0.058878504672897194, time: 101.11765503883362


[32m2019-06-21T14:10:38.602361+0800[0m [1mepoch num: 2, loss: 493.8310194015503, precision: 0.058878504672897194, time: 101.11765503883362[0m


2019-06-21 14:12:19.795 | INFO     | __main__:<module>:24 - epoch num: 3, loss: 492.5062665939331, precision: 0.07651869158878505, time: 101.1807975769043


[32m2019-06-21T14:12:19.795585+0800[0m [1mepoch num: 3, loss: 492.5062665939331, precision: 0.07651869158878505, time: 101.1807975769043[0m


2019-06-21 14:14:00.951 | INFO     | __main__:<module>:24 - epoch num: 4, loss: 491.9674143791199, precision: 0.08130841121495327, time: 101.14267301559448


[32m2019-06-21T14:14:00.951174+0800[0m [1mepoch num: 4, loss: 491.9674143791199, precision: 0.08130841121495327, time: 101.14267301559448[0m


2019-06-21 14:15:42.155 | INFO     | __main__:<module>:24 - epoch num: 5, loss: 491.3060681819916, precision: 0.07137850467289719, time: 101.19177293777466


[32m2019-06-21T14:15:42.155751+0800[0m [1mepoch num: 5, loss: 491.3060681819916, precision: 0.07137850467289719, time: 101.19177293777466[0m


2019-06-21 14:17:23.310 | INFO     | __main__:<module>:24 - epoch num: 6, loss: 489.1843647956848, precision: 0.08738317757009346, time: 101.14164304733276


[32m2019-06-21T14:17:23.310134+0800[0m [1mepoch num: 6, loss: 489.1843647956848, precision: 0.08738317757009346, time: 101.14164304733276[0m
