In [1]:
from pathlib import Path
from scipy.io import wavfile

In [2]:
import numpy as np

In [3]:
import torch

In [4]:
from torch import nn

In [5]:
from torch.utils import data

In [6]:
from  sklearn.model_selection import train_test_split

In [7]:
from collections import OrderedDict

In [8]:
import utils

In [9]:
class NamedLayer(nn.Sequential):
    """An initially empty ``nn.Sequential`` that carries a few descriptive attributes.

    Attributes:
        name: Label identifying the kind of layer group.
        start: Optional value stored as-is (defaults to None).
        end: Optional value stored as-is (defaults to None).
        bn: Flag stored as-is (defaults to False).
    """

    def __init__(self, name, start=None, end=None, bn=False):
        super().__init__()
        # Plain attributes only; sub-modules are attached later via add_module.
        self.name, self.start, self.end, self.bn = name, start, end, bn

In [10]:
def get_convolutional_layer(index, in_channels, out_channels, kernel_size=3, stride=1, padding=True, batch_normalize=True, activation='leaky'):
    """Build a Conv1d -> (BatchNorm1d) -> (activation) stack wrapped in a NamedLayer.

    Args:
        index: Suffix used in the sub-module names (``conv_{index}``,
            ``bn_{index}``, ``activation_{index}``).
        in_channels: Number of input channels of the convolution.
        out_channels: Number of output channels of the convolution.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: If truthy, pad by ``(kernel_size - 1) // 2`` on each side;
            otherwise use no padding.
        batch_normalize: If truthy, append ``BatchNorm1d`` and create the
            convolution without a bias term.
        activation: ``'leaky'`` appends ``LeakyReLU(0.1)``, ``'relu'`` appends
            ``ReLU``; any other value appends no activation.

    Returns:
        A NamedLayer named ``'convolutional'`` holding the sub-modules in order.
    """
    # The convolution bias is only created when no batch norm follows.
    use_bias = not batch_normalize
    pad = (kernel_size - 1) // 2 if padding else 0

    model = NamedLayer('convolutional', bn=batch_normalize)
    conv_layer = nn.Conv1d(in_channels, out_channels, kernel_size, stride, pad, bias=use_bias)
    # Registered as 'conv_{index}' (no trailing ': ') so the key matches the
    # 'bn_{index}' / 'activation_{index}' naming and is a valid attribute name.
    model.add_module(f'conv_{index}', conv_layer)
    if batch_normalize:
        model.add_module(f'bn_{index}', nn.BatchNorm1d(out_channels))

    if activation == 'leaky':
        model.add_module(f'activation_{index}', nn.LeakyReLU(0.1, inplace=True))
    elif activation == 'relu':
        model.add_module(f'activation_{index}', nn.ReLU(inplace=True))
    return model

In [11]:
def get_block(index, in_channels, out_channels, kernel_size, num=2, batch_normalize=True):
    """Build ``num`` stride-1 convolutional layers followed by a stride-2 downsampling convolution.

    Args:
        index: Block index; used in the block name (``convolutional_{index}``),
            passed to every inner convolutional layer, and used in the
            downsample module name (``downsample_{index}``).
        in_channels: Channels entering the block.
        out_channels: Channels produced by every convolution in the block.
        kernel_size: Kernel size for the inner convolutions and the downsample convolution.
        num: Number of stride-1 convolutional layers before the downsample.
        batch_normalize: Stored on the block and forwarded to each inner
            convolutional layer to enable/disable batch normalization.

    Returns:
        A NamedLayer named ``convolutional_{index}``.
    """
    model = NamedLayer(f'convolutional_{index}', bn=batch_normalize)
    # Bind up front so the downsample conv still gets a valid input width when num == 0.
    channels = in_channels
    for i in range(num):
        conv = get_convolutional_layer(
            index, channels, out_channels, kernel_size,
            stride=1, batch_normalize=batch_normalize,
        )
        channels = out_channels
        model.add_module(f'conv_{i}', conv)

    # Same padding rule as the inner convolutions; equals 1 for kernel_size=3.
    downsample = nn.Conv1d(
        channels, out_channels, kernel_size,
        stride=2, padding=(kernel_size - 1) // 2,
    )
    model.add_module(f'downsample_{index}', downsample)
    return model

In [12]:
def get_linear_layer(in_channels, out_channels, num=2):
    """Build a NamedLayer named ``'linear'`` of ``num`` Linear + ReLU pairs.

    The first Linear maps ``in_channels`` to ``out_channels``; every following
    one maps ``out_channels`` to ``out_channels``. Sub-modules are registered
    as ``linear_{i}`` and ``activation_{i}``.
    """
    model = NamedLayer('linear')
    # Feature widths at each boundary: [in, out, out, ...] with num transitions.
    widths = [in_channels] + [out_channels] * num
    for i, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
        model.add_module(f'linear_{i}', nn.Linear(fan_in, fan_out))
        model.add_module(f'activation_{i}', nn.ReLU(inplace=True))
    return model

In [13]:
def get_softmax_layer(in_channels, num_class):
    """Build a NamedLayer named ``'softmax'``: Linear(in_channels, num_class) then Softmax over dim 1.

    Sub-modules are registered as ``linear`` and ``softmax``.
    """
    model = NamedLayer('softmax')
    stages = (
        ('linear', nn.Linear(in_channels, num_class)),
        ('softmax', nn.Softmax(dim=1)),
    )
    for label, layer in stages:
        model.add_module(label, layer)
    return model

In [14]:
def get_model():
    """Assemble four convolutional blocks (1 -> 16 -> 32 -> 64 -> 128 channels, kernel 3).

    Blocks are registered on an ``nn.Sequential`` as ``layer_0`` .. ``layer_3``.
    """
    model = nn.Sequential()
    # Channel count at each block boundary; consecutive pairs are (in, out).
    widths = (1, 16, 32, 64, 128)
    for idx, (c_in, c_out) in enumerate(zip(widths, widths[1:])):
        block = get_block(idx, in_channels=c_in, out_channels=c_out, kernel_size=3)
        model.add_module(f'layer_{idx}', block)
    return model

In [15]:
class AudioNet(nn.Module):
    """1-D convolutional classifier over fixed-length audio vectors.

    Six convolutional blocks (each ending in a stride-2 convolution) feed a
    fully connected block and a final Linear + Softmax block.

    Args:
        input_length: Number of samples in each input vector; determines the
            input width of the fully connected block.
        name_tuple: Sequence of class names; its length sets the number of
            output classes.
        use_cuda: Flag stored on the instance; the model is not moved to a
            device here.
    """

    # (in_channels, out_channels) of each convolutional block, in forward order.
    _BLOCK_CHANNELS = ((1, 16), (16, 32), (32, 64), (64, 64), (64, 32), (32, 16))
    # Output width of the fully connected block.
    _HIDDEN_FEATURES = 1024
    # Lower clamp applied to probabilities before taking the log in loss().
    _EPS = 1e-12

    def __init__(self,
                 input_length,
                 name_tuple,
                 use_cuda=torch.cuda.is_available()):
        super().__init__()
        self.input_length = input_length
        self.name_tuple = name_tuple
        self.num_class = len(name_tuple)
        self.use_cuda = use_cuda
        self._init()

    @staticmethod
    def _conv_output_length(input_length, num_blocks):
        """Return the sequence length left after ``num_blocks`` convolutional blocks.

        Each block keeps the length through its stride-1 convolutions
        (kernel 3, padding 1) and then applies a kernel-3, padding-1, stride-2
        convolution, which maps a length L to (L - 1) // 2 + 1.
        """
        length = input_length
        for _ in range(num_blocks):
            length = (length - 1) // 2 + 1
        return length

    def _init(self):
        """Create and register all sub-modules in forward order."""
        for idx, (c_in, c_out) in enumerate(self._BLOCK_CHANNELS):
            block = get_block(idx, in_channels=c_in, out_channels=c_out, kernel_size=3)
            self.add_module(f'conv_block_{idx}', block)

        # Flattened width = last block's channels x remaining length
        # (16 * 1379 = 22064 for input_length = 88200).
        last_channels = self._BLOCK_CHANNELS[-1][1]
        remaining = self._conv_output_length(self.input_length, len(self._BLOCK_CHANNELS))
        linear_block = get_linear_layer(last_channels * remaining, self._HIDDEN_FEATURES)
        self.add_module('linear_block', linear_block)

        softmax_block = get_softmax_layer(self._HIDDEN_FEATURES, self.num_class)
        self.add_module('softmax_block', softmax_block)

    def forward(self, x):
        """Run the network on a batch of shape (batch, samples).

        A channel axis is inserted at dim 1, the children are applied in
        registration order, and the activations are flattened to
        (batch, channels * length) right before the child named ``'linear'``.
        Each child's name and input shape are logged at INFO level.

        Returns:
            The output of the last child (softmax over dim 1).
        """
        x = torch.unsqueeze(x, dim=1)
        for module in self.children():
            utils.logger.info(module.name)
            utils.logger.info(x.shape)
            if module.name == 'linear':
                # Collapse (batch, channels, length) into (batch, channels * length).
                x = x.reshape(x.shape[0], -1)
                utils.logger.info(x.shape)
            x = module(x)
        return x

    def loss(self, prediction, target):
        """Negative log-likelihood of ``target`` under ``prediction``.

        ``prediction`` is expected to hold per-class probabilities (what
        ``forward`` returns after its Softmax), so the log is taken here and
        ``NLLLoss`` is applied. ``CrossEntropyLoss`` is not used because it
        applies log-softmax itself and would normalize the probabilities a
        second time.

        Args:
            prediction: Tensor of shape (batch, num_class) with probabilities.
            target: Tensor of shape (batch,) with integer class indices.

        Returns:
            Scalar tensor with the mean loss over the batch.
        """
        # Clamp so a zero probability yields a large finite loss instead of inf.
        log_probs = torch.log(torch.clamp(prediction, min=self._EPS))
        return nn.NLLLoss()(log_probs, target)

In [16]:
# Directory with the FSDKaggle2018 training audio and the CSV holding its metadata.
# NOTE(review): absolute, machine-specific paths — adjust for your environment.
root = Path('/data/FSDKaggle2018/FSDKaggle2018.audio_train')
train_fname = Path('/data/FSDKaggle2018/FSDKaggle2018.meta/train_post_competition.csv')

In [17]:
# Label collection and label lookup from the metadata CSV (project helper; exact
# return types are not visible here — `labels` is later used as the class-name tuple).
labels, label_dct = utils.get_dataset_meta(train_fname)

In [18]:
# Frame with a `test` column used to split train/test; per the logged output
# the helper reads train_validate.csv from the same metadata directory.
full_df = utils.get_test_data(train_fname)

2019-06-20 18:41:03.344 | INFO     | utils:get_test_data:61 - Reading from /data/FSDKaggle2018/FSDKaggle2018.meta/train_validate.csv


In [19]:
# Training split: rows whose `test` flag is False, renumbered from 0.
train_df = full_df.loc[full_df.test.eq(False)].reset_index(drop=True)

In [20]:
# Held-out split: rows whose `test` flag is True, renumbered from 0.
test_df = full_df.loc[full_df.test.eq(True)].reset_index(drop=True)

In [21]:
# Training dataset with inputs of 2 * 44100 = 88200 samples
# (presumably 2 s at 44.1 kHz — confirm against utils.Dataset).
train_dataset = utils.Dataset(root, train_df, input_length=2 * 44100, label_dct=label_dct)

In [22]:
# Test dataset built with the same audio root, input length and label lookup as the training one.
test_dataset =  utils.Dataset(root, test_df, input_length=2 * 44100, label_dct=label_dct)

In [23]:
# Build the network; the number of classes is len(labels).
net = AudioNet(input_length=train_dataset.input_length, name_tuple=labels)

In [24]:
# Loader yielding one training sample per batch.
train_data_iter = data.DataLoader(train_dataset, batch_size=1)

In [25]:
# Reuse the flag stored on the network (defaults to torch.cuda.is_available()).
use_cuda = net.use_cuda

In [26]:
# Move the model parameters to the GPU when CUDA is enabled.
if use_cuda:
    net = net.cuda()

In [27]:
# Pull only the first (data, label) batch for a smoke test; the loop exits
# after one iteration, leaving `dt` and `label` bound for the cells below.
for dt, label in train_data_iter:
    if use_cuda:
        dt = dt.cuda()
        label = label.cuda()
    break

In [28]:
# Shape of the sampled batch: (batch, samples).
dt.shape

torch.Size([1, 88200])

In [29]:
# Flattened width entering the linear block: 1379 remaining time steps x 16 channels.
1379 * 16

22064

In [30]:
# Forward pass on the single batch; forward() logs each stage's name and input shape.
res = net(dt)

2019-06-20 18:41:10.277 | INFO     | __main__:forward:39 - convolutional_0
2019-06-20 18:41:10.278 | INFO     | __main__:forward:40 - torch.Size([1, 1, 88200])
2019-06-20 18:41:10.287 | INFO     | __main__:forward:39 - convolutional_1
2019-06-20 18:41:10.288 | INFO     | __main__:forward:40 - torch.Size([1, 16, 44100])
2019-06-20 18:41:10.293 | INFO     | __main__:forward:39 - convolutional_2
2019-06-20 18:41:10.294 | INFO     | __main__:forward:40 - torch.Size([1, 32, 22050])
2019-06-20 18:41:10.298 | INFO     | __main__:forward:39 - convolutional_3
2019-06-20 18:41:10.299 | INFO     | __main__:forward:40 - torch.Size([1, 64, 11025])
2019-06-20 18:41:10.303 | INFO     | __main__:forward:39 - convolutional_4
2019-06-20 18:41:10.304 | INFO     | __main__:forward:40 - torch.Size([1, 64, 5513])
2019-06-20 18:41:10.307 | INFO     | __main__:forward:39 - convolutional_5
2019-06-20 18:41:10.308 | INFO     | __main__:forward:40 - torch.Size([1, 32, 2757])
2019-06-20 18:41:10.310 | INFO     | 

In [31]:
# Output shape: (batch, number of classes).
res.shape

torch.Size([1, 41])