In [8]:
import torch
from torch import nn
import torch.nn.functional as F


class XVector(nn.Module):
    """
    X-vector style classifier: four frame-level Conv1d (TDNN) stages,
    mean/std statistics pooling over the time axis, then two linear layers.

    ref) https://github.com/manojpamk/pytorch_xvectors/blob/master/models.py

    Args:
        in_channels: number of channels (feature dimension) of the input.
        numSpkrs: size of the final linear layer, i.e. number of output logits.
        p_dropout: drop probability shared by every dropout layer.
    """
    def __init__(self, in_channels, numSpkrs, p_dropout):
        super().__init__()

        # One row per frame-level stage: (in, out, kernel_size, dilation).
        # Modules are registered as tdnn_k / bn_tdnn_k / dropout_tdnn_k, in
        # that order, so state_dict keys and seeded initialisation are stable.
        frame_specs = (
            (in_channels, 348, 3, 2),
            (348, 348, 3, 3),
            (348, 348, 1, 1),
            (348, 1500, 1, 1),
        )
        for idx, (c_in, c_out, width, dil) in enumerate(frame_specs, start=1):
            setattr(self, f"tdnn_{idx}",
                    nn.Conv1d(c_in, c_out, kernel_size=width, stride=1, dilation=dil))
            setattr(self, f"bn_tdnn_{idx}", nn.BatchNorm1d(c_out))
            setattr(self, f"dropout_tdnn_{idx}", nn.Dropout(p=p_dropout))

        # Statistics pooling concatenates mean and std of the 1500-channel
        # map, hence 2 * 1500 = 3000 input features.
        self.fc_1 = nn.Linear(3000, 128)
        self.bn_fc_1 = nn.BatchNorm1d(128)
        self.dropout_fc_1 = nn.Dropout(p=p_dropout)

        self.fc_2 = nn.Linear(128, numSpkrs)

    def forward(self, x):
        """
        Args:
            x: tensor laid out as (batch_size, feat_dim, chunk_len).

        Returns:
            Output of the last linear layer, one row of ``numSpkrs`` values
            per batch element (no softmax is applied here).
        """
        # Stages 1-3: conv -> ReLU -> batch norm -> dropout.
        h = F.relu(self.tdnn_1(x))
        h = self.dropout_tdnn_1(self.bn_tdnn_1(h))

        h = F.relu(self.tdnn_2(h))
        h = self.dropout_tdnn_2(self.bn_tdnn_2(h))

        h = F.relu(self.tdnn_3(h))
        h = self.dropout_tdnn_3(self.bn_tdnn_3(h))

        # Stage 4 has no ReLU: conv -> batch norm -> dropout.
        h = self.tdnn_4(h)
        h = self.dropout_tdnn_4(self.bn_tdnn_4(h))

        # Statistics pooling: collapse the time axis (dim 2) into mean and
        # standard deviation, concatenated along the channel axis.
        time_mean = h.mean(dim=2)
        time_std = h.std(dim=2)
        stats = torch.cat((time_mean, time_std), dim=1)

        # Segment-level layers.
        emb = F.relu(self.fc_1(stats))
        emb = self.dropout_fc_1(self.bn_fc_1(emb))
        return self.fc_2(emb)


class Lopez2021Encoder(nn.Module):
    """
    Small 1-D convolutional front-end: three unpadded Conv1d layers
    (1 -> 64 -> 32 -> 6 channels, kernel sizes 7 / 5 / 3) followed by a
    stride-2 max pool.

    Takes no constructor arguments; all layer sizes are fixed.
    """
    def __init__(self):
        super().__init__()

        self.conv1d_1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=7, stride=1)
        self.bn_1 = nn.BatchNorm1d(num_features=64)

        self.conv1d_2 = nn.Conv1d(in_channels=64, out_channels=32, kernel_size=5, stride=1)
        self.conv1d_3 = nn.Conv1d(in_channels=32, out_channels=6, kernel_size=3, stride=1)
        self.max_pool = nn.MaxPool1d(kernel_size=2, stride=2)

    def forward(self, x):
        """
        Args:
            x: tensor with a single input channel, as required by
               ``conv1d_1`` (in_channels=1).

        Returns:
            Non-negative 6-channel feature map (ReLU is the last op).
        """
        # Only the first convolution is batch-normalised.
        feat = self.conv1d_1(x)
        feat = F.relu(self.bn_1(feat))

        feat = F.relu(self.conv1d_2(feat))

        # Third conv has no activation of its own; ReLU is applied after pooling.
        feat = self.conv1d_3(feat)
        pooled = self.max_pool(feat)
        return F.relu(pooled)


class Lopez2021(nn.Module):
    """
    Predict the section ID meta-data parameter using the categorical
    cross-entropy loss function.

    Chains ``Lopez2021Encoder`` into a single-channel ``XVector`` head with
    three output logits. ``criterion`` is stored on the module for the
    training code; ``forward`` itself does not apply it.
    """
    def __init__(self):
        super().__init__()

        self.encoder = Lopez2021Encoder()
        self.xvector = XVector(in_channels=1, numSpkrs=3, p_dropout=0.1)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Encode ``x``, flatten it to one channel, and return the x-vector logits."""
        encoded = self.encoder(x)

        # Merge every non-batch dimension into one long sequence so the
        # x-vector head (in_channels=1) sees shape (batch, 1, -1).
        batch_size = encoded.size(0)
        sequence = encoded.reshape(batch_size, 1, -1)

        return self.xvector(sequence)


In [38]:
from torchinfo import summary

class Lopez2021Encoder(nn.Module):
    """
    2-D convolutional front-end for spectrogram-like input.

    Stage 1 uses a kernel that spans ``n_mels`` rows, so when the input's
    frequency axis has exactly ``n_mels`` bins that axis collapses to height 1.
    Stages 2-6 therefore use ``(1, 3)`` kernels (a convolution along time only)
    with ``BatchNorm2d``, which keeps every activation 4-D and lets the stages
    be chained directly.

    Args:
        n_mels: height of the stage-1 kernel; expected to equal the size of
            the input's frequency axis (dim 2).

    NOTE(review): stage 1 emits 64 channels to keep the current ``forward``
    output unchanged; the original draft sized the following layers for 128.
    Confirm the intended width before enabling the later stages.
    """
    def __init__(self, n_mels):
        super(Lopez2021Encoder, self).__init__()

        # Stage 1: full-height kernel, (B, 1, n_mels, T) -> (B, 64, 1, T - 2).
        self.conv1d_1 = nn.Conv2d(1, 64, kernel_size=(n_mels, 3), padding=(0, 0), stride=1)
        # Must match conv1d_1's output channels.
        self.bn_1 = nn.BatchNorm2d(64)

        # Stages 2-6: the frequency axis is already 1, so the kernel height is
        # 1 as well. An (n_mels, 3) kernel here could not be applied to a
        # height-1 map and would allocate ~n_mels times more weights.
        self.conv1d_2 = nn.Conv2d(64, 192, kernel_size=(1, 3), padding=(0, 0), stride=1)
        self.bn_2 = nn.BatchNorm2d(192)

        self.conv1d_3 = nn.Conv2d(192, 192, kernel_size=(1, 3), padding=(0, 0), stride=1)
        self.bn_3 = nn.BatchNorm2d(192)

        self.conv1d_4 = nn.Conv2d(192, 192, kernel_size=(1, 3), padding=(0, 0), stride=1)
        self.bn_4 = nn.BatchNorm2d(192)

        self.conv1d_5 = nn.Conv2d(192, 192, kernel_size=(1, 3), padding=(0, 0), stride=1)
        self.bn_5 = nn.BatchNorm2d(192)

        self.conv1d_6 = nn.Conv2d(192, 192, kernel_size=(1, 3), padding=(0, 0), stride=1)
        self.bn_6 = nn.BatchNorm2d(192)

    def forward(self, x):
        """
        Apply the first convolution only.

        Args:
            x: 4-D tensor (batch, 1, freq, time); ``freq`` must be at least
               ``n_mels`` and ``time`` at least 3 for the unpadded kernel.

        Returns:
            Raw output of ``conv1d_1`` with 64 channels.
        """
        # Stages 2-6 and all batch norms are defined but not wired in yet.
        # Planned form per stage k:
        #   x = F.leaky_relu(self.bn_k(self.conv1d_k(x)), negative_slope=0.01)
        x = self.conv1d_1(x)
        return x

# Build the encoder with a 2048-row kernel and print a layer summary for a
# single (batch=1, channel=1, 2048, 2049) input.
model = Lopez2021Encoder(n_mels=2048)
summary(model, input_size=(1, 1, 2048, 2049))

In [2]:
from data_utils import *
from util import *

# @markdown config

# Experiment configuration, passed as `config=` to the dataset and
# data-loader helpers. Top-level keys hold paths and global settings;
# 'training' holds optimisation settings; each machine type (currently only
# 'ToyCar') has its own feature-extraction sub-dict.
config = dict(
    dev_directory='D:/dcase/dev_data_tmp',
    eval_directory='D:/dcase/eval_data',

    max_fpr=0.1,
    decision_threshold=0.9,
    ext='wav',

    training=dict(
        learning_rate=0.001,
        num_epochs=100,
        # batch_size=512,   (disabled; batch size is set per machine type)
        shuffle=True,
        validation_split=0.1,
        weight_decay=0.0,
        lr_step_size=50,
        lr_gamma=0.5,
        # verbose=1,        (disabled)
    ),

    ########### machine config ###########
    ToyCar=dict(
        batch_size=1,
        input_samples=16384,
        n_mels=2048,
        n_frames=2049,
        n_hop_frames=1,
        n_fft=4096,
        hop_length=80,
        power=2.0,
    ),
)

In [3]:
# List the ToyCar development files under the 'train' directory, matching
# every section ('*'), together with their labels.
files, labels = file_list_generator(
    target_dir='D:/dcase/dev_data_tmp/ToyCar/',
    section_name='*',
    dir_name='train',
    mode='dev',
)

# Wrap the file list in a dataset using the ToyCar feature settings.
dcase_dataset = DcaseDataset(
    files,
    labels,
    config=config,
    machine_config=config['ToyCar'],
    transform=None,
)

# Start with both slots empty, then fill them with the train / validation
# loaders returned by get_dataloader (it must return exactly two values).
data_loader = dict.fromkeys(('train', 'val'))

data_loader['train'], data_loader['val'] = get_dataloader(
    dcase_dataset,
    config=config,
    machine_type='ToyCar',
)

target_dir : D:/dcase/dev_data_tmp/ToyCar/_*
Number of audio files : 4


4it [00:02,  1.40it/s]

Feature Shape: (4, 1, 2048, 2049)
train size: 3, val_size: 0



