In [1]:
import os
os.getcwd()

'/home/hj20/dcase_2020_T6'

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
def init_layer(layer):
    """Initialize a Linear or Convolutional layer. """
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, 'bias'):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)


def init_bn(bn):
    """Initialize a Batchnorm layer. """
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.)


class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):

        super(ConvBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels,
                               kernel_size=(5, 5), stride=(1, 1),
                               padding=(1, 1), bias=False)
        
        self.conv2 = nn.Conv2d(in_channels=out_channels, 
                              out_channels=out_channels,
                              kernel_size=(3, 3), stride=(1, 1),
                              padding=(1, 1), bias=False)
        
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        init_layer(self.conv1)
        init_layer(self.conv2)
        init_bn(self.bn1)


    def forward(self, input, pool_size=(2, 2), pool_type='avg'):

        x = input
        x = F.relu_(self.bn1(self.conv1(x)))
        x = F.relu_(self.bn2(self.conv2(x)))
        if pool_type == 'max':
            x = F.max_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg':
            x = F.avg_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg+max':
            x1 = F.avg_pool2d(x, kernel_size=pool_size)
            x2 = F.max_pool2d(x, kernel_size=pool_size)
            x = x1 + x2
        else:
            raise Exception('Incorrect argument!')

        return x


class Cnn14(nn.Module):
    def __init__(self):

        super(Cnn14,self).__init__()

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
                                     
        
        self.init_weight()

    def init_weight(self):
        init_bn(self.bn0)

    def forward(self, input):

        x = input.unsqueeze(1)   # (batch_size, 1, time_steps, mel_bins)

        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)     #(batch_size, 512, T/16, mel_bins/16)

        return x

class Tag(nn.Module):
    def __init__(self,class_num):
        super(Tag, self).__init__()
        self.feature = Cnn14()
        self.fc1 = nn.Linear(512,512,bias=True)
        self.fc = nn.Linear(512,class_num,bias=True)
        self.init_weights()

    def init_weights(self):
        init_layer(self.fc1)
        init_layer(self.fc)

    def forward(self,input):
        '''
        :param input: (batch_size,time_steps, mel_bins)
        :return: ()
        '''
        x = self.feature(input)     #(batch_size, 512, T/16, mel_bins/16)
        x = torch.mean(x,dim=3)     #(batch_size, 512, T/16)
        (x1, _) = torch.max(x, dim=2)
        x2 = torch.mean(x, dim=2)
        x = x1 + x2
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu_(self.fc1(x))
        #(batch_size,class_num)
        output = torch.sigmoid(self.fc(x))
        # output = self.fc(x)

        return output
    
    
    




In [None]:
#tag_concate

In [12]:
class_num

300

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time

from audio_tag.data_loader import tag_loader
import numpy as np
from hparams import hparams as hp
#from encoder import Tag
from tqdm import tqdm

class_num = 500
device = torch.device("cuda")
data_dir = hp.data_dir
learning_rate=1e-3
model = Tag(class_num).to(device)
training_data = tag_loader(data_dir=data_dir, split='development',
                                      batch_size=16,class_num=class_num)
test_data = tag_loader(data_dir=data_dir, split='evaluation',
                               batch_size=16,class_num=class_num)
optimizer =torch.optim.Adam(model.parameters(), lr=learning_rate,
        betas=(0.9, 0.999), eps=1e-08, weight_decay=0., amsgrad=True)
# optimizer = torch.optim.Adam(model.parameters(), lr=hp.lr, weight_decay=1e-6)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.98)
tag_loss = nn.BCELoss()


In [4]:
def train(epoch):
    # bar  = tqdm(training_data,total=len(training_data))
    loss_list = []
    model.train()
    with tqdm(training_data,total=len(training_data)) as bar:
        for i, (feature, tag) in enumerate(bar):
            feature = feature.to(device)
            tag = tag.to(device)
            optimizer.zero_grad()
            out_tag = model(feature)
            loss = tag_loss(out_tag,tag)
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())
            bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch, i, np.mean(loss_list)))
    return np.mean(loss_list)

In [5]:

def test(epoch):
    eva_loss = []
    model.eval()
    with torch.no_grad():
        for i, (feature, tag) in enumerate(test_data):
            feature = feature.to(device)
            tag = tag.to(device)
            out_tag = model(feature)
            loss = tag_loss(out_tag, tag)
            eva_loss.append(loss.item())
    mean_loss = np.mean(eva_loss)
    print("epoch:{:d}--testloss:{:.6f}".format(epoch,mean_loss.item()))

    # return  mean_loss


In [9]:
if __name__ == '__main__':
    train_b = True
    epoch_last = 0
    if train_b:
        # model.load_state_dict(torch.load("./models/280/TagModel_{}.pt".format(str(40))))
        for epoch in range(epoch_last+1,300):
            train(epoch)
            scheduler.step(epoch)
            test(epoch)
            if epoch%5==0:
                torch.save(model.state_dict(),'./models/tag_models/TagModel_{}.pt'.format(epoch))
    else:
        for epoch in range(0,225,5):
            model.load_state_dict(torch.load("./models/tag_models/TagModel_{}.pt".format(str(epoch))))
            test(epoch)


epoch:1 idx:195 loss:0.112109:  64%|██████▍   | 196/305 [03:38<02:01,  1.11s/it]


KeyboardInterrupt: 

In [7]:
class MyModelA(nn.Module):
    def __init__(self):
        super(MyModelA, self).__init__()
        self.fc1 = nn.Linear(10, 2)
        
    def forward(self, x):
        x = self.fc1(x)
        return x
    

class MyModelB(nn.Module):
    def __init__(self):
        super(MyModelB, self).__init__()
        self.fc1 = nn.Linear(20, 2)
        
    def forward(self, x):
        x = self.fc1(x)
        return x


class MyEnsemble(nn.Module):
    def __init__(self, modelA, modelB):
        super(MyEnsemble, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        self.classifier = nn.Linear(4, 2)
        
    def forward(self, x1, x2):
        x1 = self.modelA(x1)
        x2 = self.modelB(x2)
        x = torch.cat((x1, x2), dim=1)
        x = self.classifier(F.relu(x))
        return x


In [19]:
!nvidia-smi

/usr/bin/nvidia-modprobe: unrecognized option: "-s"

ERROR: Invalid commandline, please run `/usr/bin/nvidia-modprobe --help` for
       usage information.

/usr/bin/nvidia-modprobe: unrecognized option: "-s"

ERROR: Invalid commandline, please run `/usr/bin/nvidia-modprobe --help` for
       usage information.

Thu Oct 28 20:36:49 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  Off  | 00000000:01:00.0 Off |                  N/A |
| 35%   34C    P8    21W / 260W |  10375MiB / 11016MiB |      0%      Defaul

In [None]:
!kill 16301

---

image_net으로 학습된 resnet18 사용하여 tag train 학습하기

In [4]:
#프리트레인 된 모델 불러오기
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

model=models.resnet18(pretrained=True).cuda()
num_ftrs=model.fc.in_features
model.fc=nn.Linear(num_ftrs,300)
model=model.cuda()

In [5]:
model = model.to(device)

In [6]:
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [34]:
import torch.nn as nn


class VGGish(nn.Module):
    """
    PyTorch implementation of the VGGish model.
    Adapted from: https://github.com/harritaylor/torch-vggish
    The following modifications were made: (i) correction for the missing ReLU layers, (ii) correction for the
    improperly formatted data when transitioning from NHWC --> NCHW in the fully-connected layers, and (iii)
    correction for flattening in the fully-connected layers.
    """

    def __init__(self):
        super(VGGish, self).__init__()
        
        self.bn0 = nn.BatchNorm2d(64)
        
        
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, 1, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, stride=2),

            nn.Conv2d(64, 128, 1, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, stride=2),

            nn.Conv2d(128, 256, 1, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 1, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, stride=2),

            nn.Conv2d(256, 512, 1, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 1, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, stride=2)
        )
        self.fc = nn.Sequential(
            nn.Linear(512 * 24, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 300),
            nn.ReLU(inplace=True),
        )
    
    def init_weight(self):
        init_bn(self.bn0)
    
    
    def forward(self, input):
        
        x = input.unsqueeze(-1)
        x = self.features(x)
        x = torch.mean(x,dim=3)  
        (x1, _) = torch.max(x, dim=2)
        x2 = torch.mean(x, dim=2)
        x = x1 + x2
        
        x = F.dropout(x, p=0.2, training=self.training)
        
        #x = F.relu_(self.fc(x)) 
        
        output = torch.sigmoid(self.fc(x)) 
        
        return output


def main():
    pass


if __name__ == '__main__':
    main()

In [28]:
x=[[[1, 2],[3, 4]]]
x=torch.tensor(x)

In [29]:
x.shape

torch.Size([1, 2, 2])

In [32]:
x1=x.unsqueeze(-1)

In [33]:
x1.shape

torch.Size([1, 2, 2, 1])

In [35]:
model=VGGish()

In [36]:
model

VGGish(
  (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(1, 1), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=

In [37]:
model = model.to(device)

In [38]:
def train(epoch):
    # bar  = tqdm(training_data,total=len(training_data))
    loss_list = []
    model.train()
    with tqdm(training_data,total=len(training_data)) as bar:
        for i, (feature, tag) in enumerate(bar):
            feature = feature.to(device)
            tag = tag.to(device)
            optimizer.zero_grad()
            out_tag = model(feature)
            loss = tag_loss(out_tag,tag)
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())
            bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch, i, np.mean(loss_list)))
    return np.mean(loss_list)

In [39]:
if __name__ == '__main__':
    train_b = True
    epoch_last = 0
    if train_b:
        # model.load_state_dict(torch.load("./models/280/TagModel_{}.pt".format(str(40))))
        for epoch in range(epoch_last+1,epoch_last+5):
            train(epoch)
            scheduler.step(epoch)
            test(epoch)
            if epoch%5==0:
                torch.save(model.state_dict(),'./models/tag_models/TagModel_{}.pt'.format(epoch))
    else:
        for epoch in range(0,225,5):
            model.load_state_dict(torch.load("./models/tag_models/TagModel_{}.pt".format(str(epoch))))
            test(epoch)

  0%|          | 0/239 [00:00<?, ?it/s]


RuntimeError: Given groups=1, weight of size [64, 1, 1, 1], expected input[16, 2559, 64, 1] to have 1 channels, but got 2559 channels instead

In [9]:
import librosa

In [None]:
l

In [None]:
#차원맞추기
y = audio_data
mel_bands = melspectrogram(
        y=y, sr=sr, n_fft=nb_fft, hop_length=hop_size, win_length=nb_fft,
        window=window_function, center=center, power=power, n_mels=nb_mels,
        fmin=f_min, fmax=f_max, htk=htk, norm=norm).T
logmel_spectrogram = librosa.core.power_to_db(
            mel_bands, ref=1.0, amin=1e-10, 
            top_db=None)
logmel_spectrogram = logmel_spectrogram.astype(np.float32)        




In [39]:
from typing import Tuple, Optional

import torch


class ConvTasNet(torch.nn.Module):
    """Conv-TasNet: a fully-convolutional time-domain audio separation network

    Args:
        num_sources (int): The number of sources to split.
        enc_kernel_size (int): The convolution kernel size of the encoder/decoder, <L>.
        enc_num_feats (int): The feature dimensions passed to mask generator, <N>.
        msk_kernel_size (int): The convolution kernel size of the mask generator, <P>.
        msk_num_feats (int): The input/output feature dimension of conv block in the mask generator, <B, Sc>.
        msk_num_hidden_feats (int): The internal feature dimension of conv block of the mask generator, <H>.
        msk_num_layers (int): The number of layers in one conv block of the mask generator, <X>.
        msk_num_stacks (int): The numbr of conv blocks of the mask generator, <R>.

    Note:
        This implementation corresponds to the "non-causal" setting in the paper.

    Reference:
        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation

          Luo, Yi and Mesgarani, Nima

          https://arxiv.org/abs/1809.07454
    """

    def __init__(
        self,
        num_sources: int = 2,
        # encoder/decoder parameters
        enc_kernel_size: int = 16,
        enc_num_feats: int = 512,
        # mask generator parameters
        msk_kernel_size: int = 3,
        msk_num_feats: int = 128,
        msk_num_hidden_feats: int = 512,
        msk_num_layers: int = 8,
        msk_num_stacks: int = 3,
    ):
        super().__init__()

        self.num_sources = num_sources
        self.enc_num_feats = enc_num_feats
        self.enc_kernel_size = enc_kernel_size
        self.enc_stride = enc_kernel_size // 2

        self.encoder = torch.nn.Conv1d(
            in_channels=1,
            out_channels=enc_num_feats,
            kernel_size=enc_kernel_size,
            stride=self.enc_stride,
            padding=self.enc_stride,
            bias=False,
        )
        self.mask_generator = MaskGenerator(
            input_dim=enc_num_feats,
            num_sources=num_sources,
            kernel_size=msk_kernel_size,
            num_feats=msk_num_feats,
            num_hidden=msk_num_hidden_feats,
            num_layers=msk_num_layers,
            num_stacks=msk_num_stacks,
        )
        self.decoder = torch.nn.ConvTranspose1d(
            in_channels=enc_num_feats,
            out_channels=1,
            kernel_size=enc_kernel_size,
            stride=self.enc_stride,
            padding=self.enc_stride,
            bias=False,
        )

    def _align_num_frames_with_strides(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, int]:
        
        batch_size, num_channels, num_frames = input.shape
        is_odd = self.enc_kernel_size % 2
        num_strides = (num_frames - is_odd) // self.enc_stride
        num_remainings = num_frames - (is_odd + num_strides * self.enc_stride)
        if num_remainings == 0:
            return input, 0

        num_paddings = self.enc_stride - num_remainings
        pad = torch.zeros(
            batch_size,
            num_channels,
            num_paddings,
            dtype=input.dtype,
            device=input.device,
        )
        return torch.cat([input, pad], 2), num_paddings

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Perform source separation. Generate audio source waveforms.

        Args:
            input (torch.Tensor): 3D Tensor with shape [batch, channel==1, frames]

        Returns:
            torch.Tensor: 3D Tensor with shape [batch, channel==num_sources, frames]
        """
        if input.ndim != 3 or input.shape[1] != 1:
            raise ValueError(
                f"Expected 3D tensor (batch, channel==1, frames). Found: {input.shape}"
            )

        # B: batch size
        # L: input frame length
        # L': padded input frame length
        # F: feature dimension
        # M: feature frame length
        # S: number of sources

        padded, num_pads = self._align_num_frames_with_strides(input)  # B, 1, L'
        batch_size, num_padded_frames = padded.shape[0], padded.shape[2]
        feats = self.encoder(padded)  # B, F, M
        masked = self.mask_generator(feats) * feats.unsqueeze(1)  # B, S, F, M
        masked = masked.view(
            batch_size * self.num_sources, self.enc_num_feats, -1
        )  # B*S, F, M
        decoded = self.decoder(masked)  # B*S, 1, L'
        output = decoded.view(
            batch_size, self.num_sources, num_padded_frames
        )  # B, S, L'
        if num_pads > 0:
            output = output[..., :-num_pads]  # B, S, L
        return output

In [41]:
class MaskGenerator(torch.nn.Module):
    """TCN (Temporal Convolution Network) Separation Module

    Generates masks for separation.

    Args:
        input_dim (int): Input feature dimension, <N>.
        num_sources (int): The number of sources to separate.
        kernel_size (int): The convolution kernel size of conv blocks, <P>.
        num_featrs (int): Input/output feature dimenstion of conv blocks, <B, Sc>.
        num_hidden (int): Intermediate feature dimention of conv blocks, <H>
        num_layers (int): The number of conv blocks in one stack, <X>.
        num_stacks (int): The number of conv block stacks, <R>.

    Note:
        This implementation corresponds to the "non-causal" setting in the paper.

    References:
        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
          Luo, Yi and Mesgarani, Nima
          https://arxiv.org/abs/1809.07454
    """

    def __init__(
        self,
        input_dim: int,
        num_sources: int,
        kernel_size: int,
        num_feats: int,
        num_hidden: int,
        num_layers: int,
        num_stacks: int,
    ):
        super().__init__()

        self.input_dim = input_dim
        self.num_sources = num_sources

        self.input_norm = torch.nn.GroupNorm(
            num_groups=1, num_channels=input_dim, eps=1e-8
        )
        self.input_conv = torch.nn.Conv1d(
            in_channels=input_dim, out_channels=num_feats, kernel_size=1
        )

        self.receptive_field = 0
        self.conv_layers = torch.nn.ModuleList([])
        for s in range(num_stacks):
            for l in range(num_layers):
                multi = 2 ** l
                self.conv_layers.append(
                    ConvBlock(
                        io_channels=num_feats,
                        hidden_channels=num_hidden,
                        kernel_size=kernel_size,
                        dilation=multi,
                        padding=multi,
                        # The last ConvBlock does not need residual
                        no_residual=(l == (num_layers - 1) and s == (num_stacks - 1)),
                    )
                )
                self.receptive_field += (
                    kernel_size if s == 0 and l == 0 else (kernel_size - 1) * multi
                )
        self.output_prelu = torch.nn.PReLU()
        self.output_conv = torch.nn.Conv1d(
            in_channels=num_feats, out_channels=input_dim * num_sources, kernel_size=1,
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Generate separation mask.

        Args:
            input (torch.Tensor): 3D Tensor with shape [batch, features, frames]

        Returns:
            torch.Tensor: shape [batch, num_sources, features, frames]
        """
        batch_size = input.shape[0]
        feats = self.input_norm(input)
        feats = self.input_conv(feats)
        output = 0.0
        for layer in self.conv_layers:
            residual, skip = layer(feats)
            if residual is not None:  # the last conv layer does not produce residual
                feats = feats + residual
            output = output + skip
        output = self.output_prelu(output)
        output = self.output_conv(output)
        output = torch.sigmoid(output)
        return output.view(batch_size, self.num_sources, self.input_dim, -1)


In [43]:
class ConvBlock(torch.nn.Module):
    """1D Convolutional block.

    Args:
        io_channels (int): The number of input/output channels, <B, Sc>
        hidden_channels (int): The number of channels in the internal layers, <H>.
        kernel_size (int): The convolution kernel size of the middle layer, <P>.
        padding (int): Padding value of the convolution in the middle layer.
        dilation (int): Dilation value of the convolution in the middle layer.
        no_redisual (bool): Disable residual block/output.

    Note:
        This implementation corresponds to the "non-causal" setting in the paper.

    Reference:
        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation

          Luo, Yi and Mesgarani, Nima

          https://arxiv.org/abs/1809.07454
    """

    def __init__(
        self,
        io_channels: int,
        hidden_channels: int,
        kernel_size: int,
        padding: int,
        dilation: int = 1,
        no_residual: bool = False,
    ):
        super().__init__()

        self.conv_layers = torch.nn.Sequential(
            torch.nn.Conv1d(
                in_channels=io_channels, out_channels=hidden_channels, kernel_size=1
            ),
            torch.nn.PReLU(),
            torch.nn.GroupNorm(num_groups=1, num_channels=hidden_channels, eps=1e-08),
            torch.nn.Conv1d(
                in_channels=hidden_channels,
                out_channels=hidden_channels,
                kernel_size=kernel_size,
                padding=padding,
                dilation=dilation,
                groups=hidden_channels,
            ),
            torch.nn.PReLU(),
            torch.nn.GroupNorm(num_groups=1, num_channels=hidden_channels, eps=1e-08),
        )

        self.res_out = (
            None
            if no_residual
            else torch.nn.Conv1d(
                in_channels=hidden_channels, out_channels=io_channels, kernel_size=1
            )
        )
        self.skip_out = torch.nn.Conv1d(
            in_channels=hidden_channels, out_channels=io_channels, kernel_size=1
        )

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[Optional[torch.Tensor], torch.Tensor]:
        feature = self.conv_layers(input)
        if self.res_out is None:
            residual = None
        else:
            residual = self.res_out(feature)
        skip_out = self.skip_out(feature)
        return residual, skip_out

In [44]:
model=ConvTasNet()

ConvTasNet(
  (encoder): Conv1d(1, 512, kernel_size=(16,), stride=(8,), padding=(8,), bias=False)
  (mask_generator): MaskGenerator(
    (input_norm): GroupNorm(1, 512, eps=1e-08, affine=True)
    (input_conv): Conv1d(512, 128, kernel_size=(1,), stride=(1,))
    (conv_layers): ModuleList(
      (0): ConvBlock(
        (conv_layers): Sequential(
          (0): Conv1d(128, 512, kernel_size=(1,), stride=(1,))
          (1): PReLU(num_parameters=1)
          (2): GroupNorm(1, 512, eps=1e-08, affine=True)
          (3): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
          (4): PReLU(num_parameters=1)
          (5): GroupNorm(1, 512, eps=1e-08, affine=True)
        )
        (res_out): Conv1d(512, 128, kernel_size=(1,), stride=(1,))
        (skip_out): Conv1d(512, 128, kernel_size=(1,), stride=(1,))
      )
      (1): ConvBlock(
        (conv_layers): Sequential(
          (0): Conv1d(128, 512, kernel_size=(1,), stride=(1,))
          (1): PReLU(num_parameters=

In [49]:
 # pool of square window of size=3, stride=2
m = nn.AvgPool2d(3, stride=2)
 # pool of non-square window
m = nn.AvgPool2d((3, 2), stride=(2, 1))
input = torch.randn(20, 16, 50, 32)
output = m(input)

In [51]:
output.shape

torch.Size([20, 16, 24, 31])

In [41]:
CHECKPOINT_PATH="Cnn14_mAP=0.431.pth"
!wget -O $CHECKPOINT_PATH https://zenodo.org/record/3987831/files/Cnn14_mAP%3D0.431.pth?download=1
!MODEL_TYPE="Cnn14"
!CUDA_VISIBLE_DEVICES=0 python3 pytorch/inference.py audio_tagging --model_type=$MODEL_TYPE --checkpoint_path=$CHECKPOINT_PATH --audio_path="resources/R9_ZSCveAHg_7s.wav" --cuda

--2021-05-20 22:03:21--  https://zenodo.org/record/3987831/files/Cnn14_mAP%3D0.431.pth?download=1
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 327428481 (312M) [application/octet-stream]
Saving to: ‘Cnn14_mAP=0.431.pth’


2021-05-20 22:12:03 (615 KB/s) - ‘Cnn14_mAP=0.431.pth’ saved [327428481/327428481]

python3: can't open file 'pytorch/inference.py': [Errno 2] No such file or directory


In [50]:
model = torch.load("Cnn14_mAP=0.431.pth")

In [51]:
model

{'iteration': 660000,
 'model': OrderedDict([('spectrogram_extractor.stft.conv_real.weight',
               tensor([[[ 0.0000e+00,  9.4124e-06,  3.7649e-05,  ...,  8.4709e-05,
                          3.7649e-05,  9.4124e-06]],
               
                       [[ 0.0000e+00,  9.4122e-06,  3.7646e-05,  ...,  8.4695e-05,
                          3.7646e-05,  9.4122e-06]],
               
                       [[ 0.0000e+00,  9.4117e-06,  3.7638e-05,  ...,  8.4652e-05,
                          3.7638e-05,  9.4117e-06]],
               
                       ...,
               
                       [[ 0.0000e+00, -9.4117e-06,  3.7638e-05,  ..., -8.4652e-05,
                          3.7638e-05, -9.4117e-06]],
               
                       [[ 0.0000e+00, -9.4122e-06,  3.7646e-05,  ..., -8.4695e-05,
                          3.7646e-05, -9.4122e-06]],
               
                       [[ 0.0000e+00, -9.4124e-06,  3.7649e-05,  ..., -8.4709e-05,
                    

In [48]:
!pip install config

Collecting config
  Downloading config-0.5.0.post0-py2.py3-none-any.whl (20 kB)
Installing collected packages: config
Successfully installed config-0.5.0.post0


In [49]:
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
import numpy as np
import argparse
import h5py
import math
import time
import logging
import matplotlib.pyplot as plt

import torch
torch.backends.cudnn.benchmark=True
torch.manual_seed(0)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
 
#rom utilities import get_filename
from models import *
import config

classes_num=300

class Transfer_Cnn14(nn.Module):
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num, freeze_base):
        """Classifier for a new task using pretrained Cnn14 as a sub module.
        """
        super(Transfer_Cnn14, self).__init__()
        audioset_classes_num = 527
        
        self.base = Cnn14(sample_rate, window_size, hop_size, mel_bins, fmin, 
            fmax, audioset_classes_num)

        # Transfer to another task layer
        self.fc_transfer = nn.Linear(2048, classes_num, bias=True)

        if freeze_base:
            # Freeze AudioSet pretrained layers
            for param in self.base.parameters():
                param.requires_grad = False

        self.init_weights()

    def init_weights(self):
        init_layer(self.fc_transfer)

    def load_from_pretrain(self, pretrained_checkpoint_path):
        checkpoint = torch.load(pretrained_checkpoint_path)
        self.base.load_state_dict(checkpoint['model'])

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, data_length)
        """
        output_dict = self.base(input, mixup_lambda)
        embedding = output_dict['embedding']

        clipwise_output =  torch.log_softmax(self.fc_transfer(embedding), dim=-1)
        output_dict['clipwise_output'] = clipwise_output
 
        return output_dict




In [44]:
!pip install utilities-package

Collecting utilities-package
  Downloading utilities-package-0.0.8.tar.gz (28 kB)
Collecting bash
  Downloading bash-0.6.tar.gz (2.8 kB)
Collecting configparser
  Downloading configparser-5.0.2-py3-none-any.whl (19 kB)
Collecting unidecode
  Downloading Unidecode-1.2.0-py2.py3-none-any.whl (241 kB)
[K     |████████████████████████████████| 241 kB 2.1 MB/s eta 0:00:01
[?25hCollecting uritools
  Downloading uritools-3.0.2-py3-none-any.whl (12 kB)
Collecting simplejson
  Downloading simplejson-3.17.2-cp37-cp37m-manylinux2010_x86_64.whl (128 kB)
[K     |████████████████████████████████| 128 kB 3.2 MB/s eta 0:00:01
[?25hCollecting tailer
  Downloading tailer-0.4.1.tar.gz (7.5 kB)
Collecting schedule
  Downloading schedule-1.1.0-py2.py3-none-any.whl (10 kB)
Collecting cssutils
  Downloading cssutils-2.2.0-py3-none-any.whl (405 kB)
[K     |████████████████████████████████| 405 kB 3.1 MB/s eta 0:00:01
[?25hCollecting base58
  Downloading base58-2.1.0-py3-none-any.whl (5.6 kB)
Collecting 

Collecting jaraco.classes
  Downloading jaraco.classes-3.2.1-py3-none-any.whl (5.6 kB)
Collecting jaraco.text
  Downloading jaraco.text-3.5.0-py3-none-any.whl (8.1 kB)
Collecting pytest
  Downloading pytest-6.2.4-py3-none-any.whl (280 kB)
[K     |████████████████████████████████| 280 kB 4.4 MB/s eta 0:00:01
Collecting lazy-object-proxy==1.4.*
  Downloading lazy_object_proxy-1.4.3-cp37-cp37m-manylinux1_x86_64.whl (56 kB)
[K     |████████████████████████████████| 56 kB 559 kB/s eta 0:00:01
[?25hCollecting wrapt==1.11.*
  Downloading wrapt-1.11.2.tar.gz (27 kB)
Collecting iniconfig
  Downloading iniconfig-1.1.1-py2.py3-none-any.whl (5.0 kB)
Collecting mando<0.7,>=0.6
  Downloading mando-0.6.4-py2.py3-none-any.whl (29 kB)
Building wheels for collected packages: utilities-package, bash, pycontracts, wrapt, tailer
  Building wheel for utilities-package (setup.py) ... [?25ldone
[?25h  Created wheel for utilities-package: filename=utilities_package-0.0.8-py3-none-any.whl size=38424 sha256

In [None]:
def train(args):

    # Arugments & parameters
    sample_rate = args.sample_rate
    window_size = args.window_size
    hop_size = args.hop_size
    mel_bins = args.mel_bins
    fmin = args.fmin
    fmax = args.fmax
    model_type = args.model_type
    pretrained_checkpoint_path = args.pretrained_checkpoint_path
    freeze_base = args.freeze_base
    device = 'cuda' if (args.cuda and torch.cuda.is_available()) else 'cpu'

    classes_num = config.classes_num
    pretrain = True if pretrained_checkpoint_path else False
    
    # Model
    Model = eval(model_type)
    model = Model(sample_rate, window_size, hop_size, mel_bins, fmin, fmax, 
        classes_num, freeze_base)

    # Load pretrained model
    if pretrain:
        logging.info('Load pretrained model from {}'.format(pretrained_checkpoint_path))
        model.load_from_pretrain(pretrained_checkpoint_path)

    # Parallel
    print('GPU number: {}'.format(torch.cuda.device_count()))
    model = torch.nn.DataParallel(model)

    if 'cuda' in device:
        model.to(device)

    print('Load pretrained model successfully!')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Example of parser. ')
    subparsers = parser.add_subparsers(dest='mode')

    # Train
    parser_train = subparsers.add_parser('train')
    parser_train.add_argument('--sample_rate', type=int, required=True)
    parser_train.add_argument('--window_size', type=int, required=True)
    parser_train.add_argument('--hop_size', type=int, required=True)
    parser_train.add_argument('--mel_bins', type=int, required=True)
    parser_train.add_argument('--fmin', type=int, required=True)
    parser_train.add_argument('--fmax', type=int, required=True) 
    parser_train.add_argument('--model_type', type=str, required=True)
    parser_train.add_argument('--pretrained_checkpoint_path', type=str)
    parser_train.add_argument('--freeze_base', action='store_true', default=False)
    parser_train.add_argument('--cuda', action='store_true', default=False)

    # Parse arguments
    args = parser.parse_args()
    args.filename = get_filename(__file__)

    if args.mode == 'train':
        train(args)

    else:
        raise Exception('Error argument!')
© 2021 GitHub, Inc.