In [1]:
import numpy as np
import time
import torch
import torch.nn as nn

In [2]:
def init_layer(layer):
    """Initialize a Linear or Convolutional layer. """
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, 'bias'):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)
            
def init_bn(bn):
    """Initialize a Batchnorm layer. """
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.)

resnet

In [3]:
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):

        super(ConvBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels,
                               kernel_size=(3, 3), stride=(1, 1),
                               padding=(1, 1), bias=False)

        self.conv2 = nn.Conv2d(in_channels=out_channels,
                               out_channels=out_channels,
                               kernel_size=(3, 3), stride=(1, 1),
                               padding=(1, 1), bias=False)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        init_layer(self.conv1)
        init_layer(self.conv2)
        init_bn(self.bn1)
        init_bn(self.bn2)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):

        x = input
        x = F.relu_(self.bn1(self.conv1(x)))
        x = F.relu_(self.bn2(self.conv2(x)))
        if pool_type == 'max':
            x = F.max_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg':
            x = F.avg_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg+max':
            x1 = F.avg_pool2d(x, kernel_size=pool_size)
            x2 = F.max_pool2d(x, kernel_size=pool_size)
            x = x1 + x2
        else:
            raise Exception('Incorrect argument!')

        return x

In [4]:
class _ResNet(nn.Module):
    def __init__(self, block, layers, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(_ResNet, self).__init__()

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group

        self.layer1 = self._make_layer(block, 64, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            if stride == 1:
                downsample = nn.Sequential(
                    _resnet_conv1x1(self.inplanes, planes * block.expansion),
                    norm_layer(planes * block.expansion),
                )
                init_layer(downsample[0])
                init_bn(downsample[1])
            elif stride == 2:
                downsample = nn.Sequential(
                    nn.AvgPool2d(kernel_size=2), 
                    _resnet_conv1x1(self.inplanes, planes * block.expansion),
                    norm_layer(planes * block.expansion),
                )
                init_layer(downsample[1])
                init_bn(downsample[2])

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        return x


In [5]:
class _ResnetBottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(_ResnetBottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        self.stride = stride
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = _resnet_conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = _resnet_conv3x3(width, width)
        self.bn2 = norm_layer(width)
        self.conv3 = _resnet_conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

        self.init_weights()

    def init_weights(self):
        init_layer(self.conv1)
        init_bn(self.bn1)
        init_layer(self.conv2)
        init_bn(self.bn2)
        init_layer(self.conv3)
        init_bn(self.bn3)
        nn.init.constant_(self.bn3.weight, 0)

    def forward(self, x):
        identity = x

        if self.stride == 2:
            x = F.avg_pool2d(x, kernel_size=(2, 2))

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = F.dropout(out, p=0.1, training=self.training)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(identity)

        out += identity
        out = self.relu(out)

        return out


In [6]:
def _resnet_conv1x1(in_planes, out_planes):
    #1x1 convolution
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, bias=False)

In [7]:
def _resnet_conv3x3(in_planes, out_planes):
    #3x3 convolution with padding
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=1,
                     padding=1, groups=1, bias=False, dilation=1)

In [8]:
class ResNet54(nn.Module):
    def __init__(self, classes_num=527):
        
        super(ResNet54, self).__init__()

      

        # Spectrogram extractor
       # self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, 
       #     win_length=window_size, window=window, center=center, pad_mode=pad_mode, 
       #     freeze_parameters=True)

        # Logmel feature extractor
        #self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, 
        #   n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, 
        #   freeze_parameters=True)

        # Spec augmenter
        #self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
        #    freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        # self.conv_block2 = ConvBlock(in_channels=64, out_channels=64)

        self.resnet = _ResNet(block=_ResnetBottleneck, layers=[3, 4, 6, 3], zero_init_residual=True)

        self.conv_block_after1 = ConvBlock(in_channels=2048, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weights()

    def init_weights(self):
        init_bn(self.bn0)
        init_layer(self.fc1)
        init_layer(self.fc_audioset)


    def forward(self, input, mixup_lambda=None):
        """
        Input: (batch_size, data_length)"""

        #x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        #x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        x = input.unsqueeze(1)
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        
        #if self.training:
        #   x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)
        
        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        x = self.resnet(x)
        x = F.avg_pool2d(x, kernel_size=(2, 2))
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        x = self.conv_block_after1(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        
        #x = torch.mean(x, dim=3)
        
        #(x1, _) = torch.max(x, dim=2)
       # x2 = torch.mean(x, dim=2)
       # x = x1 + x2
       # x = F.dropout(x, p=0.5, training=self.training)
       # x = F.relu_(self.fc1(x))
       # embedding = F.dropout(x, p=0.5, training=self.training)
       # clipwise_output = torch.sigmoid(self.fc_audioset(x))
        
        #output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding}

        return x



In [9]:
class Transfer_ResNet54(nn.Module):
    def __init__(self, freeze_base, pretrain_checkpoint=None):
        """Classifier for a new task using pretrained Cnn14 as a sub module.
        """
        super(Transfer_ResNet54, self).__init__()

        audioset_classes_num = 527
        self.base = ResNet54(audioset_classes_num)

        #self.init_weights()

        if pretrain_checkpoint:
            self.load_from_pretrain(pretrain_checkpoint)
            
        #self.base.fc_audioset = nn.Linear(2048, classes_num, bias=True)
        # 안쓰이는이유는 multi-class clasification을 생략하기 때문,

        
        
        #if freeze_base:
        #    ct = 0
        #    for child in self.base.children():
        #        ct += 1
        #        if ct < 4:
        #            for param in child.parameters():
        #                param.requires_grad = False
        
        if freeze_base:
            # 2단계 freeze / 3단계 freeze X
             #Freeze AudioSet pretrained layers
            for param in self.base.parameters():
                param.requires_grad = False

    #def init_weights(self):
        #init_layer(self.fc_transfer)

    def load_from_pretrain(self, pretrained_checkpoint):
        pretrained_checkpoint="/home/hj20/dcase_2020_T6/models/ResNet54_mAP=0.429.pth"
        checkpoint = torch.load(pretrained_checkpoint)
        checkpoint['model'].pop('spectrogram_extractor.stft.conv_imag.weight')#가중치 삭제  
        checkpoint['model'].pop('spectrogram_extractor.stft.conv_real.weight')#가중치 삭제
        checkpoint['model'].pop('logmel_extractor.melW')#가중치 삭제
        self.base.load_state_dict(checkpoint['model'])

    def forward(self, input):
        """Input: (batch_size, data_length)
        """
        output = self.base(input)

        #embedding = output_dict['embedding']
        #clipwise_output = output_dict['clipwise_output']

        return output #, clipwise_output
 

In [10]:
freeze_base=True

In [11]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.transformer import TransformerDecoder,TransformerDecoderLayer

from hparams import hparams as hp
from encoder import Cnn14,Transfer_Cnn14,init_layer


class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, batch_size, dropout=0.5,pretrain_cnn=None,
                 pretrain_emb=None,freeze_cnn=True):
        super(TransformerModel, self).__init__()

        self.model_type = 'resnet+transformer'
        decoder_layers = TransformerDecoderLayer(d_model=nhid, nhead=nhead, dropout=dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layers, nlayers)
        self.word_emb = nn.Embedding(ntoken, nhid)
        self.ninp = ninp
        self.nhid = nhid
        self.fc = nn.Linear(2048, 2048, bias=True)
        self.fc1 = nn.Linear(2048, nhid, bias=True)
        self.dec_fc = nn.Linear(nhid, ntoken)
        self.batch_size = batch_size
        self.ntoken = ntoken

        #def __init__(self, freeze_base, pretrain_checkpoint=None):
        pretrain_cnn="/home/hj20/dcase_2020_T6/models/ResNet54_mAP=0.429.pth"
        
        self.encoder = Transfer_ResNet54(freeze_base=freeze_cnn, pretrain_checkpoint=pretrain_cnn)
        self.dropout = nn.Dropout(dropout)
        self.pos_encoder = PositionalEncoding(nhid, dropout)
        self.generator = nn.Softmax(dim=-1)
        self.init_weights()

        '''
        if pretrain_cnn is not None:
            dict_trained = pretrain_cnn
            dict_new = self.encoder.state_dict().copy()
            new_list = list(self.encoder.state_dict().keys())
            trained_list = list(dict_trained.keys())
            for i in range(len(new_list)):
                dict_new[new_list[i]] = dict_trained[trained_list[i]]
            self.encoder.load_state_dict(dict_new)
        
        if freeze_cnn:
            self.freeze_cnn()
        '''

        if pretrain_emb is not None:
            self.word_emb.weight.data = pretrain_emb

    '''
    def freeze_cnn(self):
        for p in self.encoder.parameters():
            p.requires_grad = False
    '''

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        init_layer(self.fc1)
        init_layer(self.fc)
        self.word_emb.weight.data.uniform_(-initrange, initrange)
        self.dec_fc.bias.data.zero_()
        self.dec_fc.weight.data.uniform_(-initrange, initrange)

    def encode(self, src, input_mask=None):
        global x 
        x = self.encoder(src)  # (batch_size, 2048, T/16, mel_bins/16) ,mixup
        x = torch.mean(x, dim=3)  # (batch_size, 2048, T/16)
        x = x.permute(2, 0, 1)  # (T/16,batch_size,2048)
        x = F.relu_(self.fc(x))
        x = F.dropout(x, p=0.2, training=self.training)
        x = torch.relu(self.fc1(x))
        return x

    def decode(self, mem, tgt, input_mask=None, target_mask=None, target_padding_mask=None):
        # tgt:(batch_size,T_out)
        # mem:(T_mem,batch_size,nhid)

        tgt = tgt.transpose(0, 1)  # (T_out,batch_size)
        if target_mask is None or target_mask.size(0) != len(tgt):
            device = tgt.device
            target_mask = self.generate_square_subsequent_mask(len(tgt)).to(device)

        tgt = self.dropout(self.word_emb(tgt)) * math.sqrt(self.nhid)
        tgt = self.pos_encoder(tgt)
        # mem = self.pos_encoder(mem)
        output = self.transformer_decoder(tgt, mem, memory_mask=input_mask, tgt_mask=target_mask,
                                          tgt_key_padding_mask=target_padding_mask)
        output = self.dec_fc(output)
        return output

    def forward(self, src, tgt, input_mask=None, target_mask=None, target_padding_mask=None):
        # src:(batch_size,T_in,feature_dim)
        # tgt:(batch_size,T_out)
        mem = self.encode(src)
        output = self.decode(mem, tgt, input_mask=input_mask, target_mask=target_mask,
                             target_padding_mask=target_padding_mask)
        return output


In [12]:
import torch
import torch.nn as nn
import time

from data_handling import get_clotho_loader, get_test_data_loader
#from model import TransformerModel  # , RNNModel, RNNModelSmall
import itertools
import numpy as np
import os
import sys
import logging
import csv

from util import get_file_list, get_padding, print_hparams, greedy_decode, \
    calculate_bleu, calculate_spider, LabelSmoothingLoss, beam_search, align_word_embedding, gen_str
from hparams import hparams
from torch.utils.tensorboard import SummaryWriter

import argparse

hp = hparams()
parser = argparse.ArgumentParser(description='hparams for model')

device = torch.device('cuda')
np.random.seed(hp.seed)
torch.manual_seed(hp.seed)

<torch._C.Generator at 0x7ff1380bb990>

In [13]:
pretrain_emb = align_word_embedding(hp.word_dict_pickle_path, hp.pretrain_emb_path, hp.ntoken,
                                        hp.nhid) if hp.load_pretrain_emb else None

In [14]:
model = TransformerModel(hp.ntoken, hp.ninp, hp.nhead, hp.nhid, hp.nlayers, hp.batch_size, dropout=0.2,
                             pretrain_cnn="/home/hj20/dcase_2020_T6/models/ResNet54_mAP=0.429.pth", pretrain_emb=pretrain_emb, freeze_cnn=True).to(device)

In [19]:
model

TransformerModel(
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (linear1): Linear(in_features=192, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=192, bias=True)
        (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
        (dropout3): Dropout(p=0.2, inplace=False)
      )
  

In [15]:
#model parameter 확인
for name, param in model.named_parameters(): 
    print(f'name:{name}') 
    print(type(param)) 
    print(f'param.shape:{param.shape}') 
    print(f'param.requries_grad:{param.requires_grad}') 
    print('=====')

name:transformer_decoder.layers.0.self_attn.in_proj_weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576, 192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.self_attn.in_proj_bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.self_attn.out_proj.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([192, 192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.self_attn.out_proj.bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.multihead_attn.in_proj_weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576, 192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.multihead_attn.in_proj_bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576])
param.requries_grad:True
=====
name:transformer_decoder.

In [16]:
#swa
from torchcontrib.optim import SWA
import torchcontrib

base_opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=hp.lr, weight_decay=1e-6)
optimizer = torchcontrib.optim.SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.0001)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, hp.scheduler_decay)

In [21]:
#swa 안할때
optimizer=torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=hp.lr, weight_decay=1e-6)

scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, hp.scheduler_decay)


In [17]:
data_dir = hp.data_dir
eval_data_dir = hp.eval_data_dir
train_data_dir = hp.train_data_dir
word_dict_pickle_path = hp.word_dict_pickle_path
word_freq_pickle_path = hp.word_freq_pickle_path
test_data_dir = hp.test_data_dir

In [18]:
#mixup
#data_dir = hp.data_dir
#eval_data_dir = hp.eval_data_dir
#train_data_dir = hp.train_data_dir
word_dict_pickle_path = hp.word_dict_pickle_path
word_freq_pickle_path = hp.word_freq_pickle_path
#test_data_dir = hp.test_data_dir

In [19]:
training_data = get_clotho_loader(data_dir=data_dir, split='development',
                                      input_field_name='features',
                                      output_field_name='words_ind',
                                      load_into_memory=False,
                                      batch_size=hp.batch_size,
                                      nb_t_steps_pad='max',
                                      num_workers=4, return_reference=True, augment=hp.spec_augmentation)

In [25]:
#전체 데이터 
from tqdm import tqdm
tqdm(training_data)

  0%|          | 0/6103 [00:00<?, ?it/s]

  0%|          | 0/6103 [00:00<?, ?it/s]

In [26]:
import pickle
#워드 개수 확인
with open('./create_dataset/data/pickles/words_frequencies.p','rb') as f:
    words_freq=pickle.load(f)
words_freq

[24420,
 24739,
 1,
 718,
 4808,
 46,
 16,
 13138,
 17,
 24420,
 45,
 28,
 71,
 329,
 873,
 5,
 7333,
 12184,
 768,
 1,
 97,
 149,
 45,
 168,
 132,
 555,
 1,
 49,
 3225,
 1,
 241,
 1844,
 9147,
 81,
 1,
 991,
 455,
 14,
 7,
 3,
 330,
 1935,
 36,
 12,
 62,
 2,
 3654,
 258,
 90,
 84,
 79,
 2134,
 1,
 5,
 75,
 4060,
 1703,
 40,
 2369,
 468,
 67,
 630,
 2,
 114,
 15,
 5,
 2986,
 1905,
 52,
 481,
 2,
 5,
 315,
 3003,
 121,
 811,
 4,
 2,
 2,
 31,
 2541,
 15,
 13,
 172,
 502,
 567,
 301,
 844,
 1,
 2748,
 2229,
 28,
 60,
 133,
 2,
 423,
 262,
 88,
 52,
 1,
 806,
 282,
 22,
 211,
 41,
 759,
 447,
 338,
 142,
 454,
 2337,
 3,
 5,
 1,
 22,
 1,
 129,
 23,
 268,
 809,
 692,
 630,
 417,
 3,
 148,
 20,
 55,
 91,
 38,
 241,
 2309,
 783,
 4,
 2,
 2,
 4,
 52,
 2,
 134,
 428,
 107,
 25,
 1,
 461,
 11,
 129,
 36,
 87,
 492,
 508,
 7,
 16,
 28,
 61,
 27,
 397,
 40,
 15,
 25,
 117,
 22,
 77,
 873,
 68,
 21,
 5,
 1,
 10,
 44,
 298,
 428,
 29,
 103,
 1259,
 128,
 1404,
 1,
 1149,
 271,
 1,
 1,
 274,
 123,
 5

In [35]:
len(words_freq)

4371

In [20]:
evaluation_beam = get_clotho_loader(data_dir=data_dir, split='evaluation',
                                        input_field_name='features',
                                        output_field_name='words_ind',
                                        load_into_memory=False,
                                        batch_size=32,
                                        nb_t_steps_pad='max',
                                        shuffle=False,
                                        return_reference=True)

In [21]:
test_data = get_test_data_loader(data_dir=test_data_dir,
                                     batch_size=hp.batch_size * 2,
                                     nb_t_steps_pad='max',
                                     shuffle=False,
                                     drop_last=False,
                                     input_pad_at='start',
                                     num_workers=8)

  cpuset_checked))


In [22]:
def train():
    model.train()
    total_loss_text = 0.
    start_time = time.time()
    batch = 0
    for src, tgt, tgt_len,ref in training_data:
        src = src.to(device)
        tgt = tgt.to(device)
        tgt_pad_mask = get_padding(tgt, tgt_len)
        tgt_in = tgt[:, :-1]
        tgt_pad_mask = tgt_pad_mask[:, :-1]
        tgt_y = tgt[:, 1:]

        optimizer.zero_grad()
        
        output = model(src, tgt_in, target_padding_mask=tgt_pad_mask)

        loss_text = criterion(output.contiguous().view(-1, hp.ntoken), tgt_y.transpose(0, 1).contiguous().view(-1))
        loss = loss_text
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), hp.clip_grad)
        
        optimizer.step()
        
    
        total_loss_text += loss_text.item()

        writer.add_scalar('Loss/train-text', loss_text.item(), (epoch - 1) * len(training_data) + batch)
        
        
        batch += 1
        
        if batch % hp.log_interval == 0 and batch > 0:
            mean_text_loss = total_loss_text / hp.log_interval
            elapsed = time.time() - start_time
            current_lr = [param_group['lr'] for param_group in optimizer.param_groups][0]
            logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                         'loss-text {:5.4f}'.format(
                epoch, batch, len(training_data), current_lr,
                elapsed * 1000 / hp.log_interval, mean_text_loss))
            total_loss_text = 0
            start_time = time.time()
        
            optimizer.swap_swa_sgd()

def eval_all(evaluation_data, max_len=30, eos_ind=9, word_dict_pickle_path=None):
    model.eval()
    with torch.no_grad():
        output_sentence_all = []
        ref_all = []
        for src, tgt, _, ref in evaluation_data:
            src = src.to(device)
            output = greedy_decode(model, src, max_len=max_len)

            output_sentence_ind_batch = []
            for i in range(output.size()[0]):
                output_sentence_ind = []
                for j in range(1, output.size(1)):
                    sym = output[i, j]
                    if sym == eos_ind: break
                    output_sentence_ind.append(sym.item())
                output_sentence_ind_batch.append(output_sentence_ind)
            output_sentence_all.extend(output_sentence_ind_batch)
            ref_all.extend(ref)
        score, output_str, ref_str = calculate_spider(output_sentence_all, ref_all, word_dict_pickle_path)

        loss_mean = score
        writer.add_scalar(f'Loss/eval_greddy', loss_mean, epoch)
        msg = f'eval_greddy SPIDEr: {loss_mean:2.4f}'
        logging.info(msg)


def eval_with_beam(evaluation_data, max_len=30, eos_ind=9, word_dict_pickle_path=None, beam_size=3):
    model.eval()
    with torch.no_grad():
        output_sentence_all = []
        ref_all = []
        for src, tgt, _, ref in evaluation_data:
            src = src.to(device)
            output = beam_search(model, src, max_len, start_symbol_ind=0, beam_size=beam_size)

            output_sentence_ind_batch = []
            for single_sample in output:
                output_sentence_ind = []
                for sym in single_sample:
                    if sym == eos_ind: break
                    output_sentence_ind.append(sym.item())
                output_sentence_ind_batch.append(output_sentence_ind)
            output_sentence_all.extend(output_sentence_ind_batch)
            ref_all.extend(ref)

        score, output_str, ref_str = calculate_spider(output_sentence_all, ref_all, word_dict_pickle_path)

        loss_mean = score
        writer.add_scalar(f'Loss/eval_beam', loss_mean, epoch)
        msg = f'eval_beam_{beam_size} SPIDEr: {loss_mean:2.4f}'
        logging.info(msg)


def test_with_beam(test_data, max_len=30, eos_ind=9, beam_size=3):
    model.eval()

    with torch.no_grad():
        with open("test_out.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerow(['file_name', 'caption_predicted'])
            for src, filename in test_data:
                src = src.to(device)
                output = beam_search(model, src, max_len, start_symbol_ind=0, beam_size=beam_size)

                output_sentence_ind_batch = []
                for single_sample in output:
                    output_sentence_ind = []
                    for sym in single_sample:
                        if sym == eos_ind: break
                        output_sentence_ind.append(sym.item())
                    output_sentence_ind_batch.append(output_sentence_ind)
                out_str = gen_str(output_sentence_ind_batch, hp.word_dict_pickle_path)
                for caption, fn in zip(out_str, filename):
                    writer.writerow(['{}.wav'.format(fn), caption])


In [23]:
if hp.label_smoothing:
    criterion = LabelSmoothingLoss(hp.ntoken, smoothing=0.1)
else:
    criterion = nn.CrossEntropyLoss(ignore_index=hp.ntoken - 1)

now_time = str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time())))
log_dir = 'models/{name}'.format(name=hp.name)

writer = SummaryWriter(log_dir=log_dir)

log_path = os.path.join(log_dir, 'train.log')

logging.basicConfig(level=logging.DEBUG,
                        format=
                        '%(asctime)s - %(levelname)s: %(message)s',
                        handlers=[
                            logging.FileHandler(log_path),
                            logging.StreamHandler(sys.stdout)]
                        )


In [24]:
    logging.info(str(model))

    logging.info(str(print_hparams(hp)))

    logging.info('Data loaded!')
    logging.info('Data size: ' + str(len(training_data)))

    logging.info('Total Model parameters: ' + str(sum(p.numel() for p in model.parameters() if p.requires_grad)))

2021-12-01 00:14:07,072 - INFO: TransformerModel(
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (linear1): Linear(in_features=192, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=192, bias=True)
        (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
        (dropout3): Dropout(

2021-12-01 00:14:07,073 - INFO: {'batch_size': 8, 'beam_width': 3, 'checkpoint_save_interval': 5, 'clip_grad': 2.5, 'data_dir': PosixPath('/home/hj20/dcase_2020_T6/create_dataset/data/data_splits'), 'device': 'cuda', 'eval_data_dir': '/home/hj20/dcase_2020_T6/create_dataset/data/data_splits/evaluation', 'freeze_cnn': True, 'label_smoothing': True, 'load_pretrain_cnn': True, 'load_pretrain_emb': False, 'load_pretrain_model': True, 'log_interval': 100, 'lr': 0.0001, 'mode': 'train', 'name': '1130resnet', 'nhead': 4, 'nhid': 192, 'ninp': 64, 'nkeyword': 4979, 'nlayers': 2, 'ntoken': 4371, 'pretrain_cnn_path': '/home/hj20/dcase_2020_T6/models/tag_models/TagModel_45.pt', 'pretrain_emb_path': '/home/hj20/dcase_2020_T6/models/w2v_192.mod', 'pretrain_model_path': '/home/hj20/dcase_2020_T6/models/base/46.pt', 'scheduler_decay': 0.98, 'seed': 1111, 'spec_augmentation': True, 'test_data_dir': '/home/hj20/dcase_2020_T6/create_dataset/data/test_data', 'train_data_dir': '/home/hj20/dcase_2020_T6/cre

In [25]:
#일부 레이어 1130
epoch = 1
if hp.mode == 'train':
    while epoch < hp.training_epochs + 1:
        epoch_start_time = time.time()
        train()
        torch.save(model.state_dict(), '{log_dir}/{num_epoch}.pt'.format(log_dir=log_dir, num_epoch=epoch))
        scheduler.step(epoch)
        eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=2)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=3)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=4)
        epoch += 1

2021-12-01 00:14:32,676 - INFO: | epoch   1 |   100/ 3051 batches | lr 1.00e-04 | ms/batch 247.08 | loss-text 5.7219
2021-12-01 00:14:56,340 - INFO: | epoch   1 |   200/ 3051 batches | lr 1.00e-04 | ms/batch 236.63 | loss-text 5.1582
2021-12-01 00:15:18,873 - INFO: | epoch   1 |   300/ 3051 batches | lr 1.00e-04 | ms/batch 225.32 | loss-text 4.9298
2021-12-01 00:15:41,403 - INFO: | epoch   1 |   400/ 3051 batches | lr 1.00e-04 | ms/batch 225.30 | loss-text 4.8013
2021-12-01 00:16:04,053 - INFO: | epoch   1 |   500/ 3051 batches | lr 1.00e-04 | ms/batch 226.49 | loss-text 4.6737
2021-12-01 00:16:26,674 - INFO: | epoch   1 |   600/ 3051 batches | lr 1.00e-04 | ms/batch 226.20 | loss-text 4.6072
2021-12-01 00:16:49,139 - INFO: | epoch   1 |   700/ 3051 batches | lr 1.00e-04 | ms/batch 224.65 | loss-text 4.6083
2021-12-01 00:17:11,700 - INFO: | epoch   1 |   800/ 3051 batches | lr 1.00e-04 | ms/batch 225.60 | loss-text 4.6089
2021-12-01 00:17:34,328 - INFO: | epoch   1 |   900/ 3051 batche



loading annotations into memory...
0:00:00.004220
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 16659, 'reflen': 11542, 'guess': [16659, 15635, 14611, 13587], 'correct': [3709, 862, 197, 10]}
ratio: 1.4433373765377366
Bleu_1: 0.223
Bleu_2: 0.111
Bleu_3: 0.055
Bleu_4: 0.019
computing METEOR score...
METEOR: 0.082
computing Rouge score...
ROUGE_L: 0.227
computing CIDEr score...
CIDEr: 0.035
computing SPICE score...
SPICE: 0.046
computing SPIDEr score...
SPIDEr: 0.041
2021-12-01 00:27:40,319 - INFO: eval_greddy SPIDEr: 0.0405
loading annotations into memory...
0:00:00.003877
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 16055, 'reflen': 11260, 'guess': [16055, 15031, 14007, 12983], 'correct': [4380, 



loading annotations into memory...
0:00:00.003730
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 11518, 'reflen': 11194, 'guess': [11518, 10494, 9470, 8446], 'correct': [4895, 1253, 344, 50]}
ratio: 1.0289440771841138
Bleu_1: 0.425
Bleu_2: 0.225
Bleu_3: 0.123
Bleu_4: 0.057
computing METEOR score...
METEOR: 0.121
computing Rouge score...
ROUGE_L: 0.303
computing CIDEr score...
CIDEr: 0.121
computing SPICE score...
SPICE: 0.070
computing SPIDEr score...
SPIDEr: 0.096
2021-12-01 00:42:26,886 - INFO: eval_greddy SPIDEr: 0.0956
loading annotations into memory...
0:00:00.003777
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9495, 'reflen': 9829, 'guess': [9495, 8471, 7447, 6423], 'correct': [4510, 1398, 4



loading annotations into memory...
0:00:00.003793
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 11789, 'reflen': 11241, 'guess': [11789, 10765, 9741, 8717], 'correct': [5357, 1606, 532, 132]}
ratio: 1.0487501111999777
Bleu_1: 0.454
Bleu_2: 0.260
Bleu_3: 0.155
Bleu_4: 0.087
computing METEOR score...
METEOR: 0.135
computing Rouge score...
ROUGE_L: 0.324
computing CIDEr score...
CIDEr: 0.178
computing SPICE score...
SPICE: 0.087
computing SPIDEr score...
SPIDEr: 0.132
2021-12-01 00:56:32,649 - INFO: eval_greddy SPIDEr: 0.1321
loading annotations into memory...
0:00:00.003830
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9097, 'reflen': 9632, 'guess': [9097, 8073, 7049, 6025], 'correct': [4614, 1557, 



loading annotations into memory...
0:00:00.003903
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9744, 'reflen': 10026, 'guess': [9744, 8720, 7696, 6672], 'correct': [4808, 1371, 455, 100]}
ratio: 0.9718731298622609
Bleu_1: 0.479
Bleu_2: 0.271
Bleu_3: 0.161
Bleu_4: 0.088
computing METEOR score...
METEOR: 0.130
computing Rouge score...
ROUGE_L: 0.320
computing CIDEr score...
CIDEr: 0.187
computing SPICE score...
SPICE: 0.086
computing SPIDEr score...
SPIDEr: 0.137
2021-12-01 01:10:36,417 - INFO: eval_greddy SPIDEr: 0.1368
loading annotations into memory...
0:00:00.004003
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8992, 'reflen': 9680, 'guess': [8992, 7968, 6944, 5920], 'correct': [4703, 1594, 590



loading annotations into memory...
0:00:00.003856
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10833, 'reflen': 10630, 'guess': [10833, 9809, 8785, 7761], 'correct': [5358, 1653, 521, 120]}
ratio: 1.0190968955784554
Bleu_1: 0.495
Bleu_2: 0.289
Bleu_3: 0.170
Bleu_4: 0.094
computing METEOR score...
METEOR: 0.142
computing Rouge score...
ROUGE_L: 0.334
computing CIDEr score...
CIDEr: 0.209
computing SPICE score...
SPICE: 0.093
computing SPIDEr score...
SPIDEr: 0.151
2021-12-01 01:24:39,783 - INFO: eval_greddy SPIDEr: 0.1511
loading annotations into memory...
0:00:00.003869
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8948, 'reflen': 9571, 'guess': [8948, 7924, 6900, 5876], 'correct': [4920, 1676, 6



loading annotations into memory...
0:00:00.004076
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10828, 'reflen': 10678, 'guess': [10828, 9804, 8780, 7756], 'correct': [5314, 1622, 531, 118]}
ratio: 1.0140475744520496
Bleu_1: 0.491
Bleu_2: 0.285
Bleu_3: 0.170
Bleu_4: 0.093
computing METEOR score...
METEOR: 0.140
computing Rouge score...
ROUGE_L: 0.332
computing CIDEr score...
CIDEr: 0.211
computing SPICE score...
SPICE: 0.092
computing SPIDEr score...
SPIDEr: 0.151
2021-12-01 01:38:58,206 - INFO: eval_greddy SPIDEr: 0.1513
loading annotations into memory...
0:00:00.004079
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8967, 'reflen': 9646, 'guess': [8967, 7943, 6919, 5895], 'correct': [4886, 1742, 6



loading annotations into memory...
0:00:00.003833
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10702, 'reflen': 10637, 'guess': [10702, 9678, 8654, 7630], 'correct': [5600, 1808, 609, 149]}
ratio: 1.0061107455108578
Bleu_1: 0.523
Bleu_2: 0.313
Bleu_3: 0.190
Bleu_4: 0.108
computing METEOR score...
METEOR: 0.150
computing Rouge score...
ROUGE_L: 0.347
computing CIDEr score...
CIDEr: 0.251
computing SPICE score...
SPICE: 0.098
computing SPIDEr score...
SPIDEr: 0.175
2021-12-01 01:53:17,970 - INFO: eval_greddy SPIDEr: 0.1745
loading annotations into memory...
0:00:00.003892
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9763, 'reflen': 10036, 'guess': [9763, 8739, 7715, 6691], 'correct': [5284, 1901, 



loading annotations into memory...
0:00:00.003825
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9850, 'reflen': 10068, 'guess': [9850, 8826, 7802, 6778], 'correct': [5015, 1553, 540, 146]}
ratio: 0.9783472387762238
Bleu_1: 0.498
Bleu_2: 0.293
Bleu_3: 0.180
Bleu_4: 0.105
computing METEOR score...
METEOR: 0.140
computing Rouge score...
ROUGE_L: 0.330
computing CIDEr score...
CIDEr: 0.228
computing SPICE score...
SPICE: 0.092
computing SPIDEr score...
SPIDEr: 0.160
2021-12-01 02:07:19,577 - INFO: eval_greddy SPIDEr: 0.1602
loading annotations into memory...
0:00:00.003852
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9038, 'reflen': 9649, 'guess': [9038, 8014, 6990, 5966], 'correct': [4959, 1752, 672



loading annotations into memory...
0:00:00.003793
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10141, 'reflen': 10270, 'guess': [10141, 9117, 8093, 7069], 'correct': [5105, 1607, 551, 150]}
ratio: 0.9874391431352495
Bleu_1: 0.497
Bleu_2: 0.294
Bleu_3: 0.180
Bleu_4: 0.105
computing METEOR score...
METEOR: 0.143
computing Rouge score...
ROUGE_L: 0.334
computing CIDEr score...
CIDEr: 0.230
computing SPICE score...
SPICE: 0.092
computing SPIDEr score...
SPIDEr: 0.161
2021-12-01 02:21:23,758 - INFO: eval_greddy SPIDEr: 0.1611
loading annotations into memory...
0:00:00.004008
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9138, 'reflen': 9706, 'guess': [9138, 8114, 7090, 6066], 'correct': [4977, 1746, 6



loading annotations into memory...
0:00:00.003846
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9801, 'reflen': 9995, 'guess': [9801, 8777, 7753, 6729], 'correct': [5037, 1649, 556, 136]}
ratio: 0.9805902951474756
Bleu_1: 0.504
Bleu_2: 0.305
Bleu_3: 0.187
Bleu_4: 0.107
computing METEOR score...
METEOR: 0.141
computing Rouge score...
ROUGE_L: 0.342
computing CIDEr score...
CIDEr: 0.237
computing SPICE score...
SPICE: 0.097
computing SPIDEr score...
SPIDEr: 0.167
2021-12-01 02:35:28,075 - INFO: eval_greddy SPIDEr: 0.1671
loading annotations into memory...
0:00:00.003932
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9095, 'reflen': 9666, 'guess': [9095, 8071, 7047, 6023], 'correct': [4931, 1797, 714,



loading annotations into memory...
0:00:00.003797
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9810, 'reflen': 10049, 'guess': [9810, 8786, 7762, 6738], 'correct': [5144, 1790, 659, 187]}
ratio: 0.9762165389590032
Bleu_1: 0.512
Bleu_2: 0.319
Bleu_3: 0.204
Bleu_4: 0.123
computing METEOR score...
METEOR: 0.146
computing Rouge score...
ROUGE_L: 0.351
computing CIDEr score...
CIDEr: 0.255
computing SPICE score...
SPICE: 0.101
computing SPIDEr score...
SPIDEr: 0.178
2021-12-01 02:49:29,534 - INFO: eval_greddy SPIDEr: 0.1783
loading annotations into memory...
0:00:00.003765
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9144, 'reflen': 9692, 'guess': [9144, 8120, 7096, 6072], 'correct': [4911, 1752, 662



loading annotations into memory...
0:00:00.003808
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9865, 'reflen': 10077, 'guess': [9865, 8841, 7817, 6793], 'correct': [5038, 1631, 586, 182]}
ratio: 0.9789619926564475
Bleu_1: 0.500
Bleu_2: 0.300
Bleu_3: 0.188
Bleu_4: 0.115
computing METEOR score...
METEOR: 0.143
computing Rouge score...
ROUGE_L: 0.338
computing CIDEr score...
CIDEr: 0.253
computing SPICE score...
SPICE: 0.097
computing SPIDEr score...
SPIDEr: 0.175
2021-12-01 03:03:30,529 - INFO: eval_greddy SPIDEr: 0.1750
loading annotations into memory...
0:00:00.003963
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9065, 'reflen': 9637, 'guess': [9065, 8041, 7017, 5993], 'correct': [4931, 1780, 719



loading annotations into memory...
0:00:00.003801
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10554, 'reflen': 10548, 'guess': [10554, 9530, 8506, 7482], 'correct': [5406, 1804, 656, 201]}
ratio: 1.0005688282137846
Bleu_1: 0.512
Bleu_2: 0.311
Bleu_3: 0.196
Bleu_4: 0.119
computing METEOR score...
METEOR: 0.151
computing Rouge score...
ROUGE_L: 0.348
computing CIDEr score...
CIDEr: 0.271
computing SPICE score...
SPICE: 0.101
computing SPIDEr score...
SPIDEr: 0.186
2021-12-01 03:17:34,528 - INFO: eval_greddy SPIDEr: 0.1862
loading annotations into memory...
0:00:00.003793
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9407, 'reflen': 9846, 'guess': [9407, 8383, 7359, 6335], 'correct': [5111, 1823, 7



loading annotations into memory...
0:00:00.003526
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10345, 'reflen': 10369, 'guess': [10345, 9321, 8297, 7273], 'correct': [5376, 1739, 630, 191]}
ratio: 0.9976854084288748
Bleu_1: 0.518
Bleu_2: 0.311
Bleu_3: 0.194
Bleu_4: 0.118
computing METEOR score...
METEOR: 0.150
computing Rouge score...
ROUGE_L: 0.349
computing CIDEr score...
CIDEr: 0.273
computing SPICE score...
SPICE: 0.101
computing SPIDEr score...
SPIDEr: 0.187
2021-12-01 03:31:37,326 - INFO: eval_greddy SPIDEr: 0.1866
loading annotations into memory...
0:00:00.003825
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9230, 'reflen': 9730, 'guess': [9230, 8206, 7182, 6158], 'correct': [5030, 1791, 7



loading annotations into memory...
0:00:00.003917
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10034, 'reflen': 10196, 'guess': [10034, 9010, 7986, 6962], 'correct': [5302, 1777, 639, 170]}
ratio: 0.9841114162415668
Bleu_1: 0.520
Bleu_2: 0.318
Bleu_3: 0.200
Bleu_4: 0.118
computing METEOR score...
METEOR: 0.150
computing Rouge score...
ROUGE_L: 0.347
computing CIDEr score...
CIDEr: 0.270
computing SPICE score...
SPICE: 0.101
computing SPIDEr score...
SPIDEr: 0.185
2021-12-01 03:45:38,897 - INFO: eval_greddy SPIDEr: 0.1852
loading annotations into memory...
0:00:00.003796
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9068, 'reflen': 9631, 'guess': [9068, 8044, 7020, 5996], 'correct': [5093, 1840, 7



loading annotations into memory...
0:00:00.003789
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9894, 'reflen': 10142, 'guess': [9894, 8870, 7846, 6822], 'correct': [5324, 1819, 661, 203]}
ratio: 0.9755472293432286
Bleu_1: 0.525
Bleu_2: 0.324
Bleu_3: 0.205
Bleu_4: 0.126
computing METEOR score...
METEOR: 0.154
computing Rouge score...
ROUGE_L: 0.353
computing CIDEr score...
CIDEr: 0.293
computing SPICE score...
SPICE: 0.104
computing SPIDEr score...
SPIDEr: 0.198
2021-12-01 03:59:39,011 - INFO: eval_greddy SPIDEr: 0.1984
loading annotations into memory...
0:00:00.003872
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9105, 'reflen': 9664, 'guess': [9105, 8081, 7057, 6033], 'correct': [5024, 1841, 734



loading annotations into memory...
0:00:00.003908
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9955, 'reflen': 10259, 'guess': [9955, 8931, 7907, 6883], 'correct': [5326, 1817, 676, 201]}
ratio: 0.9703674822106472
Bleu_1: 0.519
Bleu_2: 0.320
Bleu_3: 0.204
Bleu_4: 0.125
computing METEOR score...
METEOR: 0.154
computing Rouge score...
ROUGE_L: 0.353
computing CIDEr score...
CIDEr: 0.289
computing SPICE score...
SPICE: 0.105
computing SPIDEr score...
SPIDEr: 0.197
2021-12-01 04:13:40,603 - INFO: eval_greddy SPIDEr: 0.1970
loading annotations into memory...
0:00:00.003929
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9225, 'reflen': 9733, 'guess': [9225, 8201, 7177, 6153], 'correct': [5097, 1839, 734



loading annotations into memory...
0:00:00.003827
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9646, 'reflen': 9920, 'guess': [9646, 8622, 7598, 6574], 'correct': [5100, 1668, 591, 157]}
ratio: 0.9723790322579664
Bleu_1: 0.514
Bleu_2: 0.311
Bleu_3: 0.194
Bleu_4: 0.114
computing METEOR score...
METEOR: 0.147
computing Rouge score...
ROUGE_L: 0.342
computing CIDEr score...
CIDEr: 0.270
computing SPICE score...
SPICE: 0.099
computing SPIDEr score...
SPIDEr: 0.185
2021-12-01 04:27:41,480 - INFO: eval_greddy SPIDEr: 0.1847
loading annotations into memory...
0:00:00.004135
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9080, 'reflen': 9638, 'guess': [9080, 8056, 7032, 6008], 'correct': [5055, 1835, 714,



loading annotations into memory...
0:00:00.003837
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10091, 'reflen': 10189, 'guess': [10091, 9067, 8043, 7019], 'correct': [5184, 1706, 589, 168]}
ratio: 0.9903817842770644
Bleu_1: 0.509
Bleu_2: 0.308
Bleu_3: 0.190
Bleu_4: 0.113
computing METEOR score...
METEOR: 0.148
computing Rouge score...
ROUGE_L: 0.343
computing CIDEr score...
CIDEr: 0.277
computing SPICE score...
SPICE: 0.098
computing SPIDEr score...
SPIDEr: 0.187
2021-12-01 04:41:42,491 - INFO: eval_greddy SPIDEr: 0.1872
loading annotations into memory...
0:00:00.003953
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9277, 'reflen': 9748, 'guess': [9277, 8253, 7229, 6205], 'correct': [5170, 1855, 7



loading annotations into memory...
0:00:00.003821
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9785, 'reflen': 10038, 'guess': [9785, 8761, 7737, 6713], 'correct': [5231, 1746, 642, 191]}
ratio: 0.9747957760509091
Bleu_1: 0.521
Bleu_2: 0.318
Bleu_3: 0.201
Bleu_4: 0.123
computing METEOR score...
METEOR: 0.151
computing Rouge score...
ROUGE_L: 0.348
computing CIDEr score...
CIDEr: 0.291
computing SPICE score...
SPICE: 0.101
computing SPIDEr score...
SPIDEr: 0.196
2021-12-01 04:55:45,377 - INFO: eval_greddy SPIDEr: 0.1959
loading annotations into memory...
0:00:00.003822
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9064, 'reflen': 9631, 'guess': [9064, 8040, 7016, 5992], 'correct': [5179, 1894, 742



loading annotations into memory...
0:00:00.003840
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9883, 'reflen': 10092, 'guess': [9883, 8859, 7835, 6811], 'correct': [5135, 1638, 592, 173]}
ratio: 0.9792905271501209
Bleu_1: 0.509
Bleu_2: 0.303
Bleu_3: 0.190
Bleu_4: 0.114
computing METEOR score...
METEOR: 0.148
computing Rouge score...
ROUGE_L: 0.342
computing CIDEr score...
CIDEr: 0.268
computing SPICE score...
SPICE: 0.100
computing SPIDEr score...
SPIDEr: 0.184
2021-12-01 05:09:47,303 - INFO: eval_greddy SPIDEr: 0.1840
loading annotations into memory...
0:00:00.003920
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9114, 'reflen': 9651, 'guess': [9114, 8090, 7066, 6042], 'correct': [5057, 1792, 693



loading annotations into memory...
0:00:00.003853
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9852, 'reflen': 10105, 'guess': [9852, 8828, 7804, 6780], 'correct': [5216, 1730, 608, 184]}
ratio: 0.9749628896584883
Bleu_1: 0.516
Bleu_2: 0.314
Bleu_3: 0.196
Bleu_4: 0.119
computing METEOR score...
METEOR: 0.151
computing Rouge score...
ROUGE_L: 0.348
computing CIDEr score...
CIDEr: 0.285
computing SPICE score...
SPICE: 0.103
computing SPIDEr score...
SPIDEr: 0.194
2021-12-01 05:23:48,427 - INFO: eval_greddy SPIDEr: 0.1938
loading annotations into memory...
0:00:00.004016
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9036, 'reflen': 9588, 'guess': [9036, 8012, 6988, 5964], 'correct': [5014, 1789, 697



loading annotations into memory...
0:00:00.003908
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10035, 'reflen': 10240, 'guess': [10035, 9011, 7987, 6963], 'correct': [5485, 1865, 683, 211]}
ratio: 0.9799804687499043
Bleu_1: 0.536
Bleu_2: 0.330
Bleu_3: 0.209
Bleu_4: 0.128
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.358
computing CIDEr score...
CIDEr: 0.308
computing SPICE score...
SPICE: 0.104
computing SPIDEr score...
SPIDEr: 0.206
2021-12-01 05:37:50,492 - INFO: eval_greddy SPIDEr: 0.2061
loading annotations into memory...
0:00:00.003840
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9114, 'reflen': 9663, 'guess': [9114, 8090, 7066, 6042], 'correct': [5116, 1839, 7



loading annotations into memory...
0:00:00.003978
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10054, 'reflen': 10267, 'guess': [10054, 9030, 8006, 6982], 'correct': [5439, 1882, 697, 217]}
ratio: 0.9792539203271666
Bleu_1: 0.530
Bleu_2: 0.329
Bleu_3: 0.210
Bleu_4: 0.129
computing METEOR score...
METEOR: 0.159
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.315
computing SPICE score...
SPICE: 0.106
computing SPIDEr score...
SPIDEr: 0.211
2021-12-01 05:51:53,775 - INFO: eval_greddy SPIDEr: 0.2108
loading annotations into memory...
0:00:00.003844
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9341, 'reflen': 9803, 'guess': [9341, 8317, 7293, 6269], 'correct': [5230, 1904, 7



loading annotations into memory...
0:00:00.003822
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9916, 'reflen': 10130, 'guess': [9916, 8892, 7868, 6844], 'correct': [5369, 1867, 707, 214]}
ratio: 0.9788746298123416
Bleu_1: 0.530
Bleu_2: 0.330
Bleu_3: 0.212
Bleu_4: 0.131
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.316
computing SPICE score...
SPICE: 0.103
computing SPIDEr score...
SPIDEr: 0.210
2021-12-01 06:05:57,172 - INFO: eval_greddy SPIDEr: 0.2097
loading annotations into memory...
0:00:00.003981
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9423, 'reflen': 9807, 'guess': [9423, 8399, 7375, 6351], 'correct': [5239, 1941, 793



loading annotations into memory...
0:00:00.004037
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9587, 'reflen': 9940, 'guess': [9587, 8563, 7539, 6515], 'correct': [5258, 1843, 686, 220]}
ratio: 0.964486921529078
Bleu_1: 0.529
Bleu_2: 0.331
Bleu_3: 0.213
Bleu_4: 0.133
computing METEOR score...
METEOR: 0.155
computing Rouge score...
ROUGE_L: 0.358
computing CIDEr score...
CIDEr: 0.315
computing SPICE score...
SPICE: 0.103
computing SPIDEr score...
SPIDEr: 0.209
2021-12-01 06:20:13,667 - INFO: eval_greddy SPIDEr: 0.2090
loading annotations into memory...
0:00:00.004455
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8923, 'reflen': 9526, 'guess': [8923, 7899, 6875, 5851], 'correct': [5072, 1840, 727, 



loading annotations into memory...
0:00:00.003964
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9836, 'reflen': 10041, 'guess': [9836, 8812, 7788, 6764], 'correct': [5364, 1833, 677, 204]}
ratio: 0.9795837068020138
Bleu_1: 0.534
Bleu_2: 0.330
Bleu_3: 0.210
Bleu_4: 0.129
computing METEOR score...
METEOR: 0.155
computing Rouge score...
ROUGE_L: 0.360
computing CIDEr score...
CIDEr: 0.301
computing SPICE score...
SPICE: 0.100
computing SPIDEr score...
SPIDEr: 0.200
2021-12-01 06:34:28,659 - INFO: eval_greddy SPIDEr: 0.2004
loading annotations into memory...
0:00:00.004177
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9351, 'reflen': 9783, 'guess': [9351, 8327, 7303, 6279], 'correct': [5265, 1940, 769



loading annotations into memory...
0:00:00.004025
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9992, 'reflen': 10149, 'guess': [9992, 8968, 7944, 6920], 'correct': [5400, 1863, 698, 218]}
ratio: 0.9845304956152345
Bleu_1: 0.532
Bleu_2: 0.330
Bleu_3: 0.211
Bleu_4: 0.131
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.316
computing SPICE score...
SPICE: 0.106
computing SPIDEr score...
SPIDEr: 0.211
2021-12-01 06:48:31,274 - INFO: eval_greddy SPIDEr: 0.2108
loading annotations into memory...
0:00:00.003860
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9244, 'reflen': 9685, 'guess': [9244, 8220, 7196, 6172], 'correct': [5226, 1843, 705



loading annotations into memory...
0:00:00.003829
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10078, 'reflen': 10210, 'guess': [10078, 9054, 8030, 7006], 'correct': [5320, 1800, 663, 214]}
ratio: 0.9870714985307554
Bleu_1: 0.521
Bleu_2: 0.320
Bleu_3: 0.203
Bleu_4: 0.126
computing METEOR score...
METEOR: 0.155
computing Rouge score...
ROUGE_L: 0.353
computing CIDEr score...
CIDEr: 0.298
computing SPICE score...
SPICE: 0.103
computing SPIDEr score...
SPIDEr: 0.200
2021-12-01 07:02:33,886 - INFO: eval_greddy SPIDEr: 0.2002
loading annotations into memory...
0:00:00.003877
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9241, 'reflen': 9685, 'guess': [9241, 8217, 7193, 6169], 'correct': [5192, 1877, 7



loading annotations into memory...
0:00:00.003916
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9708, 'reflen': 10000, 'guess': [9708, 8684, 7660, 6636], 'correct': [5278, 1821, 688, 213]}
ratio: 0.9707999999999029
Bleu_1: 0.528
Bleu_2: 0.328
Bleu_3: 0.211
Bleu_4: 0.131
computing METEOR score...
METEOR: 0.156
computing Rouge score...
ROUGE_L: 0.354
computing CIDEr score...
CIDEr: 0.304
computing SPICE score...
SPICE: 0.104
computing SPIDEr score...
SPIDEr: 0.204
2021-12-01 07:16:37,543 - INFO: eval_greddy SPIDEr: 0.2040
loading annotations into memory...
0:00:00.003911
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9176, 'reflen': 9651, 'guess': [9176, 8152, 7128, 6104], 'correct': [5137, 1834, 721



loading annotations into memory...
0:00:00.004012
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10052, 'reflen': 10253, 'guess': [10052, 9028, 8004, 6980], 'correct': [5393, 1822, 677, 218]}
ratio: 0.9803959816638076
Bleu_1: 0.526
Bleu_2: 0.323
Bleu_3: 0.205
Bleu_4: 0.127
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.351
computing CIDEr score...
CIDEr: 0.317
computing SPICE score...
SPICE: 0.105
computing SPIDEr score...
SPIDEr: 0.211
2021-12-01 07:30:40,396 - INFO: eval_greddy SPIDEr: 0.2106
loading annotations into memory...
0:00:00.003898
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9260, 'reflen': 9693, 'guess': [9260, 8236, 7212, 6188], 'correct': [5188, 1831, 6



loading annotations into memory...
0:00:00.003846
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10044, 'reflen': 10177, 'guess': [10044, 9020, 7996, 6972], 'correct': [5425, 1884, 700, 228]}
ratio: 0.9869313157118024
Bleu_1: 0.533
Bleu_2: 0.331
Bleu_3: 0.212
Bleu_4: 0.132
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.359
computing CIDEr score...
CIDEr: 0.312
computing SPICE score...
SPICE: 0.104
computing SPIDEr score...
SPIDEr: 0.208
2021-12-01 07:44:42,909 - INFO: eval_greddy SPIDEr: 0.2080
loading annotations into memory...
0:00:00.003752
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9347, 'reflen': 9720, 'guess': [9347, 8323, 7299, 6275], 'correct': [5297, 1925, 7



loading annotations into memory...
0:00:00.003768
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9970, 'reflen': 10132, 'guess': [9970, 8946, 7922, 6898], 'correct': [5276, 1760, 661, 217]}
ratio: 0.9840110540859668
Bleu_1: 0.521
Bleu_2: 0.317
Bleu_3: 0.202
Bleu_4: 0.127
computing METEOR score...
METEOR: 0.153
computing Rouge score...
ROUGE_L: 0.353
computing CIDEr score...
CIDEr: 0.301
computing SPICE score...
SPICE: 0.101
computing SPIDEr score...
SPIDEr: 0.201
2021-12-01 07:58:44,678 - INFO: eval_greddy SPIDEr: 0.2013
loading annotations into memory...
0:00:00.003799
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9113, 'reflen': 9585, 'guess': [9113, 8089, 7065, 6041], 'correct': [5086, 1794, 715



loading annotations into memory...
0:00:00.003905
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10172, 'reflen': 10235, 'guess': [10172, 9148, 8124, 7100], 'correct': [5366, 1801, 653, 200]}
ratio: 0.9938446507082566
Bleu_1: 0.524
Bleu_2: 0.320
Bleu_3: 0.202
Bleu_4: 0.123
computing METEOR score...
METEOR: 0.154
computing Rouge score...
ROUGE_L: 0.353
computing CIDEr score...
CIDEr: 0.304
computing SPICE score...
SPICE: 0.102
computing SPIDEr score...
SPIDEr: 0.203
2021-12-01 08:12:50,049 - INFO: eval_greddy SPIDEr: 0.2027
loading annotations into memory...
0:00:00.003958
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9677, 'reflen': 9955, 'guess': [9677, 8653, 7629, 6605], 'correct': [5444, 1962, 8



loading annotations into memory...
0:00:00.003887
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10009, 'reflen': 10171, 'guess': [10009, 8985, 7961, 6937], 'correct': [5507, 1913, 724, 242]}
ratio: 0.984072362599451
Bleu_1: 0.541
Bleu_2: 0.337
Bleu_3: 0.217
Bleu_4: 0.137
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.361
computing CIDEr score...
CIDEr: 0.329
computing SPICE score...
SPICE: 0.108
computing SPIDEr score...
SPIDEr: 0.218
2021-12-01 08:26:52,286 - INFO: eval_greddy SPIDEr: 0.2184
loading annotations into memory...
0:00:00.004065
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9300, 'reflen': 9749, 'guess': [9300, 8276, 7252, 6228], 'correct': [5299, 1949, 79



loading annotations into memory...
0:00:00.003938
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10162, 'reflen': 10249, 'guess': [10162, 9138, 8114, 7090], 'correct': [5277, 1760, 637, 196]}
ratio: 0.9915113669625337
Bleu_1: 0.515
Bleu_2: 0.314
Bleu_3: 0.197
Bleu_4: 0.120
computing METEOR score...
METEOR: 0.153
computing Rouge score...
ROUGE_L: 0.348
computing CIDEr score...
CIDEr: 0.294
computing SPICE score...
SPICE: 0.104
computing SPIDEr score...
SPIDEr: 0.199
2021-12-01 08:40:55,260 - INFO: eval_greddy SPIDEr: 0.1992
loading annotations into memory...
0:00:00.003954
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9437, 'reflen': 9819, 'guess': [9437, 8413, 7389, 6365], 'correct': [5287, 1888, 7



loading annotations into memory...
0:00:00.003752
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10119, 'reflen': 10280, 'guess': [10119, 9095, 8071, 7047], 'correct': [5287, 1799, 653, 221]}
ratio: 0.9843385214006825
Bleu_1: 0.514
Bleu_2: 0.316
Bleu_3: 0.200
Bleu_4: 0.125
computing METEOR score...
METEOR: 0.155
computing Rouge score...
ROUGE_L: 0.348
computing CIDEr score...
CIDEr: 0.300
computing SPICE score...
SPICE: 0.102
computing SPIDEr score...
SPIDEr: 0.201
2021-12-01 08:55:00,493 - INFO: eval_greddy SPIDEr: 0.2012
loading annotations into memory...
0:00:00.003866
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9260, 'reflen': 9736, 'guess': [9260, 8236, 7212, 6188], 'correct': [5104, 1828, 7



loading annotations into memory...
0:00:00.003838
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9939, 'reflen': 10128, 'guess': [9939, 8915, 7891, 6867], 'correct': [5398, 1839, 679, 212]}
ratio: 0.9813388625591448
Bleu_1: 0.533
Bleu_2: 0.328
Bleu_3: 0.209
Bleu_4: 0.129
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.359
computing CIDEr score...
CIDEr: 0.315
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.211
2021-12-01 09:09:03,903 - INFO: eval_greddy SPIDEr: 0.2112
loading annotations into memory...
0:00:00.003904
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9329, 'reflen': 9735, 'guess': [9329, 8305, 7281, 6257], 'correct': [5334, 1941, 776



loading annotations into memory...
0:00:00.003712
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9998, 'reflen': 10188, 'guess': [9998, 8974, 7950, 6926], 'correct': [5325, 1779, 649, 205]}
ratio: 0.9813506085589928
Bleu_1: 0.523
Bleu_2: 0.319
Bleu_3: 0.201
Bleu_4: 0.124
computing METEOR score...
METEOR: 0.156
computing Rouge score...
ROUGE_L: 0.350
computing CIDEr score...
CIDEr: 0.310
computing SPICE score...
SPICE: 0.103
computing SPIDEr score...
SPIDEr: 0.207
2021-12-01 09:23:04,717 - INFO: eval_greddy SPIDEr: 0.2066
loading annotations into memory...
0:00:00.004068
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9288, 'reflen': 9739, 'guess': [9288, 8264, 7240, 6216], 'correct': [5240, 1888, 751



loading annotations into memory...
0:00:00.004118
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10216, 'reflen': 10362, 'guess': [10216, 9192, 8168, 7144], 'correct': [5551, 1933, 746, 252]}
ratio: 0.985910055973655
Bleu_1: 0.536
Bleu_2: 0.333
Bleu_3: 0.215
Bleu_4: 0.137
computing METEOR score...
METEOR: 0.159
computing Rouge score...
ROUGE_L: 0.363
computing CIDEr score...
CIDEr: 0.329
computing SPICE score...
SPICE: 0.108
computing SPIDEr score...
SPIDEr: 0.219
2021-12-01 09:37:05,759 - INFO: eval_greddy SPIDEr: 0.2185
loading annotations into memory...
0:00:00.003800
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9392, 'reflen': 9798, 'guess': [9392, 8368, 7344, 6320], 'correct': [5359, 1973, 78



loading annotations into memory...
0:00:00.003805
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10011, 'reflen': 10159, 'guess': [10011, 8987, 7963, 6939], 'correct': [5346, 1841, 664, 191]}
ratio: 0.9854316369720458
Bleu_1: 0.526
Bleu_2: 0.326
Bleu_3: 0.206
Bleu_4: 0.124
computing METEOR score...
METEOR: 0.156
computing Rouge score...
ROUGE_L: 0.350
computing CIDEr score...
CIDEr: 0.322
computing SPICE score...
SPICE: 0.106
computing SPIDEr score...
SPIDEr: 0.214
2021-12-01 09:51:10,421 - INFO: eval_greddy SPIDEr: 0.2143
loading annotations into memory...
0:00:00.003767
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9271, 'reflen': 9714, 'guess': [9271, 8247, 7223, 6199], 'correct': [5205, 1882, 7



loading annotations into memory...
0:00:00.003820
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9818, 'reflen': 10052, 'guess': [9818, 8794, 7770, 6746], 'correct': [5305, 1831, 671, 219]}
ratio: 0.9767210505371093
Bleu_1: 0.528
Bleu_2: 0.328
Bleu_3: 0.208
Bleu_4: 0.130
computing METEOR score...
METEOR: 0.156
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.332
computing SPICE score...
SPICE: 0.102
computing SPIDEr score...
SPIDEr: 0.217
2021-12-01 10:05:12,518 - INFO: eval_greddy SPIDEr: 0.2169
loading annotations into memory...
0:00:00.003883
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9301, 'reflen': 9687, 'guess': [9301, 8277, 7253, 6229], 'correct': [5251, 1908, 754



loading annotations into memory...
0:00:00.003927
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10250, 'reflen': 10289, 'guess': [10250, 9226, 8202, 7178], 'correct': [5388, 1784, 652, 202]}
ratio: 0.9962095441732922
Bleu_1: 0.524
Bleu_2: 0.318
Bleu_3: 0.200
Bleu_4: 0.122
computing METEOR score...
METEOR: 0.155
computing Rouge score...
ROUGE_L: 0.349
computing CIDEr score...
CIDEr: 0.317
computing SPICE score...
SPICE: 0.102
computing SPIDEr score...
SPIDEr: 0.209
2021-12-01 10:19:15,202 - INFO: eval_greddy SPIDEr: 0.2092
loading annotations into memory...
0:00:00.003768
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9465, 'reflen': 9842, 'guess': [9465, 8441, 7417, 6393], 'correct': [5244, 1862, 7



loading annotations into memory...
0:00:00.003848
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10196, 'reflen': 10307, 'guess': [10196, 9172, 8148, 7124], 'correct': [5415, 1827, 665, 198]}
ratio: 0.9892306199669166
Bleu_1: 0.525
Bleu_2: 0.322
Bleu_3: 0.203
Bleu_4: 0.123
computing METEOR score...
METEOR: 0.158
computing Rouge score...
ROUGE_L: 0.354
computing CIDEr score...
CIDEr: 0.320
computing SPICE score...
SPICE: 0.106
computing SPIDEr score...
SPIDEr: 0.213
2021-12-01 10:33:19,083 - INFO: eval_greddy SPIDEr: 0.2129
loading annotations into memory...
0:00:00.003910
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9326, 'reflen': 9755, 'guess': [9326, 8302, 7278, 6254], 'correct': [5226, 1843, 7



loading annotations into memory...
0:00:00.003966
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9996, 'reflen': 10208, 'guess': [9996, 8972, 7948, 6924], 'correct': [5336, 1831, 676, 223]}
ratio: 0.9792319749215341
Bleu_1: 0.523
Bleu_2: 0.323
Bleu_3: 0.206
Bleu_4: 0.129
computing METEOR score...
METEOR: 0.156
computing Rouge score...
ROUGE_L: 0.353
computing CIDEr score...
CIDEr: 0.328
computing SPICE score...
SPICE: 0.105
computing SPIDEr score...
SPIDEr: 0.216
2021-12-01 10:47:20,039 - INFO: eval_greddy SPIDEr: 0.2165
loading annotations into memory...
0:00:00.004000
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9371, 'reflen': 9762, 'guess': [9371, 8347, 7323, 6299], 'correct': [5275, 1924, 757



loading annotations into memory...
0:00:00.003875
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9930, 'reflen': 10125, 'guess': [9930, 8906, 7882, 6858], 'correct': [5370, 1819, 672, 215]}
ratio: 0.9807407407406439
Bleu_1: 0.530
Bleu_2: 0.326
Bleu_3: 0.207
Bleu_4: 0.129
computing METEOR score...
METEOR: 0.156
computing Rouge score...
ROUGE_L: 0.353
computing CIDEr score...
CIDEr: 0.327
computing SPICE score...
SPICE: 0.104
computing SPIDEr score...
SPIDEr: 0.216
2021-12-01 11:01:21,204 - INFO: eval_greddy SPIDEr: 0.2155
loading annotations into memory...
0:00:00.003875
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9285, 'reflen': 9702, 'guess': [9285, 8261, 7237, 6213], 'correct': [5310, 1923, 771



loading annotations into memory...
0:00:00.003837
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10157, 'reflen': 10272, 'guess': [10157, 9133, 8109, 7085], 'correct': [5559, 1950, 739, 251]}
ratio: 0.9888045171338601
Bleu_1: 0.541
Bleu_2: 0.338
Bleu_3: 0.218
Bleu_4: 0.138
computing METEOR score...
METEOR: 0.161
computing Rouge score...
ROUGE_L: 0.363
computing CIDEr score...
CIDEr: 0.336
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.221
2021-12-01 11:15:22,418 - INFO: eval_greddy SPIDEr: 0.2213
loading annotations into memory...
0:00:00.003903
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9317, 'reflen': 9751, 'guess': [9317, 8293, 7269, 6245], 'correct': [5427, 2032, 8



loading annotations into memory...
0:00:00.003882
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9969, 'reflen': 10184, 'guess': [9969, 8945, 7921, 6897], 'correct': [5479, 1933, 754, 257]}
ratio: 0.9788884524743736
Bleu_1: 0.538
Bleu_2: 0.337
Bleu_3: 0.220
Bleu_4: 0.140
computing METEOR score...
METEOR: 0.162
computing Rouge score...
ROUGE_L: 0.360
computing CIDEr score...
CIDEr: 0.351
computing SPICE score...
SPICE: 0.110
computing SPIDEr score...
SPIDEr: 0.231
2021-12-01 11:29:23,531 - INFO: eval_greddy SPIDEr: 0.2307
loading annotations into memory...
0:00:00.004050
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9293, 'reflen': 9727, 'guess': [9293, 8269, 7245, 6221], 'correct': [5314, 1923, 773



loading annotations into memory...
0:00:00.003779
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10096, 'reflen': 10254, 'guess': [10096, 9072, 8048, 7024], 'correct': [5447, 1874, 705, 229]}
ratio: 0.9845913789739629
Bleu_1: 0.531
Bleu_2: 0.329
Bleu_3: 0.210
Bleu_4: 0.131
computing METEOR score...
METEOR: 0.156
computing Rouge score...
ROUGE_L: 0.351
computing CIDEr score...
CIDEr: 0.327
computing SPICE score...
SPICE: 0.105
computing SPIDEr score...
SPIDEr: 0.216
2021-12-01 11:43:23,741 - INFO: eval_greddy SPIDEr: 0.2161
loading annotations into memory...
0:00:00.004186
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9339, 'reflen': 9752, 'guess': [9339, 8315, 7291, 6267], 'correct': [5296, 1902, 7



loading annotations into memory...
0:00:00.003976
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10251, 'reflen': 10326, 'guess': [10251, 9227, 8203, 7179], 'correct': [5459, 1829, 668, 215]}
ratio: 0.992736780941217
Bleu_1: 0.529
Bleu_2: 0.323
Bleu_3: 0.203
Bleu_4: 0.126
computing METEOR score...
METEOR: 0.154
computing Rouge score...
ROUGE_L: 0.352
computing CIDEr score...
CIDEr: 0.314
computing SPICE score...
SPICE: 0.104
computing SPIDEr score...
SPIDEr: 0.209
2021-12-01 11:57:26,090 - INFO: eval_greddy SPIDEr: 0.2094
loading annotations into memory...
0:00:00.003891
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9311, 'reflen': 9719, 'guess': [9311, 8287, 7263, 6239], 'correct': [5199, 1835, 73

In [56]:
#mixup
epoch = 1
if hp.mode == 'train':
    while epoch < hp.training_epochs + 1:
        epoch_start_time = time.time()
        train()
        torch.save(model.state_dict(), '{log_dir}/{num_epoch}.pt'.format(log_dir=log_dir, num_epoch=epoch))
        scheduler.step(epoch)
        eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=2)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=3)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=4)
        epoch += 1

ValueError: not enough values to unpack (expected 4, got 3)

In [60]:
for src, tgt, tgt_len in training_data:
    src = src.to(device)
    tgt = tgt.to(device)

KeyboardInterrupt: 

In [55]:
hp.training_epochs

30

epoch=37 eval_beam_3 SPIDEr: 0.2344 # 2개 layer 만 trainable -06/9  
 SPIDEr: # 5개 layer 만 trainable -06/10 0.2252
별 차이 없음 ;;;;;


model score check (eval)

In [16]:
#if hp.mode == 'eval':
# Evaluation model score
model.load_state_dict(torch.load("./models/base/48.pt"))
eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)
eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                       beam_size=2)
eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                       beam_size=3)
eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                       beam_size=4)

FileNotFoundError: [Errno 2] No such file or directory: './models/base/48.pt'

In [18]:
model.load_state_dict(torch.load("./models/base/49.pt"))

<All keys matched successfully>

In [None]:
class Mixup(object):
    def __init__(self, mixup_alpha, random_seed=1234):
        """Mixup coefficient generator.
        """
        self.mixup_alpha = mixup_alpha
        self.random_state = np.random.RandomState(random_seed)

    def get_lambda(self, batch_size):
        """Get mixup random coefficients.
        Args:
          batch_size: int
        Returns:
          mixup_lambdas: (batch_size,)
        """
        mixup_lambdas = []
        for n in range(0, batch_size, 2):
            lam = self.random_state.beta(self.mixup_alpha, self.mixup_alpha, 1)[0]
            mixup_lambdas.append(lam)
            mixup_lambdas.append(1. - lam)

        return np.array(mixup_lambdas)


In [None]:
def do_mixup(x, mixup_lambda):
    """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes 
    (1, 3, 5, ...).
    Args:
      x: (batch_size * 2, ...)
      mixup_lambda: (batch_size * 2,)
    Returns:
      out: (batch_size, ...)
    """
    out = (x[0 :: 2].transpose(0, -1) * mixup_lambda[0 :: 2] + \
        x[1 :: 2].transpose(0, -1) * mixup_lambda[1 :: 2]).transpose(0, -1)
    return out

In [1]:
import numpy as np
from librosa.feature import melspectrogram
from librosa.feature.inverse import mel_to_audio, mel_to_stft

__author__ = 'Konstantinos Drossos -- Tampere University, Nikita Kuzmin -- Lomonosov Moscow State University'
__docformat__ = 'reStructuredText'
__all__ = ['feature_extraction']


def feature_extraction(audio_data: np.ndarray,
                       sr: int,
                       nb_fft: int,
                       hop_size: int,
                       nb_mels: int,
                       f_min: float,
                       f_max: float,
                       htk: bool,
                       power: float,
                       norm: bool,
                       window_function: str,
                       center: bool)\
        -> (np.ndarray, np.float):
    """Feature extraction function.
    :param audio_data: Audio signal.
    :type audio_data: numpy.ndarray
    :param sr: Sampling frequency.
    :type sr: int
    :param nb_fft: Amount of FFT points.
    :type nb_fft: int
    :param hop_size: Hop size in samples.
    :type hop_size: int
    :param nb_mels: Amount of MEL bands.
    :type nb_mels: int
    :param f_min: Minimum frequency in Hertz for MEL band calculation.
    :type f_min: float
    :param f_max: Maximum frequency in Hertz for MEL band calculation.
    :type f_max: float|None
    :param htk: Use the HTK Toolbox formula instead of Auditory toolkit.
    :type htk: bool
    :param power: Power of the magnitude.
    :type power: float
    :param norm: Area normalization of MEL filters.
    :type norm: bool
    :param window_function: Window function.
    :type window_function: str
    :param center: Center the frame for FFT.
    :type center: bool
    :return: Log mel-bands energies of shape=(t, nb_mels)
    :rtype: numpy.ndarray, numpy.float
    """
    y = audio_data/abs(audio_data).max()
    mel_bands = melspectrogram(
        y=y, sr=sr, n_fft=nb_fft, hop_length=hop_size, win_length=nb_fft,
        window=window_function, center=center, power=power, n_mels=nb_mels,
        fmin=f_min, fmax=f_max, htk=htk, norm=norm).T

    return np.log(mel_bands + np.finfo(float).eps)


def from_mel_to_audio(mel_data: np.ndarray,
                       sr: int,
                       nb_fft: int,
                       hop_size: int,
                       nb_mels: int,
                       f_min: float,
                       f_max: float,
                       htk: bool,
                       power: float,
                       norm: bool,
                       window_function: str,
                       center: bool)\
        -> (np.ndarray, np.float):
    """Feature extraction inverse function.
    :param audio_data: Audio signal.
    :type audio_data: numpy.ndarray
    :param sr: Sampling frequency.
    :type sr: int
    :param nb_fft: Amount of FFT points.
    :type nb_fft: int
    :param hop_size: Hop size in samples.
    :type hop_size: int
    :param nb_mels: Amount of MEL bands.
    :type nb_mels: int
    :param f_min: Minimum frequency in Hertz for MEL band calculation.
    :type f_min: float
    :param f_max: Maximum frequency in Hertz for MEL band calculation.
    :type f_max: float|None
    :param htk: Use the HTK Toolbox formula instead of Auditory toolkit.
    :type htk: bool
    :param power: Power of the magnitude.
    :type power: float
    :param norm: Area normalization of MEL filters.
    :type norm: bool
    :param window_function: Window function.
    :type window_function: str
    :param center: Center the frame for FFT.
    :type center: bool
    :return: audio data
    :rtype: numpy.ndarray
    """

    y = np.exp(mel_data) - np.finfo(float).eps
    audio_data = mel_to_audio(
        M=y.T, sr=sr, n_fft=nb_fft, hop_length=hop_size, win_length=nb_fft,
        window=window_function, center=center, power=power,
        fmin=f_min, fmax=f_max, htk=htk, norm=norm)

    return audio_data


def from_mel_to_stft(mel_data: np.ndarray,
                       sr: int,
                       nb_fft: int,
                       hop_size: int,
                       nb_mels: int,
                       f_min: float,
                       f_max: float,
                       htk: bool,
                       power: float,
                       norm: bool,
                       window_function: str,
                       center: bool)\
        -> (np.ndarray, np.float):
    """From logmelspectrogram to stft.
    :param audio_data: Audio signal.
    :type audio_data: numpy.ndarray
    :param sr: Sampling frequency.
    :type sr: int
    :param nb_fft: Amount of FFT points.
    :type nb_fft: int
    :param hop_size: Hop size in samples.
    :type hop_size: int
    :param nb_mels: Amount of MEL bands.
    :type nb_mels: int
    :param f_min: Minimum frequency in Hertz for MEL band calculation.
    :type f_min: float
    :param f_max: Maximum frequency in Hertz for MEL band calculation.
    :type f_max: float|None
    :param htk: Use the HTK Toolbox formula instead of Auditory toolkit.
    :type htk: bool
    :param power: Power of the magnitude.
    :type power: float
    :param norm: Area normalization of MEL filters.
    :type norm: bool
    :param window_function: Window function.
    :type window_function: str
    :param center: Center the frame for FFT.
    :type center: bool
    :return: audio data
    :rtype: numpy.ndarray
    """

    y = np.exp(mel_data) - np.finfo(float).eps
    stft = mel_to_stft(
        M=y.T, sr=sr, n_fft=nb_fft, hop_length=hop_size, win_length=nb_fft,
        window=window_function, center=center, power=power,
        fmin=f_min, fmax=f_max, htk=htk, norm=norm)

    return stft

In [2]:
import numpy as np
import random
#from tools.features_log_mel_bands import feature_extraction, from_mel_to_audio, from_mel_to_stft
from pathlib import Path
import pysndfx
import gc

import copy

#from tools.file_io import load_audio_file
import torch


__author__ = 'Nikita Kuzmin -- Lomonosov Moscow State University'

class MixUp:

    def __init__(self, p, settings_features, simple_concat_captions=True,
                 sample_audio=False):

        self.p = p
        self.sample_audio = sample_audio
        self.settings_features = settings_features
        self.simple_concat_captions = simple_concat_captions

    def from_mel(self, mel):
        return 700 * (10 ** (mel / 2595.0) - 1)

    def to_mel(self, hertz):
        return 2595.0 * np.log10(1 + hertz / 700.0)

    def mix_audio(self, first_audio, second_audio):

        a = np.random.uniform(0.4, 0.6)

        shorter, longer = first_audio, second_audio

        if shorter.shape[0] == longer.shape[0]:
            if self.sample_audio:
                return (longer + shorter) / 2.0
            else:
                longer = from_mel_to_audio(longer, **self.settings_features['process']) * a
                shorter = from_mel_to_audio(shorter,
                                            **self.settings_features['process'])
                return feature_extraction((longer + shorter) / 2, **self.settings_features['process'])

        if first_audio.shape[0] > second_audio.shape[0]:
            shorter, longer = longer, shorter


        if self.sample_audio:
            start = random.randint(0, longer.shape[0] - 1 - shorter.shape[0])
            end = start + shorter.shape[0]
            longer *= a
            longer[start:end] += shorter * (1 - a)
        else:
            longer = from_mel_to_audio(longer, **self.settings_features['process']) * a
            shorter = from_mel_to_audio(shorter,
                                        **self.settings_features['process'])
            start = random.randint(0, longer.shape[0] - 1 - shorter.shape[0])
            end = start + shorter.shape[0]
            longer[start:end] += shorter * (1 - a)
            longer = feature_extraction(longer,
                                        **self.settings_features['process'])

        return longer

    def mix_labels(self, first_labels, second_labels):
        if self.simple_concat_captions:
            return np.hstack([first_labels[:-1], second_labels[1:]])
        else:

            first_token = first_labels[0]
            last_token = first_labels[-1]
            first_labels = first_labels[1:-1]
            second_labels = second_labels[1:-1]
            res = np.empty((first_labels.size + second_labels.size,),
                           dtype=first_labels.dtype)
            min_size = min(first_labels.size, second_labels.size)
            res[0:2*min_size:2] = first_labels[:min_size]
            res[1:2*min_size:2] = second_labels[:min_size]
            if first_labels.size > second_labels.size:
                res[min_size * 2:] = first_labels[min_size:]
            elif second_labels.size > first_labels.size:
                res[min_size*2:] = second_labels[min_size:]
            res = np.concatenate(([first_token], res))
            res = np.concatenate((res, [last_token]))
            return res

    def mix_audio_and_labels(self,
                             first_audio, second_audio,
                             first_labels, second_labels):
        mixed_audio = self.mix_audio(first_audio, second_audio)
        mixed_labels = self.mix_labels(first_labels, second_labels)

        return mixed_audio, mixed_labels

    def __call__(self, dataset, inputs):
        resulted_audio, resulted_labels, filename = inputs[0], inputs[1], inputs[2]
        if np.random.uniform() <= self.p:
            random_sample = dataset.random_sample(sample_audio=self.sample_audio)
            resulted_audio, resulted_labels = self.mix_audio_and_labels(
                resulted_audio, random_sample[0],
                resulted_labels, random_sample[1]
            )
        return resulted_audio, resulted_labels


class AudioAugmentation:
    # https://github.com/ex4sperans/freesound-classification
    def __init__(self, p):

        self.p = p
        self.effects_chain = (
            pysndfx.AudioEffectsChain()
                .reverb(
                reverberance=random.randrange(50),
                room_scale=random.randrange(50),
                stereo_depth=random.randrange(50)
            )
                .pitch(shift=random.randrange(-300, 300))
                .overdrive(gain=random.randrange(2, 10))
                .speed(random.uniform(0.9, 1.1))
        )

    def __call__(self, dataset, inputs):

        resulted_audio = inputs[0]
        captions = inputs[1]
        del inputs
        gc.collect()
        if np.random.uniform() < self.p:
            resulted_audio = torch.from_numpy(self.effects_chain(resulted_audio.numpy()))
        return resulted_audio, captions

In [4]:
!pip install pysndfx



In [17]:
from typing import List, Tuple
from pathlib import Path
import random

from torch.utils.data import Dataset
import torch
import torchaudio
from numpy import load as np_load, ndarray

import numpy as np

from pympler import muppy, summary
import pandas as pd


__author__ = 'Konstantinos Drossos -- Tampere University, Nikita Kuzmin -- Lomonosov Moscow State University'
__docformat__ = 'reStructuredText'
__all__ = ['ClothoDataset']


class ClothoDataset(Dataset):

    def __init__(self,
                 data_dir: Path,
                 split: str,
                 input_field_name: str,
                 output_field_name: str,
                 load_into_memory: bool,
                 settings_audio,
                 settings_features,
                 online_preprocessing=True,
                 transforms=None) \
            -> None:
        """Initialization of a Clotho dataset object.
        :param data_dir: Data directory with Clotho dataset files.
        :type data_dir: pathlib.Path
        :param split: The split to use (`development`, `validation`)
        :type split: str
        :param input_field_name: Field name for the input values
        :type input_field_name: str
        :param output_field_name: Field name for the output (target) values.
        :type output_field_name: str
        :param load_into_memory: Load the dataset into memory?
        :type load_into_memory: bool
        :param settings_audio: Settings about audio loading
        :type dict
        :param settings_features: Settings about audio processing
        :type dict
        :param indexes: Indexes of files, which depends on validation strategy
        :type indexes: numpy array
        :param transforms: List of transforms
        :type transforms: list
        """

        super(ClothoDataset, self).__init__()
        self.online_preprocessing = online_preprocessing
        the_dir: Path = data_dir.joinpath(split)
        self.split = split

        self.settings_audio = settings_audio
        self.settings_features = settings_features

        #if indexes is None:
        self.examples: List[Path] = sorted(the_dir.iterdir())
        #else:
        #    self.examples: List[Path] = list(np.array(sorted(the_dir.iterdir()))[indexes])
        self.input_name: str = input_field_name
        self.output_name: str = output_field_name
        self.load_into_memory: bool = load_into_memory
        self.transforms = transforms
        self.resampler = torchaudio.transforms.Resample(orig_freq=settings_features['process']['sr'],
                                                        new_freq=settings_features['process']['sr_resample'])
        if load_into_memory:
            self.examples: List[ndarray] = [
                np_load(str(f), allow_pickle=True)
                for f in self.examples]
        self.cnt = 0

    def __len__(self) \
            -> int:
        """Gets the amount of examples in the dataset.
        :return: Amount of examples in the dataset.
        :rtype: int
        """
        return len(self.examples)

    def __getitem__(self,
                    item: int) \
            -> Tuple[ndarray, ndarray, Path]:
        """Gets an example from the dataset.
        :param item: Index of the item.
        :type item: int
        :return: Input and output values, and the Path of the file.
        :rtype: numpy.ndarray. numpy.ndarray, Path
        """

        ex = self.examples[item]
        if not self.load_into_memory:
            ex = np_load(str(ex), allow_pickle=True)
        if self.online_preprocessing:
            in_e = torchaudio.load(Path('data', 'clotho_audio_files', self.split, ex.file_name[0]))[0][0]
            ou_e = ex[self.output_name].item()
        else:
            in_e, ou_e = [ex[i].item()
                          for i in [self.input_name, self.output_name]]
        filename = ex.file_name[0]
        del ex
        if self.transforms is not None:
            for transform in self.transforms:
                in_e, ou_e = transform(dataset=self, inputs=(in_e, ou_e, filename))
        return in_e, ou_e, filename

    def random_sample(self, sample_audio=False):
        """
        Sampling audio or melspectrogram and encoded output
        :return:
        """

        item = random.randint(0, len(self.examples) - 1)
        ex = self.examples[item]
        if not self.load_into_memory:
            ex = np_load(str(ex), allow_pickle=True)
        if sample_audio:
            thedir = Path('./data/clotho_audio_files/').joinpath(self.split)
            filename = Path(thedir, ex.file_name[0])
            in_e = torchaudio.load(filepath=filename)[0][0]
            #in_e = self.resampler.forward(in_e)
            ou_e = ex[self.output_name].item()
        else:
            in_e, ou_e = [ex[i].item()
                          for i in [self.input_name, self.output_name]]

        return in_e, ou_e

In [18]:
from typing import MutableSequence, MutableMapping, Union,\
    Tuple, List
from pathlib import Path

from torch.utils.data import DataLoader
from torch import cat, zeros, from_numpy, ones, Tensor
from numpy import ndarray

#from data_handlers._clotho import ClothoDataset
#from tools.augmentations import MixUp, AudioAugmentation


__author__ = 'Konstantinos Drossos -- Tampere University. Nikita Kuzmin -- Lomonosov Moscow State University'
__docformat__ = 'reStructuredText'
__all__ = ['get_clotho_loader']


def _clotho_collate_fn(batch: MutableSequence[ndarray]) \
        -> Tuple[Tensor, Tensor, List[str]]:
    """Pads data.
    For each batch, the maximum input and output\
    time-steps are calculated. Then, then input and\
    output data are padded to match the maximum time-steps.
    The input data are padded with zeros in front, and\
    the output with] <EOS> tokens at the end.
    :param batch: Batch data of batch x time x features.\
                  First element in the list are the input\
                  data, second the output data.
    :type batch: list[numpy.ndarray]
    :return: Padded data. First tensor is the input data\
             and second the output.
    :rtype: torch.Tensor, torch.Tensor, list[str]
    """
    max_input_t_steps = max([i[0].shape[0] for i in batch])
    max_output_t_steps = max([i[1].shape[0] for i in batch])

    file_names = [i[2] for i in batch]

    #input_features = batch[0][0].shape[-1]
    eos_token = batch[0][1][-1]
    input_tensor = cat([
        cat([zeros(
            max_input_t_steps - i[0].shape[0]).float(),
             i[0].float()]).unsqueeze(0) for i in batch])
    output_tensor = cat([
        cat([
            from_numpy(i[1]).long(),
            ones(max_output_t_steps - len(i[1])).mul(eos_token).long()
        ]).unsqueeze(0) for i in batch])
    return [input_tensor, output_tensor, file_names]


def get_clotho_loader(split: str,
                      is_training: bool,
                      settings_data: MutableMapping[
                          str, Union[str, bool, MutableMapping[str, str]]],
                      settings_io: MutableMapping[
                          str, Union[str, bool, MutableMapping[
                              str, Union[str, MutableMapping[str, str]]]]],
                      settings_features: MutableMapping[
                          str, Union[str, bool, MutableMapping[str, str]]],
                      settings_dataset: MutableMapping[
                          str, Union[str, bool, MutableMapping[str, str]]],
                      ) \
        -> DataLoader:
    """Gets the data loader.
    :param split: Split to be used.
    :type split: str
    :param is_training: Is training data?
    :type is_training: bool
    :param settings_data: Data loading and dataset settings.
    :type settings_data: dict
    :param settings_io: Files I/O settings.
    :type settings_io: dict
    :param settings_features: Audio preprocessing features.
    :type settings_features: dict
    :param settings_dataset: Dataset settings.
    :type settings_dataset: dict
    :param indexes: Indexes of audio files, which depends on validation_strategy.
    :type indexes: numpy array
    :type settings_training: dict
    :return: Data loader.
    :rtype: torch.utils.data.DataLoader
    """
    data_dir = Path(
        settings_io['root_dirs']['data'],
        settings_io['dataset']['features_dirs']['output'])

    transforms = []
    if settings_data['transforms'] == 'None' or (not is_training):
        transforms = None
    else:
        if 'MixUp' in settings_data['transforms']:
            print(settings_features['simple_concat_captions'], 'lalalalalal')
            transforms.append(MixUp(p=settings_data['MixUp_p'],
                              settings_features=settings_features,
                              simple_concat_captions=settings_features['simple_concat_captions'],
                              sample_audio=True))
        if 'another' in settings_data['transforms']:
            transforms.append(AudioAugmentation(p=settings_data['MixUp_p']))

    #if settings_training['validation_strategy']
    dataset = ClothoDataset(
        data_dir=data_dir,
        split=split,
        input_field_name=settings_data['input_field_name'],
        output_field_name=settings_data['output_field_name'],
        load_into_memory=settings_data['load_into_memory'],
        settings_audio=settings_dataset['audio'],
        settings_features=settings_features,
        transforms=transforms)

    shuffle = settings_data['shuffle'] if is_training else False
    drop_last = settings_data['drop_last'] if is_training else False
    if is_training:
        return DataLoader(
            dataset=dataset,
            batch_size=settings_data['batch_size'],
            shuffle=shuffle,
            num_workers=settings_data['num_workers'],
            drop_last=drop_last,
            # pin_memory=True,
            collate_fn=_clotho_collate_fn)
    else:
        return DataLoader(
            dataset=dataset,
            batch_size=40,
            shuffle=shuffle,
            num_workers=2,
            drop_last=drop_last,
            # pin_memory=True,
            collate_fn=_clotho_collate_fn)

In [19]:
config_file='main_settings'
file_ext='yaml'
file_dir='settings' 
settings = file_io.load_yaml_file(Path(
        file_dir, f'{config_file}.{file_ext}'))

In [6]:
from tools.file_io import load_audio_file
from tools import file_io

In [21]:
training_data = get_clotho_loader(
            settings_io['dataset']['features_dirs']['development'],
            is_training=True,
            settings_data=settings_data,
            settings_io=settings_io,
            settings_features=settings_features,
            settings_dataset=settings_dataset)

True lalalalalal


In [23]:
len(training_data)

1525

In [24]:
 =  get_clotho_loader(
            settings_io['dataset']['features_dirs']['evaluation'],
            is_training=False,
            settings_data=settings_data,
            settings_io=settings_io,
            settings_features=settings_features,
            settings_dataset=settings_dataset)

In [25]:
len(evaluation_beam)

131

In [8]:
settings_io=settings['dirs_and_files']

In [17]:
settings_data['transforms']

['MixUp']

In [12]:
settings_io

{'root_dirs': {'outputs': 'outputs', 'data': 'data'},
 'dataset': {'development': 'development',
  'evaluation': 'evaluation',
  'features_dirs': {'output': 'data_splits_mel',
   'development': 'development',
   'evaluation': 'evaluation'},
  'audio_dirs': {'downloaded': 'clotho_audio_files',
   'output': 'data_splits_audio_mel',
   'development': 'development',
   'evaluation': 'evaluation'},
  'annotations_dir': 'clotho_csv_files',
  'pickle_files_dir': 'pickles',
  'files': {'np_file_name_template': 'clotho_file_{audio_file_name}_{caption_index}.npy',
   'words_list_file_name': 'words_list.p',
   'words_counter_file_name': 'words_frequencies.p',
   'characters_list_file_name': 'characters_list.p',
   'characters_frequencies_file_name': 'characters_frequencies.p'}},
 'model': {'model_dir': 'models',
  'checkpoint_model_name': 'dcase_model_baseline.pt',
  'pre_trained_model_name': 'dcase_model_baseline_pre_trained.pt'},
 'logging': {'logger_dir': 'logging',
  'caption_logger_file': 'c

In [9]:
settings_io['dataset']['features_dirs']['development']

'development'

In [10]:
settings_data=settings['dnn_training_settings']['data']

In [11]:
settings_data

{'input_field_name': 'features',
 'output_field_name': 'words_ind',
 'load_into_memory': False,
 'transforms': ['MixUp'],
 'MixUp_p': 0.5,
 'batch_size': 16,
 'shuffle': True,
 'num_workers': 4,
 'drop_last': True}

In [12]:
settings_features=settings['feature_extraction_settings']

In [13]:
settings_features

{'keep_raw_audio_data': False,
 'simple_concat_captions': True,
 'process': {'sr': 44100,
  'sr_resample': 16000,
  'nb_fft': 1024,
  'hop_size': 512,
  'nb_mels': 64,
  'window_function': 'hann',
  'center': True,
  'f_min': 0.0,
  'f_max': None,
  'htk': False,
  'power': 1.0,
  'norm': 1}}

In [14]:
settings_dataset=settings['dataset_creation_settings']

In [15]:
settings_dataset

{'workflow': {'create_dataset': True, 'validate_dataset': False},
 'annotations': {'development_file': 'clotho_captions_development.csv',
  'evaluation_file': 'clotho_captions_evaluation.csv',
  'audio_file_column': 'file_name',
  'captions_fields_prefix': 'caption_{}',
  'use_special_tokens': True,
  'nb_captions': 5,
  'keep_case': False,
  'remove_punctuation_words': True,
  'remove_punctuation_chars': True,
  'use_unique_words_per_caption': False,
  'use_unique_chars_per_caption': False},
 'audio': {'sr': 44100, 'to_mono': True, 'max_abs_value': 1.0}}

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from typing import Tuple, List, AnyStr, Union
from pathlib import Path

from numpy import ndarray, recarray
from torch.utils.data import Dataset
from numpy import load as np_load

import torch
import numpy as np
import os

__author__ = 'Konstantinos Drossos -- Tampere University'
__docformat__ = 'reStructuredText'
__all__ = ['ClothoDataset']


class ClothoDataset(Dataset):

    def __init__(self, data_dir: Path,
                 split: AnyStr,
                 input_field_name: AnyStr,
                 output_field_name: AnyStr,
                 load_into_memory: bool,
                 transforms=transforms) \
            -> None:
        """Initialization of a Clotho dataset object.

        :param data_dir: Directory with data.
        :type data_dir: pathlib.Path
        :param split: Split to use (i.e. 'development', 'evaluation')
        :type split: str
        :param input_field_name: Field name of the clotho data\
                                 to be used as input data to the\
                                 method.
        :type input_field_name: str
        :param output_field_name: Field name of the clotho data\
                                 to be used as output data to the\
                                 method.
        :type output_field_name: str
        :param load_into_memory: Load all data into memory?
        :type load_into_memory: bool
        """
        super(ClothoDataset, self).__init__()
        the_dir: Path = data_dir.joinpath(split)

        self.examples: List[Path] = sorted(the_dir.iterdir())
        self.input_name: str = input_field_name
        self.output_name: str = output_field_name
        self.load_into_memory: bool = load_into_memory
        self.transforms=transforms
        if load_into_memory:
            self.examples: List[recarray] = [np_load(str(f), allow_pickle=True)
                                             for f in self.examples]

    def __len__(self) \
            -> int:
        """Gets the amount of examples in the dataset.

        :return: Amount of examples in the dataset.
        :rtype: int
        """
        return len(self.examples)

    def __getitem__(self,
                    item: int) \
            -> Tuple[ndarray, ndarray]:
        """Gets an example from the dataset.

        :param item: Index of the item.
        :type item: int
        :return: Input and output values.
        :rtype: numpy.ndarray. numpy.ndarray
        """
        ex: Union[Path, recarray] = self.examples[item]
        if not self.load_into_memory:
            ex: recarray = np_load(str(ex), allow_pickle=True)

        in_e, ou_e = [ex[i].item() for i in [self.input_name, self.output_name]]

        return in_e, ou_e


class ClothoDatasetEval(Dataset):

    def __init__(self, data_dir: Path,
                 split: AnyStr,
                 input_field_name: AnyStr,
                 output_field_name: AnyStr,
                 load_into_memory: bool) \
            -> None:
        """Initialization of a Clotho dataset object.

        :param data_dir: Directory with data.
        :type data_dir: pathlib.Path
        :param split: Split to use (i.e. 'development', 'evaluation')
        :type split: str
        :param input_field_name: Field name of the clotho data\
                                 to be used as input data to the\
                                 method.
        :type input_field_name: str
        :param output_field_name: Field name of the clotho data\
                                 to be used as output data to the\
                                 method.
        :type output_field_name: str
        :param load_into_memory: Load all data into memory?
        :type load_into_memory: bool
        """
        super(ClothoDatasetEval, self).__init__()
        the_dir: Path = data_dir.joinpath(split)
        if split == 'evaluation':
            self.examples: List[Path] = sorted(the_dir.iterdir())[::5]  # changed
        else:
            self.examples: List[Path] = sorted(the_dir.iterdir())  # changed
        # self.examples: List[Path] = sorted(the_dir.iterdir())
        self.input_name: str = input_field_name
        self.output_name: str = output_field_name
        self.load_into_memory: bool = load_into_memory
        self.data_dir = the_dir

        if load_into_memory:
            self.examples: List[recarray] = [np_load(str(f), allow_pickle=True)
                                             for f in self.examples]

    def __len__(self) \
            -> int:
        """Gets the amount of examples in the dataset.

        :return: Amount of examples in the dataset.
        :rtype: int
        """
        return len(self.examples)

    def __getitem__(self,
                    item: int):
        """Gets an example from the dataset.

        :param item: Index of the item.
        :type item: int
        :return: Input and output values.
        :rtype: numpy.ndarray. numpy.ndarray
        """
        ex: Union[Path, recarray] = self.examples[item]
        if not self.load_into_memory:
            ex: recarray = np_load(str(ex), allow_pickle=True)

        in_e, ou_e = [ex[i].item() for i in [self.input_name, self.output_name]]

        all_ref = get_all_ref(ex['file_name'].item(), self.data_dir)

        filename = str(ex['file_name'].item())
        out_len = len(ou_e)
        return in_e, ou_e, all_ref, filename,out_len


def get_all_ref(filename, data_dir):
    filename = str(filename)
    # tgt = [np.load(d, allow_pickle=True).words_ind.tolist()
    tgt = [np.load(d, allow_pickle=True)['words_ind'].item().tolist()
           for d in [os.path.join(data_dir, 'clotho_file_{filename}.wav_{i}.npy'.
                                  format(filename=filename[:-4],  # 删除'.wav'
                                         i=i)) for i in range(5)]  # wav_0-wav_4
           ]
    return tgt
# EOF


In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from typing import Callable, Union, Tuple, AnyStr, Optional
from functools import partial
from pathlib import Path

from torch.utils.data.dataloader import DataLoader

from .clotho_dataset import ClothoDataset, ClothoDatasetEval
from .collate_fn import clotho_collate_fn, clotho_collate_fn_eval

__author__ = 'Konstantinos Drossos'
__docformat__ = 'reStructuredText'
__all__ = ['get_clotho_loader']


def get_clotho_loader(data_dir: Path,
                      split: str,
                      input_field_name: str,
                      output_field_name: str,
                      load_into_memory: bool,
                      batch_size: int,
                      nb_t_steps_pad: Union[AnyStr, Tuple[int, int]],
                      shuffle: Optional[bool] = True,
                      drop_last: Optional[bool] = True,
                      input_pad_at: Optional[str] = 'start',
                      output_pad_at: Optional[str] = 'end',
                      num_workers: Optional[int] = 1,
                      return_reference: Optional[bool] = False,
                      augment: Optional[bool] = False) \
        -> DataLoader:
    """Gets the clotho data loader.

    :param return_reference:
    :param data_dir: Directory with data.
    :type data_dir: pathlib.Path
    :param split: Split to use (i.e. 'development', 'evaluation')
    :type split: str
    :param input_field_name: Field name of the clotho data\
                             to be used as input data to the\
                             method.
    :type input_field_name: str
    :param output_field_name: Field name of the clotho data\
                             to be used as output data to the\
                             method.
    :type output_field_name: str
    :param load_into_memory: Load all data into memory?
    :type load_into_memory: bool
    :param batch_size: Batch size to use.
    :type batch_size: int
    :param nb_t_steps_pad: Number of time steps to\
                           pad/truncate to. Cab use\
                           'max', 'min', or exact number\
                           e.g. (1024, 10).
    :type nb_t_steps_pad: str|(int, int)
    :param shuffle: Shuffle examples? Defaults to True.
    :type shuffle: bool, optional
    :param drop_last: Drop the last examples if not making\
                      a batch of `batch_size`? Defaults to True.
    :type drop_last: bool, optional
    :param input_pad_at: Pad input at the start or\
                         at the end?
    :type input_pad_at: str
    :param output_pad_at: Pad output at the start or\
                          at the end?
    :type output_pad_at: str
    :param num_workers: Amount of workers, defaults to 1.
    :type num_workers: int, optional
    :return: Dataloader for Clotho data.
    :rtype: torch.utils.data.dataloader.DataLoader
    """
    if return_reference:
        dataset: ClothoDatasetEval = ClothoDatasetEval(
            data_dir=data_dir, split=split,
            input_field_name=input_field_name,
            output_field_name=output_field_name,
            load_into_memory=load_into_memory
            transforms=trans)

        collate_fn: Callable = partial(
            clotho_collate_fn_eval,
            nb_t_steps=nb_t_steps_pad,
            input_pad_at=input_pad_at,
            output_pad_at=output_pad_at, split=split, augment=augment)
    else:
        dataset: ClothoDataset = ClothoDataset(
            data_dir=data_dir, split=split,
            input_field_name=input_field_name,
            output_field_name=output_field_name,
            load_into_memory=load_into_memory)

        collate_fn: Callable = partial(
            clotho_collate_fn,
            nb_t_steps=nb_t_steps_pad,
            input_pad_at=input_pad_at,
            output_pad_at=output_pad_at)

    return DataLoader(
        dataset=dataset, batch_size=batch_size,
        shuffle=shuffle, num_workers=num_workers,
        drop_last=drop_last, collate_fn=collate_fn)

# EOF

