In [1]:
import numpy as np
import time
import torch
import torch.nn as nn

In [2]:
def init_layer(layer):
    """Initialize a Linear or Convolutional layer. """
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, 'bias'):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)
            
def init_bn(bn):
    """Initialize a Batchnorm layer. """
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.)

resnet

In [3]:
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):

        super(ConvBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels,
                               kernel_size=(3, 3), stride=(1, 1),
                               padding=(1, 1), bias=False)

        self.conv2 = nn.Conv2d(in_channels=out_channels,
                               out_channels=out_channels,
                               kernel_size=(3, 3), stride=(1, 1),
                               padding=(1, 1), bias=False)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        init_layer(self.conv1)
        init_layer(self.conv2)
        init_bn(self.bn1)
        init_bn(self.bn2)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):

        x = input
        x = F.relu_(self.bn1(self.conv1(x)))
        x = F.relu_(self.bn2(self.conv2(x)))
        if pool_type == 'max':
            x = F.max_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg':
            x = F.avg_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg+max':
            x1 = F.avg_pool2d(x, kernel_size=pool_size)
            x2 = F.max_pool2d(x, kernel_size=pool_size)
            x = x1 + x2
        else:
            raise Exception('Incorrect argument!')

        return x

In [4]:
class _ResNet(nn.Module):
    def __init__(self, block, layers, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(_ResNet, self).__init__()

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group

        self.layer1 = self._make_layer(block, 64, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            if stride == 1:
                downsample = nn.Sequential(
                    _resnet_conv1x1(self.inplanes, planes * block.expansion),
                    norm_layer(planes * block.expansion),
                )
                init_layer(downsample[0])
                init_bn(downsample[1])
            elif stride == 2:
                downsample = nn.Sequential(
                    nn.AvgPool2d(kernel_size=2), 
                    _resnet_conv1x1(self.inplanes, planes * block.expansion),
                    norm_layer(planes * block.expansion),
                )
                init_layer(downsample[1])
                init_bn(downsample[2])

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        return x


In [5]:
class _ResnetBottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(_ResnetBottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        self.stride = stride
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = _resnet_conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = _resnet_conv3x3(width, width)
        self.bn2 = norm_layer(width)
        self.conv3 = _resnet_conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

        self.init_weights()

    def init_weights(self):
        init_layer(self.conv1)
        init_bn(self.bn1)
        init_layer(self.conv2)
        init_bn(self.bn2)
        init_layer(self.conv3)
        init_bn(self.bn3)
        nn.init.constant_(self.bn3.weight, 0)

    def forward(self, x):
        identity = x

        if self.stride == 2:
            x = F.avg_pool2d(x, kernel_size=(2, 2))

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = F.dropout(out, p=0.1, training=self.training)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(identity)

        out += identity
        out = self.relu(out)

        return out


In [6]:
def _resnet_conv1x1(in_planes, out_planes):
    #1x1 convolution
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, bias=False)

In [7]:
def _resnet_conv3x3(in_planes, out_planes):
    #3x3 convolution with padding
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=1,
                     padding=1, groups=1, bias=False, dilation=1)

In [8]:
class ResNet54(nn.Module):
    def __init__(self, classes_num=527):
        
        super(ResNet54, self).__init__()

      

        # Spectrogram extractor
       # self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, 
       #     win_length=window_size, window=window, center=center, pad_mode=pad_mode, 
       #     freeze_parameters=True)

        # Logmel feature extractor
        #self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, 
        #   n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, 
        #   freeze_parameters=True)

        # Spec augmenter
        #self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
        #    freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        # self.conv_block2 = ConvBlock(in_channels=64, out_channels=64)

        self.resnet = _ResNet(block=_ResnetBottleneck, layers=[3, 4, 6, 3], zero_init_residual=True)

        self.conv_block_after1 = ConvBlock(in_channels=2048, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weights()

    def init_weights(self):
        init_bn(self.bn0)
        init_layer(self.fc1)
        init_layer(self.fc_audioset)


    def forward(self, input, mixup_lambda=None):
        """
        Input: (batch_size, data_length)"""

        #x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        #x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        x = input.unsqueeze(1)
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        
        #if self.training:
        #   x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)
        
        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        x = self.resnet(x)
        x = F.avg_pool2d(x, kernel_size=(2, 2))
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        x = self.conv_block_after1(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        
        #x = torch.mean(x, dim=3)
        
        #(x1, _) = torch.max(x, dim=2)
       # x2 = torch.mean(x, dim=2)
       # x = x1 + x2
       # x = F.dropout(x, p=0.5, training=self.training)
       # x = F.relu_(self.fc1(x))
       # embedding = F.dropout(x, p=0.5, training=self.training)
       # clipwise_output = torch.sigmoid(self.fc_audioset(x))
        
        #output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding}

        return x



In [9]:
class Transfer_ResNet54(nn.Module):
    def __init__(self, freeze_base, pretrain_checkpoint=None):
        """Classifier for a new task using pretrained Cnn14 as a sub module.
        """
        super(Transfer_ResNet54, self).__init__()

        audioset_classes_num = 527
        self.base = ResNet54(audioset_classes_num)

        #self.init_weights()

        if pretrain_checkpoint:
            self.load_from_pretrain(pretrain_checkpoint)
            
        #self.base.fc_audioset = nn.Linear(2048, classes_num, bias=True)
        # 안쓰이는이유는 multi-class clasification을 생략하기 때문,

        
        
        #if freeze_base:
        #    ct = 0
        #    for child in self.base.children():
        #        ct += 1
        #        if ct < 6:
        #            for param in child.parameters():
        #                print(param)
        #                param.requires_grad = False
        
        if freeze_base:
            # 2단계 freeze / 3단계 freeze X
             #Freeze AudioSet pretrained layers
            for param in self.base.parameters():
                param.requires_grad = False

    #def init_weights(self):
        #init_layer(self.fc_transfer)

    def load_from_pretrain(self, pretrained_checkpoint):
        pretrained_checkpoint="/home/hj20/dcase_2020_T6/models/ResNet54_mAP=0.429.pth"
        checkpoint = torch.load(pretrained_checkpoint)
        checkpoint['model'].pop('spectrogram_extractor.stft.conv_imag.weight')#가중치 삭제  
        checkpoint['model'].pop('spectrogram_extractor.stft.conv_real.weight')#가중치 삭제
        checkpoint['model'].pop('logmel_extractor.melW')#가중치 삭제
        self.base.load_state_dict(checkpoint['model'])

    def forward(self, input):
        """Input: (batch_size, data_length)
        """
        output = self.base(input)

        #embedding = output_dict['embedding']
        #clipwise_output = output_dict['clipwise_output']

        return output #, clipwise_output
 

In [62]:
import re

In [64]:
s = '안녕하세요. 좋은 아침이에요'
m = re.search('^안녕', s)
print(m.group())

안녕


In [60]:
for param in resnet.parameters():
    if re.search('')
    param.requires_grad =  True
    print(param)

Parameter containing:
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       requires_grad=True)
Parameter containing:
tensor([[[[-6.8801e-02,  9.8489e-03,  7.2190e-02],
          [-3.5383e-02,  3.5468e-02, -7.2592e-02],
          [ 7.2946e-02,  1.1742e-02,  5.9962e-02]]],


        [[[ 9.5678e-02,  1.4111e-02,  4.1614e-04],
          [-5.2265e-02, -2.3830e-02,  7.5641e-04],
          [-3.9783e-02,  4.5862e-02, -4.1354e-02]]],


   

In [50]:
params= []
for param in child.parameters():
    params.append(param)

In [43]:
for child in resnet.children():
       conv_block_after1.requires_grad = True

NameError: name 'conv_block_after1' is not defined

In [11]:
resnet=Transfer_ResNet54(freeze_base=True,pretrain_checkpoint="/home/hj20/dcase_2020_T6/models/ResNet54_mAP=0.429.pth")

In [15]:
#model parameter 확인
for name, param in resnet.named_parameters(): 
    print(f'name:{name}') 
    print(type(param)) 
    print(f'param.shape:{param.shape}') 
    print(f'param.requries_grad:{param.requires_grad}') 
    print('=====')

name:base.bn0.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:False
=====
name:base.bn0.bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:False
=====
name:base.conv_block1.conv1.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64, 1, 3, 3])
param.requries_grad:False
=====
name:base.conv_block1.conv2.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64, 64, 3, 3])
param.requries_grad:False
=====
name:base.conv_block1.bn1.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:False
=====
name:base.conv_block1.bn1.bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:False
=====
name:base.conv_block1.bn2.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:False
=====
name:base.conv_block1.bn2.bias
<class 'torch.nn.parameter.Parameter'>
para

In [None]:
str = 'Hello world, Python!'
if str.startswith('Hello'):

In [79]:
type(name)

str

In [14]:
#layer , fc 풀기
for name, param in resnet.named_parameters():
    if name.startswith('base.resnet.layer4.2'):
        param.requires_grad=True
    
    elif name.startswith('base.conv_block_after1'):
        param.requires_grad=True
    
    elif name.startswith('base.fc'):
        param.requires_grad=True
    else:
        param.requires_grad=False

In [14]:
freeze_base=True

In [16]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.transformer import TransformerDecoder,TransformerDecoderLayer

from hparams import hparams as hp
from encoder import Cnn14,Transfer_Cnn14,init_layer


class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, batch_size, dropout=0.5,pretrain_cnn=None,
                 pretrain_emb=None,freeze_cnn=True):
        super(TransformerModel, self).__init__()

        self.model_type = 'resnet+transformer'
        decoder_layers = TransformerDecoderLayer(d_model=nhid, nhead=nhead, dropout=dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layers, nlayers)
        self.word_emb = nn.Embedding(ntoken, nhid)
        self.ninp = ninp
        self.nhid = nhid
        self.fc = nn.Linear(2048, 2048, bias=True)
        self.fc1 = nn.Linear(2048, nhid, bias=True)
        self.dec_fc = nn.Linear(nhid, ntoken)
        self.batch_size = batch_size
        self.ntoken = ntoken

        #def __init__(self, freeze_base, pretrain_checkpoint=None):
        pretrain_cnn="/home/hj20/dcase_2020_T6/models/ResNet54_mAP=0.429.pth"
        
        #self.encoder = Transfer_ResNet54(freeze_base=freeze_cnn, pretrain_checkpoint=pretrain_cnn)
        self.encoder = resnet
        self.dropout = nn.Dropout(dropout)
        self.pos_encoder = PositionalEncoding(nhid, dropout)
        self.generator = nn.Softmax(dim=-1)
        self.init_weights()

        '''
        if pretrain_cnn is not None:
            dict_trained = pretrain_cnn
            dict_new = self.encoder.state_dict().copy()
            new_list = list(self.encoder.state_dict().keys())
            trained_list = list(dict_trained.keys())
            for i in range(len(new_list)):
                dict_new[new_list[i]] = dict_trained[trained_list[i]]
            self.encoder.load_state_dict(dict_new)
        
        if freeze_cnn:
            self.freeze_cnn()
        '''

        if pretrain_emb is not None:
            self.word_emb.weight.data = pretrain_emb

    '''
    def freeze_cnn(self):
        for p in self.encoder.parameters():
            p.requires_grad = False
    '''

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        init_layer(self.fc1)
        init_layer(self.fc)
        self.word_emb.weight.data.uniform_(-initrange, initrange)
        self.dec_fc.bias.data.zero_()
        self.dec_fc.weight.data.uniform_(-initrange, initrange)

    def encode(self, src, input_mask=None):
        global x 
        x = self.encoder(src)  # (batch_size, 2048, T/16, mel_bins/16) ,mixup
        x = torch.mean(x, dim=3)  # (batch_size, 2048, T/16)
        x = x.permute(2, 0, 1)  # (T/16,batch_size,2048)
        x = F.relu_(self.fc(x))
        x = F.dropout(x, p=0.2, training=self.training)
        x = torch.relu(self.fc1(x))
        return x

    def decode(self, mem, tgt, input_mask=None, target_mask=None, target_padding_mask=None):
        # tgt:(batch_size,T_out)
        # mem:(T_mem,batch_size,nhid)

        tgt = tgt.transpose(0, 1)  # (T_out,batch_size)
        if target_mask is None or target_mask.size(0) != len(tgt):
            device = tgt.device
            target_mask = self.generate_square_subsequent_mask(len(tgt)).to(device)

        tgt = self.dropout(self.word_emb(tgt)) * math.sqrt(self.nhid)
        tgt = self.pos_encoder(tgt)
        # mem = self.pos_encoder(mem)
        output = self.transformer_decoder(tgt, mem, memory_mask=input_mask, tgt_mask=target_mask,
                                          tgt_key_padding_mask=target_padding_mask)
        output = self.dec_fc(output)
        return output

    def forward(self, src, tgt, input_mask=None, target_mask=None, target_padding_mask=None):
        # src:(batch_size,T_in,feature_dim)
        # tgt:(batch_size,T_out)
        mem = self.encode(src)
        output = self.decode(mem, tgt, input_mask=input_mask, target_mask=target_mask,
                             target_padding_mask=target_padding_mask)
        return output


In [17]:
import torch
import torch.nn as nn
import time

from data_handling import get_clotho_loader, get_test_data_loader
#from model import TransformerModel  # , RNNModel, RNNModelSmall
import itertools
import numpy as np
import os
import sys
import logging
import csv

from util import get_file_list, get_padding, print_hparams, greedy_decode, \
    calculate_bleu, calculate_spider, LabelSmoothingLoss, beam_search, align_word_embedding, gen_str
from hparams import hparams
from torch.utils.tensorboard import SummaryWriter

import argparse

hp = hparams()
parser = argparse.ArgumentParser(description='hparams for model')

device = torch.device('cuda')
np.random.seed(hp.seed)
torch.manual_seed(hp.seed)

<torch._C.Generator at 0x7f4e441f1990>

In [18]:
pretrain_emb = align_word_embedding(hp.word_dict_pickle_path, hp.pretrain_emb_path, hp.ntoken,
                                        hp.nhid) if hp.load_pretrain_emb else None

In [19]:
model = TransformerModel(hp.ntoken, hp.ninp, hp.nhead, hp.nhid, hp.nlayers, hp.batch_size, dropout=0.2,
                             pretrain_cnn="/home/hj20/dcase_2020_T6/models/ResNet54_mAP=0.429.pth", pretrain_emb=pretrain_emb, freeze_cnn=True).to(device)

In [20]:
model

TransformerModel(
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (linear1): Linear(in_features=192, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=192, bias=True)
        (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
        (dropout3): Dropout(p=0.2, inplace=False)
      )
  

In [21]:
#model parameter 확인
for name, param in model.named_parameters(): 
    print(f'name:{name}') 
    print(type(param)) 
    print(f'param.shape:{param.shape}') 
    print(f'param.requries_grad:{param.requires_grad}') 
    print('=====')

name:transformer_decoder.layers.0.self_attn.in_proj_weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576, 192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.self_attn.in_proj_bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.self_attn.out_proj.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([192, 192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.self_attn.out_proj.bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.multihead_attn.in_proj_weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576, 192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.multihead_attn.in_proj_bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576])
param.requries_grad:True
=====
name:transformer_decoder.

In [22]:
#swa
from torchcontrib.optim import SWA
import torchcontrib

base_opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=hp.lr, weight_decay=1e-6)
optimizer = torchcontrib.optim.SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.0001)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, hp.scheduler_decay)

In [21]:
#swa 안할때
optimizer=torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=hp.lr, weight_decay=1e-6)

scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, hp.scheduler_decay)


In [23]:
data_dir = hp.data_dir
eval_data_dir = hp.eval_data_dir
train_data_dir = hp.train_data_dir
word_dict_pickle_path = hp.word_dict_pickle_path
word_freq_pickle_path = hp.word_freq_pickle_path
test_data_dir = hp.test_data_dir

In [24]:
#mixup
#data_dir = hp.data_dir
#eval_data_dir = hp.eval_data_dir
#train_data_dir = hp.train_data_dir
word_dict_pickle_path = hp.word_dict_pickle_path
word_freq_pickle_path = hp.word_freq_pickle_path
#test_data_dir = hp.test_data_dir

In [24]:
training_data = get_clotho_loader(data_dir=data_dir, split='development',
                                      input_field_name='features',
                                      output_field_name='words_ind',
                                      load_into_memory=False,
                                      batch_size=hp.batch_size,
                                      nb_t_steps_pad='max',
                                      num_workers=4, return_reference=True, augment=hp.spec_augmentation)

In [26]:
#전체 데이터 
from tqdm import tqdm
tqdm(training_data)

  0%|          | 0/3051 [00:00<?, ?it/s]

  0%|          | 0/3051 [00:00<?, ?it/s]

In [97]:
import pickle
#워드 개수 확인
with open('./create_dataset/data/pickles/words_frequencies.p','rb') as f:
    words_freq=pickle.load(f)
words_freq

[24420,
 24739,
 1,
 718,
 4808,
 46,
 16,
 13138,
 17,
 24420,
 45,
 28,
 71,
 329,
 873,
 5,
 7333,
 12184,
 768,
 1,
 97,
 149,
 45,
 168,
 132,
 555,
 1,
 49,
 3225,
 1,
 241,
 1844,
 9147,
 81,
 1,
 991,
 455,
 14,
 7,
 3,
 330,
 1935,
 36,
 12,
 62,
 2,
 3654,
 258,
 90,
 84,
 79,
 2134,
 1,
 5,
 75,
 4060,
 1703,
 40,
 2369,
 468,
 67,
 630,
 2,
 114,
 15,
 5,
 2986,
 1905,
 52,
 481,
 2,
 5,
 315,
 3003,
 121,
 811,
 4,
 2,
 2,
 31,
 2541,
 15,
 13,
 172,
 502,
 567,
 301,
 844,
 1,
 2748,
 2229,
 28,
 60,
 133,
 2,
 423,
 262,
 88,
 52,
 1,
 806,
 282,
 22,
 211,
 41,
 759,
 447,
 338,
 142,
 454,
 2337,
 3,
 5,
 1,
 22,
 1,
 129,
 23,
 268,
 809,
 692,
 630,
 417,
 3,
 148,
 20,
 55,
 91,
 38,
 241,
 2309,
 783,
 4,
 2,
 2,
 4,
 52,
 2,
 134,
 428,
 107,
 25,
 1,
 461,
 11,
 129,
 36,
 87,
 492,
 508,
 7,
 16,
 28,
 61,
 27,
 397,
 40,
 15,
 25,
 117,
 22,
 77,
 873,
 68,
 21,
 5,
 1,
 10,
 44,
 298,
 428,
 29,
 103,
 1259,
 128,
 1404,
 1,
 1149,
 271,
 1,
 1,
 274,
 123,
 5

In [98]:
len(words_freq)

4371

In [25]:
evaluation_beam = get_clotho_loader(data_dir=data_dir, split='evaluation',
                                        input_field_name='features',
                                        output_field_name='words_ind',
                                        load_into_memory=False,
                                        batch_size=32,
                                        nb_t_steps_pad='max',
                                        shuffle=False,
                                        return_reference=True)

In [26]:
test_data = get_test_data_loader(data_dir=test_data_dir,
                                     batch_size=hp.batch_size * 2,
                                     nb_t_steps_pad='max',
                                     shuffle=False,
                                     drop_last=False,
                                     input_pad_at='start',
                                     num_workers=8)

  cpuset_checked))


In [27]:
def train():
    model.train()
    total_loss_text = 0.
    start_time = time.time()
    batch = 0
    for src, tgt, tgt_len,ref in training_data:
        src = src.to(device)
        tgt = tgt.to(device)
        tgt_pad_mask = get_padding(tgt, tgt_len)
        tgt_in = tgt[:, :-1]
        tgt_pad_mask = tgt_pad_mask[:, :-1]
        tgt_y = tgt[:, 1:]

        optimizer.zero_grad()
        
        output = model(src, tgt_in, target_padding_mask=tgt_pad_mask)

        loss_text = criterion(output.contiguous().view(-1, hp.ntoken), tgt_y.transpose(0, 1).contiguous().view(-1))
        loss = loss_text
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), hp.clip_grad)
        
        optimizer.step()
        
    
        total_loss_text += loss_text.item()

        writer.add_scalar('Loss/train-text', loss_text.item(), (epoch - 1) * len(training_data) + batch)
        
        
        batch += 1
        
        if batch % hp.log_interval == 0 and batch > 0:
            mean_text_loss = total_loss_text / hp.log_interval
            elapsed = time.time() - start_time
            current_lr = [param_group['lr'] for param_group in optimizer.param_groups][0]
            logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                         'loss-text {:5.4f}'.format(
                epoch, batch, len(training_data), current_lr,
                elapsed * 1000 / hp.log_interval, mean_text_loss))
            total_loss_text = 0
            start_time = time.time()
        
            optimizer.swap_swa_sgd()

def eval_all(evaluation_data, max_len=30, eos_ind=9, word_dict_pickle_path=None):
    model.eval()
    with torch.no_grad():
        output_sentence_all = []
        ref_all = []
        for src, tgt, _, ref in evaluation_data:
            src = src.to(device)
            output = greedy_decode(model, src, max_len=max_len)

            output_sentence_ind_batch = []
            for i in range(output.size()[0]):
                output_sentence_ind = []
                for j in range(1, output.size(1)):
                    sym = output[i, j]
                    if sym == eos_ind: break
                    output_sentence_ind.append(sym.item())
                output_sentence_ind_batch.append(output_sentence_ind)
            output_sentence_all.extend(output_sentence_ind_batch)
            ref_all.extend(ref)
        score, output_str, ref_str = calculate_spider(output_sentence_all, ref_all, word_dict_pickle_path)

        loss_mean = score
        writer.add_scalar(f'Loss/eval_greddy', loss_mean, epoch)
        msg = f'eval_greddy SPIDEr: {loss_mean:2.4f}'
        logging.info(msg)


def eval_with_beam(evaluation_data, max_len=30, eos_ind=9, word_dict_pickle_path=None, beam_size=3):
    model.eval()
    with torch.no_grad():
        output_sentence_all = []
        ref_all = []
        for src, tgt, _, ref in evaluation_data:
            src = src.to(device)
            output = beam_search(model, src, max_len, start_symbol_ind=0, beam_size=beam_size)

            output_sentence_ind_batch = []
            for single_sample in output:
                output_sentence_ind = []
                for sym in single_sample:
                    if sym == eos_ind: break
                    output_sentence_ind.append(sym.item())
                output_sentence_ind_batch.append(output_sentence_ind)
            output_sentence_all.extend(output_sentence_ind_batch)
            ref_all.extend(ref)

        score, output_str, ref_str = calculate_spider(output_sentence_all, ref_all, word_dict_pickle_path)

        loss_mean = score
        writer.add_scalar(f'Loss/eval_beam', loss_mean, epoch)
        msg = f'eval_beam_{beam_size} SPIDEr: {loss_mean:2.4f}'
        logging.info(msg)


def test_with_beam(test_data, max_len=30, eos_ind=9, beam_size=3):
    model.eval()

    with torch.no_grad():
        with open("test_out.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerow(['file_name', 'caption_predicted'])
            for src, filename in test_data:
                src = src.to(device)
                output = beam_search(model, src, max_len, start_symbol_ind=0, beam_size=beam_size)

                output_sentence_ind_batch = []
                for single_sample in output:
                    output_sentence_ind = []
                    for sym in single_sample:
                        if sym == eos_ind: break
                        output_sentence_ind.append(sym.item())
                    output_sentence_ind_batch.append(output_sentence_ind)
                out_str = gen_str(output_sentence_ind_batch, hp.word_dict_pickle_path)
                for caption, fn in zip(out_str, filename):
                    writer.writerow(['{}.wav'.format(fn), caption])


In [28]:
if hp.label_smoothing:
    criterion = LabelSmoothingLoss(hp.ntoken, smoothing=0.1)
else:
    criterion = nn.CrossEntropyLoss(ignore_index=hp.ntoken - 1)

now_time = str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time())))
log_dir = 'models/{name}'.format(name=hp.name)

writer = SummaryWriter(log_dir=log_dir)

log_path = os.path.join(log_dir, 'train.log')

logging.basicConfig(level=logging.DEBUG,
                        format=
                        '%(asctime)s - %(levelname)s: %(message)s',
                        handlers=[
                            logging.FileHandler(log_path),
                            logging.StreamHandler(sys.stdout)]
                        )


In [29]:
    logging.info(str(model))

    logging.info(str(print_hparams(hp)))

    logging.info('Data loaded!')
    logging.info('Data size: ' + str(len(training_data)))

    logging.info('Total Model parameters: ' + str(sum(p.numel() for p in model.parameters() if p.requires_grad)))

2021-12-03 15:22:55,867 - INFO: TransformerModel(
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (linear1): Linear(in_features=192, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=192, bias=True)
        (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
        (dropout3): Dropout(

2021-12-03 15:22:55,868 - INFO: {'batch_size': 8, 'beam_width': 3, 'checkpoint_save_interval': 5, 'clip_grad': 2.5, 'data_dir': PosixPath('/home/hj20/dcase_2020_T6/create_dataset/data/data_splits'), 'device': 'cuda', 'eval_data_dir': '/home/hj20/dcase_2020_T6/create_dataset/data/data_splits/evaluation', 'freeze_cnn': True, 'label_smoothing': True, 'load_pretrain_cnn': True, 'load_pretrain_emb': False, 'load_pretrain_model': True, 'log_interval': 100, 'lr': 0.0001, 'mode': 'train', 'name': '1203resnet_layer4 ', 'nhead': 4, 'nhid': 192, 'ninp': 64, 'nkeyword': 4979, 'nlayers': 2, 'ntoken': 4371, 'pretrain_cnn_path': '/home/hj20/dcase_2020_T6/models/tag_models/TagModel_45.pt', 'pretrain_emb_path': '/home/hj20/dcase_2020_T6/models/w2v_192.mod', 'pretrain_model_path': '/home/hj20/dcase_2020_T6/models/base/46.pt', 'scheduler_decay': 0.98, 'seed': 1111, 'spec_augmentation': True, 'test_data_dir': '/home/hj20/dcase_2020_T6/create_dataset/data/test_data', 'train_data_dir': '/home/hj20/dcase_202

In [30]:
#일부 레이어 1131
epoch = 1
if hp.mode == 'train':
    while epoch < hp.training_epochs + 1:
        epoch_start_time = time.time()
        train()
        torch.save(model.state_dict(), '{log_dir}/{num_epoch}.pt'.format(log_dir=log_dir, num_epoch=epoch))
        scheduler.step(epoch)
        eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=2)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=3)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=4)
        epoch += 1

2021-12-03 15:23:31,004 - INFO: | epoch   1 |   100/ 3051 batches | lr 1.00e-04 | ms/batch 342.32 | loss-text 5.9000
2021-12-03 15:24:05,339 - INFO: | epoch   1 |   200/ 3051 batches | lr 1.00e-04 | ms/batch 343.34 | loss-text 5.1418
2021-12-03 15:24:39,340 - INFO: | epoch   1 |   300/ 3051 batches | lr 1.00e-04 | ms/batch 340.01 | loss-text 4.9829
2021-12-03 15:25:13,556 - INFO: | epoch   1 |   400/ 3051 batches | lr 1.00e-04 | ms/batch 342.15 | loss-text 4.8647
2021-12-03 15:25:47,947 - INFO: | epoch   1 |   500/ 3051 batches | lr 1.00e-04 | ms/batch 343.90 | loss-text 4.7501
2021-12-03 15:26:22,625 - INFO: | epoch   1 |   600/ 3051 batches | lr 1.00e-04 | ms/batch 346.78 | loss-text 4.6166
2021-12-03 15:26:57,142 - INFO: | epoch   1 |   700/ 3051 batches | lr 1.00e-04 | ms/batch 345.17 | loss-text 4.6930
2021-12-03 15:27:31,530 - INFO: | epoch   1 |   800/ 3051 batches | lr 1.00e-04 | ms/batch 343.88 | loss-text 4.5190
2021-12-03 15:28:06,442 - INFO: | epoch   1 |   900/ 3051 batche



loading annotations into memory...
0:00:00.003999
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 17899, 'reflen': 11908, 'guess': [17899, 16875, 15851, 14827], 'correct': [4862, 1225, 296, 43]}
ratio: 1.5031071548537536
Bleu_1: 0.272
Bleu_2: 0.140
Bleu_3: 0.072
Bleu_4: 0.032
computing METEOR score...
METEOR: 0.108
computing Rouge score...
ROUGE_L: 0.259
computing CIDEr score...
CIDEr: 0.093
computing SPICE score...
SPICE: 0.075
computing SPIDEr score...
SPIDEr: 0.084
2021-12-03 15:41:22,422 - INFO: eval_greddy SPIDEr: 0.0838
loading annotations into memory...
0:00:00.003818
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8918, 'reflen': 9514, 'guess': [8918, 7894, 6870, 5846], 'correct': [4852, 1562,



loading annotations into memory...
0:00:00.003764
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9487, 'reflen': 9787, 'guess': [9487, 8463, 7439, 6415], 'correct': [4635, 1433, 474, 121]}
ratio: 0.9693470930825616
Bleu_1: 0.473
Bleu_2: 0.279
Bleu_3: 0.169
Bleu_4: 0.097
computing METEOR score...
METEOR: 0.130
computing Rouge score...
ROUGE_L: 0.321
computing CIDEr score...
CIDEr: 0.195
computing SPICE score...
SPICE: 0.083
computing SPIDEr score...
SPIDEr: 0.139
2021-12-03 16:01:39,574 - INFO: eval_greddy SPIDEr: 0.1392
loading annotations into memory...
0:00:00.003874
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8493, 'reflen': 9338, 'guess': [8493, 7469, 6445, 5421], 'correct': [4485, 1568, 581,



loading annotations into memory...
0:00:00.004069
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10675, 'reflen': 10516, 'guess': [10675, 9651, 8627, 7603], 'correct': [5163, 1697, 579, 162]}
ratio: 1.0151198174209761
Bleu_1: 0.484
Bleu_2: 0.292
Bleu_3: 0.179
Bleu_4: 0.105
computing METEOR score...
METEOR: 0.141
computing Rouge score...
ROUGE_L: 0.332
computing CIDEr score...
CIDEr: 0.224
computing SPICE score...
SPICE: 0.090
computing SPIDEr score...
SPIDEr: 0.157
2021-12-03 16:21:42,527 - INFO: eval_greddy SPIDEr: 0.1569
loading annotations into memory...
0:00:00.003801
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9146, 'reflen': 9652, 'guess': [9146, 8122, 7098, 6074], 'correct': [5024, 1831, 7



loading annotations into memory...
0:00:00.003717
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 11097, 'reflen': 10725, 'guess': [11097, 10073, 9049, 8025], 'correct': [5202, 1562, 491, 102]}
ratio: 1.0346853146852182
Bleu_1: 0.469
Bleu_2: 0.270
Bleu_3: 0.158
Bleu_4: 0.084
computing METEOR score...
METEOR: 0.138
computing Rouge score...
ROUGE_L: 0.329
computing CIDEr score...
CIDEr: 0.203
computing SPICE score...
SPICE: 0.091
computing SPIDEr score...
SPIDEr: 0.147
2021-12-03 16:41:43,791 - INFO: eval_greddy SPIDEr: 0.1471
loading annotations into memory...
0:00:00.004007
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9088, 'reflen': 9633, 'guess': [9088, 8064, 7040, 6016], 'correct': [4855, 1690, 



loading annotations into memory...
0:00:00.003849
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10353, 'reflen': 10380, 'guess': [10353, 9329, 8305, 7281], 'correct': [5191, 1696, 586, 156]}
ratio: 0.9973988439305397
Bleu_1: 0.500
Bleu_2: 0.301
Bleu_3: 0.185
Bleu_4: 0.108
computing METEOR score...
METEOR: 0.148
computing Rouge score...
ROUGE_L: 0.340
computing CIDEr score...
CIDEr: 0.255
computing SPICE score...
SPICE: 0.097
computing SPIDEr score...
SPIDEr: 0.176
2021-12-03 17:01:45,020 - INFO: eval_greddy SPIDEr: 0.1763
loading annotations into memory...
0:00:00.003823
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8875, 'reflen': 9535, 'guess': [8875, 7851, 6827, 5803], 'correct': [5093, 1853, 7



loading annotations into memory...
0:00:00.003815
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 11396, 'reflen': 10863, 'guess': [11396, 10372, 9348, 8324], 'correct': [5494, 1898, 682, 179]}
ratio: 1.0490656356438324
Bleu_1: 0.482
Bleu_2: 0.297
Bleu_3: 0.186
Bleu_4: 0.108
computing METEOR score...
METEOR: 0.150
computing Rouge score...
ROUGE_L: 0.348
computing CIDEr score...
CIDEr: 0.260
computing SPICE score...
SPICE: 0.100
computing SPIDEr score...
SPIDEr: 0.180
2021-12-03 17:21:45,751 - INFO: eval_greddy SPIDEr: 0.1801
loading annotations into memory...
0:00:00.003908
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9404, 'reflen': 9830, 'guess': [9404, 8380, 7356, 6332], 'correct': [5304, 1944, 



loading annotations into memory...
0:00:00.003889
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10933, 'reflen': 10792, 'guess': [10933, 9909, 8885, 7861], 'correct': [5529, 1858, 664, 180]}
ratio: 1.0130652335062071
Bleu_1: 0.506
Bleu_2: 0.308
Bleu_3: 0.192
Bleu_4: 0.113
computing METEOR score...
METEOR: 0.153
computing Rouge score...
ROUGE_L: 0.346
computing CIDEr score...
CIDEr: 0.261
computing SPICE score...
SPICE: 0.103
computing SPIDEr score...
SPIDEr: 0.182
2021-12-03 17:41:47,499 - INFO: eval_greddy SPIDEr: 0.1820
loading annotations into memory...
0:00:00.003679
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9848, 'reflen': 10116, 'guess': [9848, 8824, 7800, 6776], 'correct': [5400, 1931, 



loading annotations into memory...
0:00:00.003825
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10490, 'reflen': 10509, 'guess': [10490, 9466, 8442, 7418], 'correct': [5499, 1905, 692, 188]}
ratio: 0.9981920258824818
Bleu_1: 0.523
Bleu_2: 0.324
Bleu_3: 0.205
Bleu_4: 0.121
computing METEOR score...
METEOR: 0.156
computing Rouge score...
ROUGE_L: 0.355
computing CIDEr score...
CIDEr: 0.287
computing SPICE score...
SPICE: 0.102
computing SPIDEr score...
SPIDEr: 0.194
2021-12-03 18:01:50,919 - INFO: eval_greddy SPIDEr: 0.1943
loading annotations into memory...
0:00:00.003875
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9375, 'reflen': 9828, 'guess': [9375, 8351, 7327, 6303], 'correct': [5306, 1970, 7



loading annotations into memory...
0:00:00.003850
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10084, 'reflen': 10206, 'guess': [10084, 9060, 8036, 7012], 'correct': [5463, 1929, 734, 201]}
ratio: 0.9880462473054097
Bleu_1: 0.535
Bleu_2: 0.336
Bleu_3: 0.217
Bleu_4: 0.130
computing METEOR score...
METEOR: 0.156
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.301
computing SPICE score...
SPICE: 0.104
computing SPIDEr score...
SPIDEr: 0.203
2021-12-03 18:21:46,774 - INFO: eval_greddy SPIDEr: 0.2026
loading annotations into memory...
0:00:00.003707
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9240, 'reflen': 9762, 'guess': [9240, 8216, 7192, 6168], 'correct': [5397, 2082, 8



loading annotations into memory...
0:00:00.003960
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10849, 'reflen': 10712, 'guess': [10849, 9825, 8801, 7777], 'correct': [5643, 1922, 679, 194]}
ratio: 1.012789395070854
Bleu_1: 0.520
Bleu_2: 0.319
Bleu_3: 0.199
Bleu_4: 0.118
computing METEOR score...
METEOR: 0.158
computing Rouge score...
ROUGE_L: 0.354
computing CIDEr score...
CIDEr: 0.295
computing SPICE score...
SPICE: 0.109
computing SPIDEr score...
SPIDEr: 0.202
2021-12-03 18:41:43,884 - INFO: eval_greddy SPIDEr: 0.2018
loading annotations into memory...
0:00:00.003906
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9689, 'reflen': 9984, 'guess': [9689, 8665, 7641, 6617], 'correct': [5515, 2035, 77



loading annotations into memory...
0:00:00.003920
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10224, 'reflen': 10316, 'guess': [10224, 9200, 8176, 7152], 'correct': [5530, 1920, 721, 223]}
ratio: 0.9910818146567476
Bleu_1: 0.536
Bleu_2: 0.333
Bleu_3: 0.213
Bleu_4: 0.132
computing METEOR score...
METEOR: 0.159
computing Rouge score...
ROUGE_L: 0.361
computing CIDEr score...
CIDEr: 0.314
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.211
2021-12-03 19:01:44,964 - INFO: eval_greddy SPIDEr: 0.2106
loading annotations into memory...
0:00:00.003833
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9305, 'reflen': 9782, 'guess': [9305, 8281, 7257, 6233], 'correct': [5351, 2002, 8



loading annotations into memory...
0:00:00.003883
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10432, 'reflen': 10418, 'guess': [10432, 9408, 8384, 7360], 'correct': [5644, 1907, 686, 216]}
ratio: 1.0013438279899212
Bleu_1: 0.541
Bleu_2: 0.331
Bleu_3: 0.208
Bleu_4: 0.127
computing METEOR score...
METEOR: 0.161
computing Rouge score...
ROUGE_L: 0.358
computing CIDEr score...
CIDEr: 0.323
computing SPICE score...
SPICE: 0.109
computing SPIDEr score...
SPIDEr: 0.216
2021-12-03 19:21:44,290 - INFO: eval_greddy SPIDEr: 0.2159
loading annotations into memory...
0:00:00.003967
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9214, 'reflen': 9682, 'guess': [9214, 8190, 7166, 6142], 'correct': [5354, 1944, 7



loading annotations into memory...
0:00:00.004163
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10032, 'reflen': 10221, 'guess': [10032, 9008, 7984, 6960], 'correct': [5483, 1902, 698, 203]}
ratio: 0.9815086586438723
Bleu_1: 0.536
Bleu_2: 0.333
Bleu_3: 0.212
Bleu_4: 0.129
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.359
computing CIDEr score...
CIDEr: 0.320
computing SPICE score...
SPICE: 0.109
computing SPIDEr score...
SPIDEr: 0.215
2021-12-03 19:41:42,194 - INFO: eval_greddy SPIDEr: 0.2145
loading annotations into memory...
0:00:00.003856
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9107, 'reflen': 9649, 'guess': [9107, 8083, 7059, 6035], 'correct': [5301, 1990, 7



loading annotations into memory...
0:00:00.003890
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10329, 'reflen': 10381, 'guess': [10329, 9305, 8281, 7257], 'correct': [5660, 2092, 779, 226]}
ratio: 0.994990848665736
Bleu_1: 0.545
Bleu_2: 0.349
Bleu_3: 0.225
Bleu_4: 0.137
computing METEOR score...
METEOR: 0.162
computing Rouge score...
ROUGE_L: 0.366
computing CIDEr score...
CIDEr: 0.328
computing SPICE score...
SPICE: 0.109
computing SPIDEr score...
SPIDEr: 0.219
2021-12-03 20:01:41,455 - INFO: eval_greddy SPIDEr: 0.2188
loading annotations into memory...
0:00:00.004014
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9615, 'reflen': 9914, 'guess': [9615, 8591, 7567, 6543], 'correct': [5504, 2070, 82



loading annotations into memory...
0:00:00.003955
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10232, 'reflen': 10297, 'guess': [10232, 9208, 8184, 7160], 'correct': [5534, 1934, 701, 193]}
ratio: 0.9936874817907163
Bleu_1: 0.537
Bleu_2: 0.335
Bleu_3: 0.212
Bleu_4: 0.126
computing METEOR score...
METEOR: 0.158
computing Rouge score...
ROUGE_L: 0.359
computing CIDEr score...
CIDEr: 0.315
computing SPICE score...
SPICE: 0.105
computing SPIDEr score...
SPIDEr: 0.210
2021-12-03 20:21:40,630 - INFO: eval_greddy SPIDEr: 0.2101
loading annotations into memory...
0:00:00.003950
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9469, 'reflen': 9814, 'guess': [9469, 8445, 7421, 6397], 'correct': [5442, 2056, 8



loading annotations into memory...
0:00:00.003919
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10431, 'reflen': 10445, 'guess': [10431, 9407, 8383, 7359], 'correct': [5569, 1959, 719, 225]}
ratio: 0.9986596457634276
Bleu_1: 0.533
Bleu_2: 0.333
Bleu_3: 0.212
Bleu_4: 0.130
computing METEOR score...
METEOR: 0.160
computing Rouge score...
ROUGE_L: 0.358
computing CIDEr score...
CIDEr: 0.326
computing SPICE score...
SPICE: 0.111
computing SPIDEr score...
SPIDEr: 0.219
2021-12-03 20:41:43,932 - INFO: eval_greddy SPIDEr: 0.2186
loading annotations into memory...
0:00:00.003914
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9572, 'reflen': 9883, 'guess': [9572, 8548, 7524, 6500], 'correct': [5406, 2013, 8



loading annotations into memory...
0:00:00.003859
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10139, 'reflen': 10261, 'guess': [10139, 9115, 8091, 7067], 'correct': [5480, 1901, 664, 187]}
ratio: 0.988110320631421
Bleu_1: 0.534
Bleu_2: 0.332
Bleu_3: 0.207
Bleu_4: 0.124
computing METEOR score...
METEOR: 0.159
computing Rouge score...
ROUGE_L: 0.358
computing CIDEr score...
CIDEr: 0.319
computing SPICE score...
SPICE: 0.108
computing SPIDEr score...
SPIDEr: 0.213
2021-12-03 21:01:41,932 - INFO: eval_greddy SPIDEr: 0.2135
loading annotations into memory...
0:00:00.004057
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9373, 'reflen': 9749, 'guess': [9373, 8349, 7325, 6301], 'correct': [5440, 2023, 81



loading annotations into memory...
0:00:00.004051
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10397, 'reflen': 10414, 'guess': [10397, 9373, 8349, 7325], 'correct': [5610, 1954, 727, 218]}
ratio: 0.9983675821009219
Bleu_1: 0.539
Bleu_2: 0.335
Bleu_3: 0.214
Bleu_4: 0.130
computing METEOR score...
METEOR: 0.161
computing Rouge score...
ROUGE_L: 0.360
computing CIDEr score...
CIDEr: 0.336
computing SPICE score...
SPICE: 0.109
computing SPIDEr score...
SPIDEr: 0.222
2021-12-03 21:21:37,851 - INFO: eval_greddy SPIDEr: 0.2224
loading annotations into memory...
0:00:00.003938
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9671, 'reflen': 9964, 'guess': [9671, 8647, 7623, 6599], 'correct': [5456, 1942, 7



loading annotations into memory...
0:00:00.004018
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10581, 'reflen': 10552, 'guess': [10581, 9557, 8533, 7509], 'correct': [5642, 1929, 683, 209]}
ratio: 1.002748294162149
Bleu_1: 0.533
Bleu_2: 0.328
Bleu_3: 0.205
Bleu_4: 0.124
computing METEOR score...
METEOR: 0.160
computing Rouge score...
ROUGE_L: 0.359
computing CIDEr score...
CIDEr: 0.317
computing SPICE score...
SPICE: 0.106
computing SPIDEr score...
SPIDEr: 0.212
2021-12-03 21:41:38,876 - INFO: eval_greddy SPIDEr: 0.2115
loading annotations into memory...
0:00:00.003931
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9784, 'reflen': 9986, 'guess': [9784, 8760, 7736, 6712], 'correct': [5539, 2021, 79



loading annotations into memory...
0:00:00.003830
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10163, 'reflen': 10290, 'guess': [10163, 9140, 8117, 7094], 'correct': [5460, 1918, 685, 201]}
ratio: 0.9876579203108855
Bleu_1: 0.531
Bleu_2: 0.332
Bleu_3: 0.209
Bleu_4: 0.127
computing METEOR score...
METEOR: 0.158
computing Rouge score...
ROUGE_L: 0.355
computing CIDEr score...
CIDEr: 0.313
computing SPICE score...
SPICE: 0.106
computing SPIDEr score...
SPIDEr: 0.210
2021-12-03 22:01:35,756 - INFO: eval_greddy SPIDEr: 0.2096
loading annotations into memory...
0:00:00.003912
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9417, 'reflen': 9806, 'guess': [9417, 8393, 7369, 6345], 'correct': [5308, 1992, 8



loading annotations into memory...
0:00:00.003751
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9948, 'reflen': 10118, 'guess': [9948, 8924, 7900, 6876], 'correct': [5501, 1905, 695, 218]}
ratio: 0.9831982605256984
Bleu_1: 0.544
Bleu_2: 0.338
Bleu_3: 0.214
Bleu_4: 0.132
computing METEOR score...
METEOR: 0.160
computing Rouge score...
ROUGE_L: 0.361
computing CIDEr score...
CIDEr: 0.330
computing SPICE score...
SPICE: 0.111
computing SPIDEr score...
SPIDEr: 0.220
2021-12-03 22:21:31,178 - INFO: eval_greddy SPIDEr: 0.2203
loading annotations into memory...
0:00:00.003969
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9329, 'reflen': 9703, 'guess': [9329, 8305, 7281, 6257], 'correct': [5422, 2003, 805



loading annotations into memory...
0:00:00.003888
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10496, 'reflen': 10509, 'guess': [10496, 9472, 8448, 7424], 'correct': [5749, 2023, 750, 229]}
ratio: 0.9987629650774575
Bleu_1: 0.547
Bleu_2: 0.342
Bleu_3: 0.218
Bleu_4: 0.134
computing METEOR score...
METEOR: 0.164
computing Rouge score...
ROUGE_L: 0.363
computing CIDEr score...
CIDEr: 0.342
computing SPICE score...
SPICE: 0.112
computing SPIDEr score...
SPIDEr: 0.227
2021-12-03 22:41:27,848 - INFO: eval_greddy SPIDEr: 0.2273
loading annotations into memory...
0:00:00.003827
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9678, 'reflen': 9912, 'guess': [9678, 8654, 7630, 6606], 'correct': [5445, 1991, 7



loading annotations into memory...
0:00:00.003840
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9981, 'reflen': 10136, 'guess': [9981, 8957, 7933, 6909], 'correct': [5419, 1836, 645, 202]}
ratio: 0.9847079715863274
Bleu_1: 0.535
Bleu_2: 0.328
Bleu_3: 0.205
Bleu_4: 0.126
computing METEOR score...
METEOR: 0.159
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.324
computing SPICE score...
SPICE: 0.106
computing SPIDEr score...
SPIDEr: 0.215
2021-12-03 23:01:26,867 - INFO: eval_greddy SPIDEr: 0.2148
loading annotations into memory...
0:00:00.004195
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9212, 'reflen': 9666, 'guess': [9212, 8188, 7164, 6140], 'correct': [5283, 1933, 768



loading annotations into memory...
0:00:00.003933
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10469, 'reflen': 10407, 'guess': [10469, 9445, 8421, 7397], 'correct': [5619, 1931, 704, 216]}
ratio: 1.0059575285864315
Bleu_1: 0.537
Bleu_2: 0.331
Bleu_3: 0.209
Bleu_4: 0.128
computing METEOR score...
METEOR: 0.162
computing Rouge score...
ROUGE_L: 0.359
computing CIDEr score...
CIDEr: 0.332
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.220
2021-12-03 23:21:21,639 - INFO: eval_greddy SPIDEr: 0.2195
loading annotations into memory...
0:00:00.003924
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9666, 'reflen': 9927, 'guess': [9666, 8642, 7618, 6594], 'correct': [5471, 1995, 7



loading annotations into memory...
0:00:00.003850
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10257, 'reflen': 10294, 'guess': [10257, 9233, 8209, 7185], 'correct': [5447, 1837, 659, 209]}
ratio: 0.996405673207597
Bleu_1: 0.529
Bleu_2: 0.324
Bleu_3: 0.203
Bleu_4: 0.125
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.350
computing CIDEr score...
CIDEr: 0.320
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.214
2021-12-03 23:41:18,919 - INFO: eval_greddy SPIDEr: 0.2135
loading annotations into memory...
0:00:00.003877
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9534, 'reflen': 9836, 'guess': [9534, 8510, 7486, 6462], 'correct': [5342, 1934, 74



loading annotations into memory...
0:00:00.003814
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10267, 'reflen': 10335, 'guess': [10267, 9243, 8219, 7195], 'correct': [5641, 1939, 711, 238]}
ratio: 0.9934204160618293
Bleu_1: 0.546
Bleu_2: 0.337
Bleu_3: 0.214
Bleu_4: 0.134
computing METEOR score...
METEOR: 0.163
computing Rouge score...
ROUGE_L: 0.361
computing CIDEr score...
CIDEr: 0.343
computing SPICE score...
SPICE: 0.110
computing SPIDEr score...
SPIDEr: 0.226
2021-12-04 00:01:16,005 - INFO: eval_greddy SPIDEr: 0.2263
loading annotations into memory...
0:00:00.008650
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9642, 'reflen': 9953, 'guess': [9642, 8618, 7594, 6570], 'correct': [5520, 2037, 8



loading annotations into memory...
0:00:00.003754
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10147, 'reflen': 10247, 'guess': [10147, 9123, 8099, 7075], 'correct': [5606, 1984, 741, 224]}
ratio: 0.990241046159755
Bleu_1: 0.547
Bleu_2: 0.343
Bleu_3: 0.220
Bleu_4: 0.135
computing METEOR score...
METEOR: 0.164
computing Rouge score...
ROUGE_L: 0.362
computing CIDEr score...
CIDEr: 0.351
computing SPICE score...
SPICE: 0.110
computing SPIDEr score...
SPIDEr: 0.231
2021-12-04 00:21:12,013 - INFO: eval_greddy SPIDEr: 0.2306
loading annotations into memory...
0:00:00.003998
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9545, 'reflen': 9851, 'guess': [9545, 8521, 7497, 6473], 'correct': [5417, 2004, 78



loading annotations into memory...
0:00:00.003971
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10251, 'reflen': 10356, 'guess': [10251, 9227, 8203, 7179], 'correct': [5471, 1867, 659, 187]}
ratio: 0.9898609501737167
Bleu_1: 0.528
Bleu_2: 0.325
Bleu_3: 0.203
Bleu_4: 0.121
computing METEOR score...
METEOR: 0.158
computing Rouge score...
ROUGE_L: 0.353
computing CIDEr score...
CIDEr: 0.317
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.212
2021-12-04 00:41:07,866 - INFO: eval_greddy SPIDEr: 0.2122
loading annotations into memory...
0:00:00.003931
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9586, 'reflen': 9878, 'guess': [9586, 8562, 7538, 6514], 'correct': [5427, 1997, 7



loading annotations into memory...
0:00:00.003845
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10159, 'reflen': 10277, 'guess': [10159, 9135, 8111, 7087], 'correct': [5359, 1811, 655, 202]}
ratio: 0.9885180500144994
Bleu_1: 0.521
Bleu_2: 0.320
Bleu_3: 0.201
Bleu_4: 0.123
computing METEOR score...
METEOR: 0.158
computing Rouge score...
ROUGE_L: 0.350
computing CIDEr score...
CIDEr: 0.330
computing SPICE score...
SPICE: 0.108
computing SPIDEr score...
SPIDEr: 0.219
2021-12-04 01:01:03,916 - INFO: eval_greddy SPIDEr: 0.2186
loading annotations into memory...
0:00:00.003846
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9383, 'reflen': 9771, 'guess': [9383, 8359, 7335, 6311], 'correct': [5299, 1895, 7



loading annotations into memory...
0:00:00.003831
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10104, 'reflen': 10187, 'guess': [10104, 9080, 8056, 7032], 'correct': [5500, 1891, 714, 210]}
ratio: 0.9918523608519689
Bleu_1: 0.540
Bleu_2: 0.334
Bleu_3: 0.214
Bleu_4: 0.131
computing METEOR score...
METEOR: 0.160
computing Rouge score...
ROUGE_L: 0.361
computing CIDEr score...
CIDEr: 0.332
computing SPICE score...
SPICE: 0.106
computing SPIDEr score...
SPIDEr: 0.219
2021-12-04 01:21:01,312 - INFO: eval_greddy SPIDEr: 0.2191
loading annotations into memory...
0:00:00.004060
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9468, 'reflen': 9799, 'guess': [9468, 8444, 7420, 6396], 'correct': [5357, 1961, 7



loading annotations into memory...
0:00:00.003736
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10288, 'reflen': 10373, 'guess': [10288, 9264, 8240, 7216], 'correct': [5606, 2013, 737, 219]}
ratio: 0.9918056492816936
Bleu_1: 0.540
Bleu_2: 0.341
Bleu_3: 0.218
Bleu_4: 0.133
computing METEOR score...
METEOR: 0.164
computing Rouge score...
ROUGE_L: 0.362
computing CIDEr score...
CIDEr: 0.343
computing SPICE score...
SPICE: 0.109
computing SPIDEr score...
SPIDEr: 0.226
2021-12-04 01:40:57,735 - INFO: eval_greddy SPIDEr: 0.2259
loading annotations into memory...
0:00:00.004216
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9499, 'reflen': 9837, 'guess': [9499, 8475, 7451, 6427], 'correct': [5425, 2031, 8



loading annotations into memory...
0:00:00.003780
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10262, 'reflen': 10347, 'guess': [10262, 9238, 8214, 7190], 'correct': [5558, 1910, 684, 210]}
ratio: 0.9917850584709585
Bleu_1: 0.537
Bleu_2: 0.332
Bleu_3: 0.209
Bleu_4: 0.127
computing METEOR score...
METEOR: 0.158
computing Rouge score...
ROUGE_L: 0.358
computing CIDEr score...
CIDEr: 0.335
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.221
2021-12-04 02:00:55,488 - INFO: eval_greddy SPIDEr: 0.2213
loading annotations into memory...
0:00:00.003806
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9434, 'reflen': 9779, 'guess': [9434, 8410, 7386, 6362], 'correct': [5357, 1935, 7



loading annotations into memory...
0:00:00.003955
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10206, 'reflen': 10323, 'guess': [10206, 9182, 8158, 7134], 'correct': [5577, 1952, 731, 223]}
ratio: 0.9886660854401832
Bleu_1: 0.540
Bleu_2: 0.337
Bleu_3: 0.216
Bleu_4: 0.133
computing METEOR score...
METEOR: 0.161
computing Rouge score...
ROUGE_L: 0.361
computing CIDEr score...
CIDEr: 0.343
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.225
2021-12-04 02:20:53,246 - INFO: eval_greddy SPIDEr: 0.2249
loading annotations into memory...
0:00:00.003847
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9475, 'reflen': 9810, 'guess': [9475, 8451, 7427, 6403], 'correct': [5404, 1983, 7



loading annotations into memory...
0:00:00.003916
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10197, 'reflen': 10303, 'guess': [10197, 9173, 8149, 7125], 'correct': [5542, 1890, 673, 213]}
ratio: 0.9897117344461817
Bleu_1: 0.538
Bleu_2: 0.331
Bleu_3: 0.208
Bleu_4: 0.128
computing METEOR score...
METEOR: 0.161
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.346
computing SPICE score...
SPICE: 0.108
computing SPIDEr score...
SPIDEr: 0.227
2021-12-04 02:40:52,084 - INFO: eval_greddy SPIDEr: 0.2271
loading annotations into memory...
0:00:00.003933
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9445, 'reflen': 9810, 'guess': [9445, 8421, 7397, 6373], 'correct': [5362, 1907, 7



loading annotations into memory...
0:00:00.003859
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10278, 'reflen': 10320, 'guess': [10278, 9254, 8230, 7206], 'correct': [5571, 1972, 707, 202]}
ratio: 0.995930232558043
Bleu_1: 0.540
Bleu_2: 0.338
Bleu_3: 0.214
Bleu_4: 0.129
computing METEOR score...
METEOR: 0.164
computing Rouge score...
ROUGE_L: 0.360
computing CIDEr score...
CIDEr: 0.349
computing SPICE score...
SPICE: 0.113
computing SPIDEr score...
SPIDEr: 0.231
2021-12-04 03:00:51,807 - INFO: eval_greddy SPIDEr: 0.2309
loading annotations into memory...
0:00:00.003878
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9579, 'reflen': 9867, 'guess': [9579, 8555, 7531, 6507], 'correct': [5432, 2006, 78



loading annotations into memory...
0:00:00.003896
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10170, 'reflen': 10288, 'guess': [10170, 9146, 8122, 7098], 'correct': [5585, 1956, 726, 238]}
ratio: 0.9885303265939941
Bleu_1: 0.543
Bleu_2: 0.339
Bleu_3: 0.216
Bleu_4: 0.135
computing METEOR score...
METEOR: 0.164
computing Rouge score...
ROUGE_L: 0.362
computing CIDEr score...
CIDEr: 0.350
computing SPICE score...
SPICE: 0.113
computing SPIDEr score...
SPIDEr: 0.232
2021-12-04 03:20:52,515 - INFO: eval_greddy SPIDEr: 0.2318
loading annotations into memory...
0:00:00.003995
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9477, 'reflen': 9834, 'guess': [9477, 8453, 7429, 6405], 'correct': [5381, 1999, 7



loading annotations into memory...
0:00:00.003842
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10338, 'reflen': 10363, 'guess': [10338, 9314, 8290, 7266], 'correct': [5541, 1945, 686, 189]}
ratio: 0.9975875711665543
Bleu_1: 0.535
Bleu_2: 0.334
Bleu_3: 0.210
Bleu_4: 0.124
computing METEOR score...
METEOR: 0.163
computing Rouge score...
ROUGE_L: 0.355
computing CIDEr score...
CIDEr: 0.326
computing SPICE score...
SPICE: 0.113
computing SPIDEr score...
SPIDEr: 0.219
2021-12-04 03:40:55,170 - INFO: eval_greddy SPIDEr: 0.2193
loading annotations into memory...
0:00:00.004046
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9618, 'reflen': 9903, 'guess': [9618, 8594, 7570, 6546], 'correct': [5342, 2005, 7



loading annotations into memory...
0:00:00.003979
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10406, 'reflen': 10408, 'guess': [10406, 9382, 8358, 7334], 'correct': [5685, 1993, 724, 194]}
ratio: 0.9998078401228863
Bleu_1: 0.546
Bleu_2: 0.341
Bleu_3: 0.216
Bleu_4: 0.128
computing METEOR score...
METEOR: 0.165
computing Rouge score...
ROUGE_L: 0.358
computing CIDEr score...
CIDEr: 0.351
computing SPICE score...
SPICE: 0.113
computing SPIDEr score...
SPIDEr: 0.232
2021-12-04 04:00:58,353 - INFO: eval_greddy SPIDEr: 0.2319
loading annotations into memory...
0:00:00.003942
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9668, 'reflen': 9956, 'guess': [9668, 8644, 7620, 6596], 'correct': [5427, 1937, 7



loading annotations into memory...
0:00:00.003896
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10307, 'reflen': 10398, 'guess': [10307, 9283, 8259, 7235], 'correct': [5533, 1925, 729, 245]}
ratio: 0.99124831698394
Bleu_1: 0.532
Bleu_2: 0.331
Bleu_3: 0.212
Bleu_4: 0.134
computing METEOR score...
METEOR: 0.164
computing Rouge score...
ROUGE_L: 0.354
computing CIDEr score...
CIDEr: 0.348
computing SPICE score...
SPICE: 0.110
computing SPIDEr score...
SPIDEr: 0.229
2021-12-04 04:21:01,674 - INFO: eval_greddy SPIDEr: 0.2290
loading annotations into memory...
0:00:00.004212
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9508, 'reflen': 9854, 'guess': [9508, 8484, 7460, 6436], 'correct': [5314, 1924, 743



loading annotations into memory...
0:00:00.003791
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10187, 'reflen': 10274, 'guess': [10187, 9163, 8139, 7115], 'correct': [5481, 1926, 730, 231]}
ratio: 0.9915320225811766
Bleu_1: 0.533
Bleu_2: 0.333
Bleu_3: 0.215
Bleu_4: 0.134
computing METEOR score...
METEOR: 0.160
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.344
computing SPICE score...
SPICE: 0.109
computing SPIDEr score...
SPIDEr: 0.226
2021-12-04 04:41:02,448 - INFO: eval_greddy SPIDEr: 0.2265
loading annotations into memory...
0:00:00.003932
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9403, 'reflen': 9803, 'guess': [9403, 8379, 7355, 6331], 'correct': [5303, 1983, 7



loading annotations into memory...
0:00:00.003965
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10167, 'reflen': 10249, 'guess': [10167, 9143, 8119, 7095], 'correct': [5547, 1933, 689, 199]}
ratio: 0.9919992194359457
Bleu_1: 0.541
Bleu_2: 0.337
Bleu_3: 0.212
Bleu_4: 0.128
computing METEOR score...
METEOR: 0.164
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.340
computing SPICE score...
SPICE: 0.109
computing SPIDEr score...
SPIDEr: 0.225
2021-12-04 05:01:04,123 - INFO: eval_greddy SPIDEr: 0.2248
loading annotations into memory...
0:00:00.003950
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9464, 'reflen': 9801, 'guess': [9464, 8440, 7416, 6392], 'correct': [5318, 1959, 7



loading annotations into memory...
0:00:00.003897
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10428, 'reflen': 10456, 'guess': [10428, 9404, 8380, 7356], 'correct': [5680, 1976, 725, 213]}
ratio: 0.997322111706102
Bleu_1: 0.543
Bleu_2: 0.337
Bleu_3: 0.214
Bleu_4: 0.130
computing METEOR score...
METEOR: 0.164
computing Rouge score...
ROUGE_L: 0.361
computing CIDEr score...
CIDEr: 0.353
computing SPICE score...
SPICE: 0.113
computing SPIDEr score...
SPIDEr: 0.233
2021-12-04 05:21:04,490 - INFO: eval_greddy SPIDEr: 0.2333
loading annotations into memory...
0:00:00.003912
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9703, 'reflen': 9983, 'guess': [9703, 8679, 7655, 6631], 'correct': [5481, 1993, 76



loading annotations into memory...
0:00:00.003888
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10520, 'reflen': 10474, 'guess': [10520, 9496, 8472, 7448], 'correct': [5600, 1921, 698, 193]}
ratio: 1.004391827381993
Bleu_1: 0.532
Bleu_2: 0.328
Bleu_3: 0.207
Bleu_4: 0.123
computing METEOR score...
METEOR: 0.160
computing Rouge score...
ROUGE_L: 0.355
computing CIDEr score...
CIDEr: 0.328
computing SPICE score...
SPICE: 0.110
computing SPIDEr score...
SPIDEr: 0.219
2021-12-04 05:41:06,549 - INFO: eval_greddy SPIDEr: 0.2192
loading annotations into memory...
0:00:00.003778
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9788, 'reflen': 10007, 'guess': [9788, 8764, 7740, 6716], 'correct': [5347, 1878, 7



loading annotations into memory...
0:00:00.004416
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10473, 'reflen': 10421, 'guess': [10473, 9449, 8425, 7401], 'correct': [5677, 2003, 719, 204]}
ratio: 1.0049899241914397
Bleu_1: 0.542
Bleu_2: 0.339
Bleu_3: 0.214
Bleu_4: 0.128
computing METEOR score...
METEOR: 0.164
computing Rouge score...
ROUGE_L: 0.361
computing CIDEr score...
CIDEr: 0.344
computing SPICE score...
SPICE: 0.112
computing SPIDEr score...
SPIDEr: 0.228
2021-12-04 06:01:10,809 - INFO: eval_greddy SPIDEr: 0.2279
loading annotations into memory...
0:00:00.003965
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9638, 'reflen': 9907, 'guess': [9638, 8614, 7590, 6566], 'correct': [5464, 2027, 7



loading annotations into memory...
0:00:00.003838
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10378, 'reflen': 10441, 'guess': [10378, 9354, 8330, 7306], 'correct': [5580, 1965, 724, 204]}
ratio: 0.9939660952015138
Bleu_1: 0.534
Bleu_2: 0.334
Bleu_3: 0.213
Bleu_4: 0.128
computing METEOR score...
METEOR: 0.161
computing Rouge score...
ROUGE_L: 0.355
computing CIDEr score...
CIDEr: 0.344
computing SPICE score...
SPICE: 0.110
computing SPIDEr score...
SPIDEr: 0.227
2021-12-04 06:21:12,363 - INFO: eval_greddy SPIDEr: 0.2270
loading annotations into memory...
0:00:00.004229
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9694, 'reflen': 9978, 'guess': [9694, 8670, 7646, 6622], 'correct': [5410, 1988, 7



loading annotations into memory...
0:00:00.003766
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10361, 'reflen': 10388, 'guess': [10361, 9337, 8313, 7289], 'correct': [5574, 1921, 698, 213]}
ratio: 0.9974008471312092
Bleu_1: 0.537
Bleu_2: 0.332
Bleu_3: 0.210
Bleu_4: 0.128
computing METEOR score...
METEOR: 0.163
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.349
computing SPICE score...
SPICE: 0.110
computing SPIDEr score...
SPIDEr: 0.230
2021-12-04 06:41:13,699 - INFO: eval_greddy SPIDEr: 0.2296
loading annotations into memory...
0:00:00.003939
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9557, 'reflen': 9832, 'guess': [9557, 8533, 7509, 6485], 'correct': [5280, 1857, 6



loading annotations into memory...
0:00:00.003932
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10458, 'reflen': 10484, 'guess': [10458, 9434, 8410, 7386], 'correct': [5585, 1929, 698, 205]}
ratio: 0.9975200305226061
Bleu_1: 0.533
Bleu_2: 0.330
Bleu_3: 0.208
Bleu_4: 0.126
computing METEOR score...
METEOR: 0.164
computing Rouge score...
ROUGE_L: 0.355
computing CIDEr score...
CIDEr: 0.344
computing SPICE score...
SPICE: 0.112
computing SPIDEr score...
SPIDEr: 0.228
2021-12-04 07:01:16,897 - INFO: eval_greddy SPIDEr: 0.2282
loading annotations into memory...
0:00:00.003680
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9731, 'reflen': 9971, 'guess': [9731, 8707, 7683, 6659], 'correct': [5332, 1920, 7



loading annotations into memory...
0:00:00.004070
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10604, 'reflen': 10567, 'guess': [10604, 9580, 8556, 7532], 'correct': [5568, 1909, 682, 195]}
ratio: 1.0035014668306044
Bleu_1: 0.525
Bleu_2: 0.323
Bleu_3: 0.203
Bleu_4: 0.121
computing METEOR score...
METEOR: 0.161
computing Rouge score...
ROUGE_L: 0.349
computing CIDEr score...
CIDEr: 0.330
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.218
2021-12-04 07:21:20,502 - INFO: eval_greddy SPIDEr: 0.2181
loading annotations into memory...
0:00:00.003988
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9820, 'reflen': 10068, 'guess': [9820, 8796, 7772, 6748], 'correct': [5364, 1930, 



loading annotations into memory...
0:00:00.003800
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10199, 'reflen': 10298, 'guess': [10199, 9175, 8151, 7127], 'correct': [5430, 1796, 617, 157]}
ratio: 0.9903864828121003
Bleu_1: 0.527
Bleu_2: 0.320
Bleu_3: 0.197
Bleu_4: 0.114
computing METEOR score...
METEOR: 0.161
computing Rouge score...
ROUGE_L: 0.351
computing CIDEr score...
CIDEr: 0.327
computing SPICE score...
SPICE: 0.108
computing SPIDEr score...
SPIDEr: 0.217
2021-12-04 07:41:23,819 - INFO: eval_greddy SPIDEr: 0.2174
loading annotations into memory...
0:00:00.003973
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9447, 'reflen': 9809, 'guess': [9447, 8423, 7399, 6375], 'correct': [5219, 1831, 6



loading annotations into memory...
0:00:00.003959
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10298, 'reflen': 10363, 'guess': [10298, 9274, 8250, 7226], 'correct': [5575, 1922, 713, 235]}
ratio: 0.9937276850331955
Bleu_1: 0.538
Bleu_2: 0.333
Bleu_3: 0.212
Bleu_4: 0.132
computing METEOR score...
METEOR: 0.165
computing Rouge score...
ROUGE_L: 0.361
computing CIDEr score...
CIDEr: 0.355
computing SPICE score...
SPICE: 0.112
computing SPIDEr score...
SPIDEr: 0.234
2021-12-04 08:01:25,650 - INFO: eval_greddy SPIDEr: 0.2337
loading annotations into memory...
0:00:00.003650
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9574, 'reflen': 9835, 'guess': [9574, 8550, 7526, 6502], 'correct': [5343, 1919, 7

In [56]:
#mixup
epoch = 1
if hp.mode == 'train':
    while epoch < hp.training_epochs + 1:
        epoch_start_time = time.time()
        train()
        torch.save(model.state_dict(), '{log_dir}/{num_epoch}.pt'.format(log_dir=log_dir, num_epoch=epoch))
        scheduler.step(epoch)
        eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=2)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=3)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                           beam_size=4)
        epoch += 1

ValueError: not enough values to unpack (expected 4, got 3)

In [60]:
for src, tgt, tgt_len in training_data:
    src = src.to(device)
    tgt = tgt.to(device)

KeyboardInterrupt: 

In [55]:
hp.training_epochs

30

epoch=37 eval_beam_3 SPIDEr: 0.2344 # 2개 layer 만 trainable -06/9  
 SPIDEr: # 5개 layer 만 trainable -06/10 0.2252
별 차이 없음 ;;;;;


model score check (eval)

In [16]:
#if hp.mode == 'eval':
# Evaluation model score
model.load_state_dict(torch.load("./models/base/48.pt"))
eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)
eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                       beam_size=2)
eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                       beam_size=3)
eval_with_beam(evaluation_beam, max_len=30, eos_ind=9, word_dict_pickle_path=word_dict_pickle_path,
                       beam_size=4)

FileNotFoundError: [Errno 2] No such file or directory: './models/base/48.pt'

In [18]:
model.load_state_dict(torch.load("./models/base/49.pt"))

<All keys matched successfully>

In [None]:
class Mixup(object):
    def __init__(self, mixup_alpha, random_seed=1234):
        """Mixup coefficient generator.
        """
        self.mixup_alpha = mixup_alpha
        self.random_state = np.random.RandomState(random_seed)

    def get_lambda(self, batch_size):
        """Get mixup random coefficients.
        Args:
          batch_size: int
        Returns:
          mixup_lambdas: (batch_size,)
        """
        mixup_lambdas = []
        for n in range(0, batch_size, 2):
            lam = self.random_state.beta(self.mixup_alpha, self.mixup_alpha, 1)[0]
            mixup_lambdas.append(lam)
            mixup_lambdas.append(1. - lam)

        return np.array(mixup_lambdas)


In [None]:
def do_mixup(x, mixup_lambda):
    """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes 
    (1, 3, 5, ...).
    Args:
      x: (batch_size * 2, ...)
      mixup_lambda: (batch_size * 2,)
    Returns:
      out: (batch_size, ...)
    """
    out = (x[0 :: 2].transpose(0, -1) * mixup_lambda[0 :: 2] + \
        x[1 :: 2].transpose(0, -1) * mixup_lambda[1 :: 2]).transpose(0, -1)
    return out

In [1]:
import numpy as np
from librosa.feature import melspectrogram
from librosa.feature.inverse import mel_to_audio, mel_to_stft

__author__ = 'Konstantinos Drossos -- Tampere University, Nikita Kuzmin -- Lomonosov Moscow State University'
__docformat__ = 'reStructuredText'
__all__ = ['feature_extraction']


def feature_extraction(audio_data: np.ndarray,
                       sr: int,
                       nb_fft: int,
                       hop_size: int,
                       nb_mels: int,
                       f_min: float,
                       f_max: float,
                       htk: bool,
                       power: float,
                       norm: bool,
                       window_function: str,
                       center: bool)\
        -> (np.ndarray, np.float):
    """Feature extraction function.
    :param audio_data: Audio signal.
    :type audio_data: numpy.ndarray
    :param sr: Sampling frequency.
    :type sr: int
    :param nb_fft: Amount of FFT points.
    :type nb_fft: int
    :param hop_size: Hop size in samples.
    :type hop_size: int
    :param nb_mels: Amount of MEL bands.
    :type nb_mels: int
    :param f_min: Minimum frequency in Hertz for MEL band calculation.
    :type f_min: float
    :param f_max: Maximum frequency in Hertz for MEL band calculation.
    :type f_max: float|None
    :param htk: Use the HTK Toolbox formula instead of Auditory toolkit.
    :type htk: bool
    :param power: Power of the magnitude.
    :type power: float
    :param norm: Area normalization of MEL filters.
    :type norm: bool
    :param window_function: Window function.
    :type window_function: str
    :param center: Center the frame for FFT.
    :type center: bool
    :return: Log mel-bands energies of shape=(t, nb_mels)
    :rtype: numpy.ndarray, numpy.float
    """
    y = audio_data/abs(audio_data).max()
    mel_bands = melspectrogram(
        y=y, sr=sr, n_fft=nb_fft, hop_length=hop_size, win_length=nb_fft,
        window=window_function, center=center, power=power, n_mels=nb_mels,
        fmin=f_min, fmax=f_max, htk=htk, norm=norm).T

    return np.log(mel_bands + np.finfo(float).eps)


def from_mel_to_audio(mel_data: np.ndarray,
                       sr: int,
                       nb_fft: int,
                       hop_size: int,
                       nb_mels: int,
                       f_min: float,
                       f_max: float,
                       htk: bool,
                       power: float,
                       norm: bool,
                       window_function: str,
                       center: bool)\
        -> (np.ndarray, np.float):
    """Feature extraction inverse function.
    :param audio_data: Audio signal.
    :type audio_data: numpy.ndarray
    :param sr: Sampling frequency.
    :type sr: int
    :param nb_fft: Amount of FFT points.
    :type nb_fft: int
    :param hop_size: Hop size in samples.
    :type hop_size: int
    :param nb_mels: Amount of MEL bands.
    :type nb_mels: int
    :param f_min: Minimum frequency in Hertz for MEL band calculation.
    :type f_min: float
    :param f_max: Maximum frequency in Hertz for MEL band calculation.
    :type f_max: float|None
    :param htk: Use the HTK Toolbox formula instead of Auditory toolkit.
    :type htk: bool
    :param power: Power of the magnitude.
    :type power: float
    :param norm: Area normalization of MEL filters.
    :type norm: bool
    :param window_function: Window function.
    :type window_function: str
    :param center: Center the frame for FFT.
    :type center: bool
    :return: audio data
    :rtype: numpy.ndarray
    """

    y = np.exp(mel_data) - np.finfo(float).eps
    audio_data = mel_to_audio(
        M=y.T, sr=sr, n_fft=nb_fft, hop_length=hop_size, win_length=nb_fft,
        window=window_function, center=center, power=power,
        fmin=f_min, fmax=f_max, htk=htk, norm=norm)

    return audio_data


def from_mel_to_stft(mel_data: np.ndarray,
                       sr: int,
                       nb_fft: int,
                       hop_size: int,
                       nb_mels: int,
                       f_min: float,
                       f_max: float,
                       htk: bool,
                       power: float,
                       norm: bool,
                       window_function: str,
                       center: bool)\
        -> (np.ndarray, np.float):
    """From logmelspectrogram to stft.
    :param audio_data: Audio signal.
    :type audio_data: numpy.ndarray
    :param sr: Sampling frequency.
    :type sr: int
    :param nb_fft: Amount of FFT points.
    :type nb_fft: int
    :param hop_size: Hop size in samples.
    :type hop_size: int
    :param nb_mels: Amount of MEL bands.
    :type nb_mels: int
    :param f_min: Minimum frequency in Hertz for MEL band calculation.
    :type f_min: float
    :param f_max: Maximum frequency in Hertz for MEL band calculation.
    :type f_max: float|None
    :param htk: Use the HTK Toolbox formula instead of Auditory toolkit.
    :type htk: bool
    :param power: Power of the magnitude.
    :type power: float
    :param norm: Area normalization of MEL filters.
    :type norm: bool
    :param window_function: Window function.
    :type window_function: str
    :param center: Center the frame for FFT.
    :type center: bool
    :return: audio data
    :rtype: numpy.ndarray
    """

    y = np.exp(mel_data) - np.finfo(float).eps
    stft = mel_to_stft(
        M=y.T, sr=sr, n_fft=nb_fft, hop_length=hop_size, win_length=nb_fft,
        window=window_function, center=center, power=power,
        fmin=f_min, fmax=f_max, htk=htk, norm=norm)

    return stft

In [2]:
import numpy as np
import random
#from tools.features_log_mel_bands import feature_extraction, from_mel_to_audio, from_mel_to_stft
from pathlib import Path
import pysndfx
import gc

import copy

#from tools.file_io import load_audio_file
import torch


__author__ = 'Nikita Kuzmin -- Lomonosov Moscow State University'

class MixUp:

    def __init__(self, p, settings_features, simple_concat_captions=True,
                 sample_audio=False):

        self.p = p
        self.sample_audio = sample_audio
        self.settings_features = settings_features
        self.simple_concat_captions = simple_concat_captions

    def from_mel(self, mel):
        return 700 * (10 ** (mel / 2595.0) - 1)

    def to_mel(self, hertz):
        return 2595.0 * np.log10(1 + hertz / 700.0)

    def mix_audio(self, first_audio, second_audio):

        a = np.random.uniform(0.4, 0.6)

        shorter, longer = first_audio, second_audio

        if shorter.shape[0] == longer.shape[0]:
            if self.sample_audio:
                return (longer + shorter) / 2.0
            else:
                longer = from_mel_to_audio(longer, **self.settings_features['process']) * a
                shorter = from_mel_to_audio(shorter,
                                            **self.settings_features['process'])
                return feature_extraction((longer + shorter) / 2, **self.settings_features['process'])

        if first_audio.shape[0] > second_audio.shape[0]:
            shorter, longer = longer, shorter


        if self.sample_audio:
            start = random.randint(0, longer.shape[0] - 1 - shorter.shape[0])
            end = start + shorter.shape[0]
            longer *= a
            longer[start:end] += shorter * (1 - a)
        else:
            longer = from_mel_to_audio(longer, **self.settings_features['process']) * a
            shorter = from_mel_to_audio(shorter,
                                        **self.settings_features['process'])
            start = random.randint(0, longer.shape[0] - 1 - shorter.shape[0])
            end = start + shorter.shape[0]
            longer[start:end] += shorter * (1 - a)
            longer = feature_extraction(longer,
                                        **self.settings_features['process'])

        return longer

    def mix_labels(self, first_labels, second_labels):
        if self.simple_concat_captions:
            return np.hstack([first_labels[:-1], second_labels[1:]])
        else:

            first_token = first_labels[0]
            last_token = first_labels[-1]
            first_labels = first_labels[1:-1]
            second_labels = second_labels[1:-1]
            res = np.empty((first_labels.size + second_labels.size,),
                           dtype=first_labels.dtype)
            min_size = min(first_labels.size, second_labels.size)
            res[0:2*min_size:2] = first_labels[:min_size]
            res[1:2*min_size:2] = second_labels[:min_size]
            if first_labels.size > second_labels.size:
                res[min_size * 2:] = first_labels[min_size:]
            elif second_labels.size > first_labels.size:
                res[min_size*2:] = second_labels[min_size:]
            res = np.concatenate(([first_token], res))
            res = np.concatenate((res, [last_token]))
            return res

    def mix_audio_and_labels(self,
                             first_audio, second_audio,
                             first_labels, second_labels):
        mixed_audio = self.mix_audio(first_audio, second_audio)
        mixed_labels = self.mix_labels(first_labels, second_labels)

        return mixed_audio, mixed_labels

    def __call__(self, dataset, inputs):
        resulted_audio, resulted_labels, filename = inputs[0], inputs[1], inputs[2]
        if np.random.uniform() <= self.p:
            random_sample = dataset.random_sample(sample_audio=self.sample_audio)
            resulted_audio, resulted_labels = self.mix_audio_and_labels(
                resulted_audio, random_sample[0],
                resulted_labels, random_sample[1]
            )
        return resulted_audio, resulted_labels


class AudioAugmentation:
    # https://github.com/ex4sperans/freesound-classification
    def __init__(self, p):

        self.p = p
        self.effects_chain = (
            pysndfx.AudioEffectsChain()
                .reverb(
                reverberance=random.randrange(50),
                room_scale=random.randrange(50),
                stereo_depth=random.randrange(50)
            )
                .pitch(shift=random.randrange(-300, 300))
                .overdrive(gain=random.randrange(2, 10))
                .speed(random.uniform(0.9, 1.1))
        )

    def __call__(self, dataset, inputs):

        resulted_audio = inputs[0]
        captions = inputs[1]
        del inputs
        gc.collect()
        if np.random.uniform() < self.p:
            resulted_audio = torch.from_numpy(self.effects_chain(resulted_audio.numpy()))
        return resulted_audio, captions

In [4]:
!pip install pysndfx



In [17]:
from typing import List, Tuple
from pathlib import Path
import random

from torch.utils.data import Dataset
import torch
import torchaudio
from numpy import load as np_load, ndarray

import numpy as np

from pympler import muppy, summary
import pandas as pd


__author__ = 'Konstantinos Drossos -- Tampere University, Nikita Kuzmin -- Lomonosov Moscow State University'
__docformat__ = 'reStructuredText'
__all__ = ['ClothoDataset']


class ClothoDataset(Dataset):

    def __init__(self,
                 data_dir: Path,
                 split: str,
                 input_field_name: str,
                 output_field_name: str,
                 load_into_memory: bool,
                 settings_audio,
                 settings_features,
                 online_preprocessing=True,
                 transforms=None) \
            -> None:
        """Initialization of a Clotho dataset object.
        :param data_dir: Data directory with Clotho dataset files.
        :type data_dir: pathlib.Path
        :param split: The split to use (`development`, `validation`)
        :type split: str
        :param input_field_name: Field name for the input values
        :type input_field_name: str
        :param output_field_name: Field name for the output (target) values.
        :type output_field_name: str
        :param load_into_memory: Load the dataset into memory?
        :type load_into_memory: bool
        :param settings_audio: Settings about audio loading
        :type dict
        :param settings_features: Settings about audio processing
        :type dict
        :param indexes: Indexes of files, which depends on validation strategy
        :type indexes: numpy array
        :param transforms: List of transforms
        :type transforms: list
        """

        super(ClothoDataset, self).__init__()
        self.online_preprocessing = online_preprocessing
        the_dir: Path = data_dir.joinpath(split)
        self.split = split

        self.settings_audio = settings_audio
        self.settings_features = settings_features

        #if indexes is None:
        self.examples: List[Path] = sorted(the_dir.iterdir())
        #else:
        #    self.examples: List[Path] = list(np.array(sorted(the_dir.iterdir()))[indexes])
        self.input_name: str = input_field_name
        self.output_name: str = output_field_name
        self.load_into_memory: bool = load_into_memory
        self.transforms = transforms
        self.resampler = torchaudio.transforms.Resample(orig_freq=settings_features['process']['sr'],
                                                        new_freq=settings_features['process']['sr_resample'])
        if load_into_memory:
            self.examples: List[ndarray] = [
                np_load(str(f), allow_pickle=True)
                for f in self.examples]
        self.cnt = 0

    def __len__(self) \
            -> int:
        """Gets the amount of examples in the dataset.
        :return: Amount of examples in the dataset.
        :rtype: int
        """
        return len(self.examples)

    def __getitem__(self,
                    item: int) \
            -> Tuple[ndarray, ndarray, Path]:
        """Gets an example from the dataset.
        :param item: Index of the item.
        :type item: int
        :return: Input and output values, and the Path of the file.
        :rtype: numpy.ndarray. numpy.ndarray, Path
        """

        ex = self.examples[item]
        if not self.load_into_memory:
            ex = np_load(str(ex), allow_pickle=True)
        if self.online_preprocessing:
            in_e = torchaudio.load(Path('data', 'clotho_audio_files', self.split, ex.file_name[0]))[0][0]
            ou_e = ex[self.output_name].item()
        else:
            in_e, ou_e = [ex[i].item()
                          for i in [self.input_name, self.output_name]]
        filename = ex.file_name[0]
        del ex
        if self.transforms is not None:
            for transform in self.transforms:
                in_e, ou_e = transform(dataset=self, inputs=(in_e, ou_e, filename))
        return in_e, ou_e, filename

    def random_sample(self, sample_audio=False):
        """
        Sampling audio or melspectrogram and encoded output
        :return:
        """

        item = random.randint(0, len(self.examples) - 1)
        ex = self.examples[item]
        if not self.load_into_memory:
            ex = np_load(str(ex), allow_pickle=True)
        if sample_audio:
            thedir = Path('./data/clotho_audio_files/').joinpath(self.split)
            filename = Path(thedir, ex.file_name[0])
            in_e = torchaudio.load(filepath=filename)[0][0]
            #in_e = self.resampler.forward(in_e)
            ou_e = ex[self.output_name].item()
        else:
            in_e, ou_e = [ex[i].item()
                          for i in [self.input_name, self.output_name]]

        return in_e, ou_e

In [18]:
from typing import MutableSequence, MutableMapping, Union,\
    Tuple, List
from pathlib import Path

from torch.utils.data import DataLoader
from torch import cat, zeros, from_numpy, ones, Tensor
from numpy import ndarray

#from data_handlers._clotho import ClothoDataset
#from tools.augmentations import MixUp, AudioAugmentation


__author__ = 'Konstantinos Drossos -- Tampere University. Nikita Kuzmin -- Lomonosov Moscow State University'
__docformat__ = 'reStructuredText'
__all__ = ['get_clotho_loader']


def _clotho_collate_fn(batch: MutableSequence[ndarray]) \
        -> Tuple[Tensor, Tensor, List[str]]:
    """Pads data.
    For each batch, the maximum input and output\
    time-steps are calculated. Then, then input and\
    output data are padded to match the maximum time-steps.
    The input data are padded with zeros in front, and\
    the output with] <EOS> tokens at the end.
    :param batch: Batch data of batch x time x features.\
                  First element in the list are the input\
                  data, second the output data.
    :type batch: list[numpy.ndarray]
    :return: Padded data. First tensor is the input data\
             and second the output.
    :rtype: torch.Tensor, torch.Tensor, list[str]
    """
    max_input_t_steps = max([i[0].shape[0] for i in batch])
    max_output_t_steps = max([i[1].shape[0] for i in batch])

    file_names = [i[2] for i in batch]

    #input_features = batch[0][0].shape[-1]
    eos_token = batch[0][1][-1]
    input_tensor = cat([
        cat([zeros(
            max_input_t_steps - i[0].shape[0]).float(),
             i[0].float()]).unsqueeze(0) for i in batch])
    output_tensor = cat([
        cat([
            from_numpy(i[1]).long(),
            ones(max_output_t_steps - len(i[1])).mul(eos_token).long()
        ]).unsqueeze(0) for i in batch])
    return [input_tensor, output_tensor, file_names]


def get_clotho_loader(split: str,
                      is_training: bool,
                      settings_data: MutableMapping[
                          str, Union[str, bool, MutableMapping[str, str]]],
                      settings_io: MutableMapping[
                          str, Union[str, bool, MutableMapping[
                              str, Union[str, MutableMapping[str, str]]]]],
                      settings_features: MutableMapping[
                          str, Union[str, bool, MutableMapping[str, str]]],
                      settings_dataset: MutableMapping[
                          str, Union[str, bool, MutableMapping[str, str]]],
                      ) \
        -> DataLoader:
    """Gets the data loader.
    :param split: Split to be used.
    :type split: str
    :param is_training: Is training data?
    :type is_training: bool
    :param settings_data: Data loading and dataset settings.
    :type settings_data: dict
    :param settings_io: Files I/O settings.
    :type settings_io: dict
    :param settings_features: Audio preprocessing features.
    :type settings_features: dict
    :param settings_dataset: Dataset settings.
    :type settings_dataset: dict
    :param indexes: Indexes of audio files, which depends on validation_strategy.
    :type indexes: numpy array
    :type settings_training: dict
    :return: Data loader.
    :rtype: torch.utils.data.DataLoader
    """
    data_dir = Path(
        settings_io['root_dirs']['data'],
        settings_io['dataset']['features_dirs']['output'])

    transforms = []
    if settings_data['transforms'] == 'None' or (not is_training):
        transforms = None
    else:
        if 'MixUp' in settings_data['transforms']:
            print(settings_features['simple_concat_captions'], 'lalalalalal')
            transforms.append(MixUp(p=settings_data['MixUp_p'],
                              settings_features=settings_features,
                              simple_concat_captions=settings_features['simple_concat_captions'],
                              sample_audio=True))
        if 'another' in settings_data['transforms']:
            transforms.append(AudioAugmentation(p=settings_data['MixUp_p']))

    #if settings_training['validation_strategy']
    dataset = ClothoDataset(
        data_dir=data_dir,
        split=split,
        input_field_name=settings_data['input_field_name'],
        output_field_name=settings_data['output_field_name'],
        load_into_memory=settings_data['load_into_memory'],
        settings_audio=settings_dataset['audio'],
        settings_features=settings_features,
        transforms=transforms)

    shuffle = settings_data['shuffle'] if is_training else False
    drop_last = settings_data['drop_last'] if is_training else False
    if is_training:
        return DataLoader(
            dataset=dataset,
            batch_size=settings_data['batch_size'],
            shuffle=shuffle,
            num_workers=settings_data['num_workers'],
            drop_last=drop_last,
            # pin_memory=True,
            collate_fn=_clotho_collate_fn)
    else:
        return DataLoader(
            dataset=dataset,
            batch_size=40,
            shuffle=shuffle,
            num_workers=2,
            drop_last=drop_last,
            # pin_memory=True,
            collate_fn=_clotho_collate_fn)

In [19]:
config_file='main_settings'
file_ext='yaml'
file_dir='settings' 
settings = file_io.load_yaml_file(Path(
        file_dir, f'{config_file}.{file_ext}'))

In [6]:
from tools.file_io import load_audio_file
from tools import file_io

In [21]:
training_data = get_clotho_loader(
            settings_io['dataset']['features_dirs']['development'],
            is_training=True,
            settings_data=settings_data,
            settings_io=settings_io,
            settings_features=settings_features,
            settings_dataset=settings_dataset)

True lalalalalal


In [23]:
len(training_data)

1525

In [24]:
 =  get_clotho_loader(
            settings_io['dataset']['features_dirs']['evaluation'],
            is_training=False,
            settings_data=settings_data,
            settings_io=settings_io,
            settings_features=settings_features,
            settings_dataset=settings_dataset)

In [25]:
len(evaluation_beam)

131

In [8]:
settings_io=settings['dirs_and_files']

In [17]:
settings_data['transforms']

['MixUp']

In [12]:
settings_io

{'root_dirs': {'outputs': 'outputs', 'data': 'data'},
 'dataset': {'development': 'development',
  'evaluation': 'evaluation',
  'features_dirs': {'output': 'data_splits_mel',
   'development': 'development',
   'evaluation': 'evaluation'},
  'audio_dirs': {'downloaded': 'clotho_audio_files',
   'output': 'data_splits_audio_mel',
   'development': 'development',
   'evaluation': 'evaluation'},
  'annotations_dir': 'clotho_csv_files',
  'pickle_files_dir': 'pickles',
  'files': {'np_file_name_template': 'clotho_file_{audio_file_name}_{caption_index}.npy',
   'words_list_file_name': 'words_list.p',
   'words_counter_file_name': 'words_frequencies.p',
   'characters_list_file_name': 'characters_list.p',
   'characters_frequencies_file_name': 'characters_frequencies.p'}},
 'model': {'model_dir': 'models',
  'checkpoint_model_name': 'dcase_model_baseline.pt',
  'pre_trained_model_name': 'dcase_model_baseline_pre_trained.pt'},
 'logging': {'logger_dir': 'logging',
  'caption_logger_file': 'c

In [9]:
settings_io['dataset']['features_dirs']['development']

'development'

In [10]:
settings_data=settings['dnn_training_settings']['data']

In [11]:
settings_data

{'input_field_name': 'features',
 'output_field_name': 'words_ind',
 'load_into_memory': False,
 'transforms': ['MixUp'],
 'MixUp_p': 0.5,
 'batch_size': 16,
 'shuffle': True,
 'num_workers': 4,
 'drop_last': True}

In [12]:
settings_features=settings['feature_extraction_settings']

In [13]:
settings_features

{'keep_raw_audio_data': False,
 'simple_concat_captions': True,
 'process': {'sr': 44100,
  'sr_resample': 16000,
  'nb_fft': 1024,
  'hop_size': 512,
  'nb_mels': 64,
  'window_function': 'hann',
  'center': True,
  'f_min': 0.0,
  'f_max': None,
  'htk': False,
  'power': 1.0,
  'norm': 1}}

In [14]:
settings_dataset=settings['dataset_creation_settings']

In [15]:
settings_dataset

{'workflow': {'create_dataset': True, 'validate_dataset': False},
 'annotations': {'development_file': 'clotho_captions_development.csv',
  'evaluation_file': 'clotho_captions_evaluation.csv',
  'audio_file_column': 'file_name',
  'captions_fields_prefix': 'caption_{}',
  'use_special_tokens': True,
  'nb_captions': 5,
  'keep_case': False,
  'remove_punctuation_words': True,
  'remove_punctuation_chars': True,
  'use_unique_words_per_caption': False,
  'use_unique_chars_per_caption': False},
 'audio': {'sr': 44100, 'to_mono': True, 'max_abs_value': 1.0}}

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from typing import Tuple, List, AnyStr, Union
from pathlib import Path

from numpy import ndarray, recarray
from torch.utils.data import Dataset
from numpy import load as np_load

import torch
import numpy as np
import os

__author__ = 'Konstantinos Drossos -- Tampere University'
__docformat__ = 'reStructuredText'
__all__ = ['ClothoDataset']


class ClothoDataset(Dataset):

    def __init__(self, data_dir: Path,
                 split: AnyStr,
                 input_field_name: AnyStr,
                 output_field_name: AnyStr,
                 load_into_memory: bool,
                 transforms=transforms) \
            -> None:
        """Initialization of a Clotho dataset object.

        :param data_dir: Directory with data.
        :type data_dir: pathlib.Path
        :param split: Split to use (i.e. 'development', 'evaluation')
        :type split: str
        :param input_field_name: Field name of the clotho data\
                                 to be used as input data to the\
                                 method.
        :type input_field_name: str
        :param output_field_name: Field name of the clotho data\
                                 to be used as output data to the\
                                 method.
        :type output_field_name: str
        :param load_into_memory: Load all data into memory?
        :type load_into_memory: bool
        """
        super(ClothoDataset, self).__init__()
        the_dir: Path = data_dir.joinpath(split)

        self.examples: List[Path] = sorted(the_dir.iterdir())
        self.input_name: str = input_field_name
        self.output_name: str = output_field_name
        self.load_into_memory: bool = load_into_memory
        self.transforms=transforms
        if load_into_memory:
            self.examples: List[recarray] = [np_load(str(f), allow_pickle=True)
                                             for f in self.examples]

    def __len__(self) \
            -> int:
        """Gets the amount of examples in the dataset.

        :return: Amount of examples in the dataset.
        :rtype: int
        """
        return len(self.examples)

    def __getitem__(self,
                    item: int) \
            -> Tuple[ndarray, ndarray]:
        """Gets an example from the dataset.

        :param item: Index of the item.
        :type item: int
        :return: Input and output values.
        :rtype: numpy.ndarray. numpy.ndarray
        """
        ex: Union[Path, recarray] = self.examples[item]
        if not self.load_into_memory:
            ex: recarray = np_load(str(ex), allow_pickle=True)

        in_e, ou_e = [ex[i].item() for i in [self.input_name, self.output_name]]

        return in_e, ou_e


class ClothoDatasetEval(Dataset):

    def __init__(self, data_dir: Path,
                 split: AnyStr,
                 input_field_name: AnyStr,
                 output_field_name: AnyStr,
                 load_into_memory: bool) \
            -> None:
        """Initialization of a Clotho dataset object.

        :param data_dir: Directory with data.
        :type data_dir: pathlib.Path
        :param split: Split to use (i.e. 'development', 'evaluation')
        :type split: str
        :param input_field_name: Field name of the clotho data\
                                 to be used as input data to the\
                                 method.
        :type input_field_name: str
        :param output_field_name: Field name of the clotho data\
                                 to be used as output data to the\
                                 method.
        :type output_field_name: str
        :param load_into_memory: Load all data into memory?
        :type load_into_memory: bool
        """
        super(ClothoDatasetEval, self).__init__()
        the_dir: Path = data_dir.joinpath(split)
        if split == 'evaluation':
            self.examples: List[Path] = sorted(the_dir.iterdir())[::5]  # changed
        else:
            self.examples: List[Path] = sorted(the_dir.iterdir())  # changed
        # self.examples: List[Path] = sorted(the_dir.iterdir())
        self.input_name: str = input_field_name
        self.output_name: str = output_field_name
        self.load_into_memory: bool = load_into_memory
        self.data_dir = the_dir

        if load_into_memory:
            self.examples: List[recarray] = [np_load(str(f), allow_pickle=True)
                                             for f in self.examples]

    def __len__(self) \
            -> int:
        """Gets the amount of examples in the dataset.

        :return: Amount of examples in the dataset.
        :rtype: int
        """
        return len(self.examples)

    def __getitem__(self,
                    item: int):
        """Gets an example from the dataset.

        :param item: Index of the item.
        :type item: int
        :return: Input and output values.
        :rtype: numpy.ndarray. numpy.ndarray
        """
        ex: Union[Path, recarray] = self.examples[item]
        if not self.load_into_memory:
            ex: recarray = np_load(str(ex), allow_pickle=True)

        in_e, ou_e = [ex[i].item() for i in [self.input_name, self.output_name]]

        all_ref = get_all_ref(ex['file_name'].item(), self.data_dir)

        filename = str(ex['file_name'].item())
        out_len = len(ou_e)
        return in_e, ou_e, all_ref, filename,out_len


def get_all_ref(filename, data_dir):
    filename = str(filename)
    # tgt = [np.load(d, allow_pickle=True).words_ind.tolist()
    tgt = [np.load(d, allow_pickle=True)['words_ind'].item().tolist()
           for d in [os.path.join(data_dir, 'clotho_file_{filename}.wav_{i}.npy'.
                                  format(filename=filename[:-4],  # 删除'.wav'
                                         i=i)) for i in range(5)]  # wav_0-wav_4
           ]
    return tgt
# EOF


In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from typing import Callable, Union, Tuple, AnyStr, Optional
from functools import partial
from pathlib import Path

from torch.utils.data.dataloader import DataLoader

from .clotho_dataset import ClothoDataset, ClothoDatasetEval
from .collate_fn import clotho_collate_fn, clotho_collate_fn_eval

__author__ = 'Konstantinos Drossos'
__docformat__ = 'reStructuredText'
__all__ = ['get_clotho_loader']


def get_clotho_loader(data_dir: Path,
                      split: str,
                      input_field_name: str,
                      output_field_name: str,
                      load_into_memory: bool,
                      batch_size: int,
                      nb_t_steps_pad: Union[AnyStr, Tuple[int, int]],
                      shuffle: Optional[bool] = True,
                      drop_last: Optional[bool] = True,
                      input_pad_at: Optional[str] = 'start',
                      output_pad_at: Optional[str] = 'end',
                      num_workers: Optional[int] = 1,
                      return_reference: Optional[bool] = False,
                      augment: Optional[bool] = False) \
        -> DataLoader:
    """Gets the clotho data loader.

    :param return_reference:
    :param data_dir: Directory with data.
    :type data_dir: pathlib.Path
    :param split: Split to use (i.e. 'development', 'evaluation')
    :type split: str
    :param input_field_name: Field name of the clotho data\
                             to be used as input data to the\
                             method.
    :type input_field_name: str
    :param output_field_name: Field name of the clotho data\
                             to be used as output data to the\
                             method.
    :type output_field_name: str
    :param load_into_memory: Load all data into memory?
    :type load_into_memory: bool
    :param batch_size: Batch size to use.
    :type batch_size: int
    :param nb_t_steps_pad: Number of time steps to\
                           pad/truncate to. Cab use\
                           'max', 'min', or exact number\
                           e.g. (1024, 10).
    :type nb_t_steps_pad: str|(int, int)
    :param shuffle: Shuffle examples? Defaults to True.
    :type shuffle: bool, optional
    :param drop_last: Drop the last examples if not making\
                      a batch of `batch_size`? Defaults to True.
    :type drop_last: bool, optional
    :param input_pad_at: Pad input at the start or\
                         at the end?
    :type input_pad_at: str
    :param output_pad_at: Pad output at the start or\
                          at the end?
    :type output_pad_at: str
    :param num_workers: Amount of workers, defaults to 1.
    :type num_workers: int, optional
    :return: Dataloader for Clotho data.
    :rtype: torch.utils.data.dataloader.DataLoader
    """
    if return_reference:
        dataset: ClothoDatasetEval = ClothoDatasetEval(
            data_dir=data_dir, split=split,
            input_field_name=input_field_name,
            output_field_name=output_field_name,
            load_into_memory=load_into_memory
            transforms=trans)

        collate_fn: Callable = partial(
            clotho_collate_fn_eval,
            nb_t_steps=nb_t_steps_pad,
            input_pad_at=input_pad_at,
            output_pad_at=output_pad_at, split=split, augment=augment)
    else:
        dataset: ClothoDataset = ClothoDataset(
            data_dir=data_dir, split=split,
            input_field_name=input_field_name,
            output_field_name=output_field_name,
            load_into_memory=load_into_memory)

        collate_fn: Callable = partial(
            clotho_collate_fn,
            nb_t_steps=nb_t_steps_pad,
            input_pad_at=input_pad_at,
            output_pad_at=output_pad_at)

    return DataLoader(
        dataset=dataset, batch_size=batch_size,
        shuffle=shuffle, num_workers=num_workers,
        drop_last=drop_last, collate_fn=collate_fn)

# EOF

