## 파이썬을 이용한 딥러닝-기말프로젝트
### AAC(Automated Audio Captioning)  

- input: 오디오 (예: 창문이 닫히는 소리, 태풍이 부는 소리 등 15초~30초 길이의 오디오 wav 파일)
- output: caption (문장 예: "창문이 닫히고 있다.", "태풍이 불고 있다." 등 5개의 캡션이 주어짐)  

최종 아웃풋은 1개의 캡션
### 모델 구조 

- 인코더-디코더   
- 인코더(resnet)-디코더(transformer)

In [1]:
import numpy as np
import time
import torch
import torch.nn as nn

resnet

In [2]:
def init_layer(layer):
    """Initialize a Linear or Convolutional layer.

    The weight is overwritten in place with Xavier-uniform values. If the
    layer carries a bias tensor, it is reset to zero; layers without a bias
    (attribute missing or ``None``) are left as they are.
    """
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias.data.fill_(0.)
            
def init_bn(bn):
    """Initialize a Batchnorm layer.

    Resets the affine parameters in place: shift (bias) to 0 and scale
    (weight) to 1.
    """
    for tensor, value in ((bn.bias, 0.), (bn.weight, 1.)):
        tensor.data.fill_(value)

In [3]:
class ConvBlock(nn.Module):
    """Two (3x3 conv -> batch norm -> ReLU) stages followed by 2-D pooling."""

    def __init__(self, in_channels, out_channels):
        """Build the two convolutions and their batch-norm layers.

        Args:
            in_channels: channel count of the block input.
            out_channels: channel count produced by both convolutions.
        """
        super(ConvBlock, self).__init__()

        # Both convolutions share the same geometry: 3x3 kernel, unit stride,
        # one pixel of padding and no bias term.
        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1),
                           padding=(1, 1), bias=False)
        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels=out_channels,
                               out_channels=out_channels, **conv_kwargs)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        """Re-initialise the convolutions first, then the batch-norm layers."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for bn in (self.bn1, self.bn2):
            init_bn(bn)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Apply both conv stages, then pool.

        Args:
            input: tensor fed to the first convolution.
            pool_size: kernel size handed to the pooling function.
            pool_type: 'avg', 'max', or 'avg+max' (sum of both poolings).

        Raises:
            Exception: if ``pool_type`` is none of the supported names.
        """
        feats = F.relu_(self.bn1(self.conv1(input)))
        feats = F.relu_(self.bn2(self.conv2(feats)))

        if pool_type == 'avg':
            return F.avg_pool2d(feats, kernel_size=pool_size)
        if pool_type == 'max':
            return F.max_pool2d(feats, kernel_size=pool_size)
        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(feats, kernel_size=pool_size)
            maxed = F.max_pool2d(feats, kernel_size=pool_size)
            return averaged + maxed
        raise Exception('Incorrect argument!')

In [4]:
class _ResNet(nn.Module):
    def __init__(self, block, layers, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(_ResNet, self).__init__()

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group

        self.layer1 = self._make_layer(block, 64, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            if stride == 1:
                downsample = nn.Sequential(
                    _resnet_conv1x1(self.inplanes, planes * block.expansion),
                    norm_layer(planes * block.expansion),
                )
                init_layer(downsample[0])
                init_bn(downsample[1])
            elif stride == 2:
                downsample = nn.Sequential(
                    nn.AvgPool2d(kernel_size=2), 
                    _resnet_conv1x1(self.inplanes, planes * block.expansion),
                    norm_layer(planes * block.expansion),
                )
                init_layer(downsample[1])
                init_bn(downsample[2])

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        return x


In [5]:
class _ResnetBottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(_ResnetBottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        self.stride = stride
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = _resnet_conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = _resnet_conv3x3(width, width)
        self.bn2 = norm_layer(width)
        self.conv3 = _resnet_conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

        self.init_weights()

    def init_weights(self):
        init_layer(self.conv1)
        init_bn(self.bn1)
        init_layer(self.conv2)
        init_bn(self.bn2)
        init_layer(self.conv3)
        init_bn(self.bn3)
        nn.init.constant_(self.bn3.weight, 0)

    def forward(self, x):
        identity = x

        if self.stride == 2:
            x = F.avg_pool2d(x, kernel_size=(2, 2))

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = F.dropout(out, p=0.1, training=self.training)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(identity)

        out += identity
        out = self.relu(out)

        return out


In [6]:
def _resnet_conv1x1(in_planes, out_planes):
    #1x1 convolution
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, bias=False)

In [7]:
def _resnet_conv3x3(in_planes, out_planes):
    #3x3 convolution with padding
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=1,
                     padding=1, groups=1, bias=False, dilation=1)

In [8]:
class ResNet54(nn.Module):
    """Convolutional feature extractor: ConvBlock -> bottleneck ResNet -> ConvBlock.

    The spectrogram / log-mel / SpecAugment front-end and the clip-level
    classification head of the original design are disabled in this version,
    so ``forward`` consumes precomputed features and returns the 4-D feature
    map. ``fc1`` and ``fc_audioset`` are still created and initialised but
    are not used by ``forward`` (presumably so checkpoint keys keep matching
    -- confirm before removing them).
    """

    def __init__(self, classes_num=527):
        """
        Args:
            classes_num: output size of the (unused) ``fc_audioset`` layer.
        """
        super(ResNet54, self).__init__()

        # Normalises over the last axis of the input (size 64), see forward().
        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)

        self.resnet = _ResNet(block=_ResnetBottleneck, layers=[3, 4, 6, 3],
                              zero_init_residual=True)

        self.conv_block_after1 = ConvBlock(in_channels=2048, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weights()

    def init_weights(self):
        """Initialise the input batch norm and the two linear layers."""
        init_bn(self.bn0)
        for linear in (self.fc1, self.fc_audioset):
            init_layer(linear)

    def forward(self, input, mixup_lambda=None):
        """Extract the convolutional feature map.

        Args:
            input: 3-D tensor whose last axis has size 64 (it is normalised
                by ``bn0``). The disabled front-end code labelled the axes
                (batch_size, time_steps, mel_bins).
            mixup_lambda: optional mixup coefficients, applied only in
                training mode.
                NOTE(review): ``do_mixup`` is not defined in the code visible
                here; passing a non-None value while training requires it to
                be available at module level.

        Returns:
            The 4-D output of ``conv_block_after1`` after dropout.
        """
        training = self.training

        # Add a channel axis, then move the size-64 axis into the channel
        # position for BatchNorm2d and move it back afterwards.
        x = self.bn0(input.unsqueeze(1).transpose(1, 3)).transpose(1, 3)

        # Mixup on the normalised features.
        if training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)

        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=training, inplace=True)

        x = F.avg_pool2d(self.resnet(x), kernel_size=(2, 2))
        x = F.dropout(x, p=0.2, training=training, inplace=True)

        # pool_size=(1, 1) keeps the spatial size unchanged.
        x = self.conv_block_after1(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=training, inplace=True)

        return x



구글의 audioset 으로 사전 학습된 가중치를 불러오기 위해 transfer learning 정의

In [9]:
class Transfer_ResNet54(nn.Module):
    """Wrapper that loads AudioSet-pretrained weights into a ResNet54 base
    and optionally freezes it."""

    # Checkpoint entries belonging to the spectrogram / log-mel front-end,
    # which this ResNet54 does not contain; they are dropped before loading.
    _FRONTEND_KEYS = (
        'spectrogram_extractor.stft.conv_imag.weight',
        'spectrogram_extractor.stft.conv_real.weight',
        'logmel_extractor.melW',
    )

    def __init__(self, freeze_base=None, pretrain_checkpoint=None):
        """Feature extractor for a new task using pretrained ResNet54 as a sub module.

        Args:
            freeze_base: if truthy, every parameter of the base gets
                ``requires_grad = False``.
            pretrain_checkpoint: path of the checkpoint to load into the
                base; nothing is loaded when falsy.
        """
        super(Transfer_ResNet54, self).__init__()

        audioset_classes_num = 527
        self.base = ResNet54(audioset_classes_num)

        if pretrain_checkpoint:
            self.load_from_pretrain(pretrain_checkpoint)

        # The AudioSet classification layer is not replaced here because the
        # multi-class classification step is omitted.

        if freeze_base:
            # Stage 2: frozen / stage 3: not frozen.
            # Freeze AudioSet pretrained layers
            for param in self.base.parameters():
                param.requires_grad = False

    def load_from_pretrain(self, pretrained_checkpoint):
        """Load base weights from ``pretrained_checkpoint``.

        Args:
            pretrained_checkpoint: path (or file-like object) accepted by
                ``torch.load``; the loaded object must hold the state dict
                under the ``'model'`` key.
        """
        # Load onto the CPU so a GPU-saved checkpoint also opens on machines
        # without CUDA; load_state_dict copies values into the existing
        # parameters, so the model's own device is unaffected.
        checkpoint = torch.load(pretrained_checkpoint, map_location='cpu')
        state = checkpoint['model']
        for key in self._FRONTEND_KEYS:
            # Tolerate checkpoints that were already stripped of these keys.
            state.pop(key, None)
        self.base.load_state_dict(state)

    def forward(self, input):
        """Return the base network's output for ``input`` unchanged."""
        return self.base(input)
 

In [10]:
# Instantiate ResNet54 with default arguments to inspect its structure.
model=ResNet54()

In [11]:
model

ResNet54(
  (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_block1): ConvBlock(
    (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (resnet): _ResNet(
    (layer1): Sequential(
      (0): _ResnetBottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1

In [12]:
# Dump every parameter of `model`: name, Python type, shape and whether it
# is trainable, followed by a separator line.
for name, param in model.named_parameters():
    print(f'name:{name}',
          type(param),
          f'param.shape:{param.shape}',
          f'param.requries_grad:{param.requires_grad}',
          '=====',
          sep='\n')


name:bn0.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:True
=====
name:bn0.bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:True
=====
name:conv_block1.conv1.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64, 1, 3, 3])
param.requries_grad:True
=====
name:conv_block1.conv2.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64, 64, 3, 3])
param.requries_grad:True
=====
name:conv_block1.bn1.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:True
=====
name:conv_block1.bn1.bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:True
=====
name:conv_block1.bn2.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:True
=====
name:conv_block1.bn2.bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:Tr

In [13]:
# Transfer learning: build the wrapper with the pretrained checkpoint and
# freeze the loaded weights (freeze_base=True).
model_trans=Transfer_ResNet54(freeze_base=True,pretrain_checkpoint="/home/hj20/dcase_2020_T6/models/ResNet54_mAP=0.429.pth")

In [14]:
# Dump every parameter of `model_trans`; the requires_grad line should read
# False for each one, confirming that the base is frozen.
for name, param in model_trans.named_parameters():
    print(f'name:{name}',
          type(param),
          f'param.shape:{param.shape}',
          f'param.requries_grad:{param.requires_grad}',
          '=====',
          sep='\n')


name:base.bn0.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:False
=====
name:base.bn0.bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:False
=====
name:base.conv_block1.conv1.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64, 1, 3, 3])
param.requries_grad:False
=====
name:base.conv_block1.conv2.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64, 64, 3, 3])
param.requries_grad:False
=====
name:base.conv_block1.bn1.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:False
=====
name:base.conv_block1.bn1.bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:False
=====
name:base.conv_block1.bn2.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([64])
param.requries_grad:False
=====
name:base.conv_block1.bn2.bias
<class 'torch.nn.parameter.Parameter'>
para

디코더로 transformer 모델 사용(파이토치 튜토리얼 차용)  
출처: https://tutorials.pytorch.kr/beginner/transformer_tutorial.html
 

In [15]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.transformer import TransformerDecoder,TransformerDecoderLayer

from hparams import hparams as hp
from encoder import Cnn14,Transfer_Cnn14,init_layer


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first tensor."""

    def __init__(self, d_model, dropout=0.1, max_len=100):
        """
        Args:
            d_model: size of the last (feature) axis; even or odd.
            dropout: dropout probability applied after the encoding is added.
            max_len: number of positions precomputed; ``forward`` inputs must
                not be longer than this along axis 0.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so the surplus angle column is dropped.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:len(x)])``; axis 0 of ``x`` is the position axis."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Audio captioning model: Transfer_ResNet54 encoder + Transformer decoder."""

    # Checkpoint used when ``pretrain_cnn`` is None, so callers that never
    # passed a path still get the encoder checkpoint they got before.
    DEFAULT_PRETRAIN_CNN = "/home/hj20/dcase_2020_T6/models/ResNet54_mAP=0.429.pth"

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, batch_size, dropout=0.5,pretrain_cnn=None,
                 pretrain_emb=None,freeze_cnn=True):
        """
        Args:
            ntoken: vocabulary size (embedding rows and output logits).
            ninp: stored on the instance; not otherwise used in this class.
            nhead: number of attention heads per decoder layer.
            nhid: decoder model width (embedding size, d_model).
            nlayers: number of decoder layers.
            batch_size: stored on the instance; not otherwise used here.
            dropout: dropout probability for decoder, embedding and
                positional encoding.
            pretrain_cnn: encoder checkpoint path. ``None`` falls back to
                ``DEFAULT_PRETRAIN_CNN``; a falsy non-None value such as ``''``
                makes Transfer_ResNet54 skip checkpoint loading.
            pretrain_emb: optional tensor assigned as the embedding weight.
            freeze_cnn: forwarded to Transfer_ResNet54 as ``freeze_base``.
        """
        super(TransformerModel, self).__init__()

        self.model_type = 'cnn+transformer'
        decoder_layers = TransformerDecoderLayer(d_model=nhid, nhead=nhead, dropout=dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layers, nlayers)
        self.word_emb = nn.Embedding(ntoken, nhid)
        self.ninp = ninp
        self.nhid = nhid
        # Projection of the 2048-channel encoder features down to nhid.
        self.fc = nn.Linear(2048, 2048, bias=True)
        self.fc1 = nn.Linear(2048, nhid, bias=True)
        self.dec_fc = nn.Linear(nhid, ntoken)
        self.batch_size = batch_size
        self.ntoken = ntoken

        if pretrain_cnn is None:
            pretrain_cnn = self.DEFAULT_PRETRAIN_CNN

        self.encoder = Transfer_ResNet54(freeze_base=freeze_cnn, pretrain_checkpoint=pretrain_cnn)
        self.dropout = nn.Dropout(dropout)
        self.pos_encoder = PositionalEncoding(nhid, dropout)
        # Not applied in forward(); decode() returns raw logits.
        self.generator = nn.Softmax(dim=-1)
        self.init_weights()

        if pretrain_emb is not None:
            self.word_emb.weight.data = pretrain_emb

    def generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask: 0.0 on and below the diagonal, -inf above."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        """Initialise projection layers, embedding and output layer."""
        initrange = 0.1
        init_layer(self.fc1)
        init_layer(self.fc)
        self.word_emb.weight.data.uniform_(-initrange, initrange)
        self.dec_fc.bias.data.zero_()
        self.dec_fc.weight.data.uniform_(-initrange, initrange)

    def encode(self, src, input_mask=None):
        """Turn the input features into decoder memory.

        The encoder output must be 4-D with 2048 channels on axis 1. Axis 3
        is averaged away and the result is permuted so that axis 2 of the
        encoder output becomes the leading (sequence) axis.

        Args:
            src: encoder input.
            input_mask: accepted for interface symmetry; unused here.

        Returns:
            Tensor of shape (sequence, batch, nhid).
        """
        # Kept in a local variable: a module-level global would pin the last
        # batch's activations (and their autograd graph) in memory.
        feats = self.encoder(src)
        feats = torch.mean(feats, dim=3)
        feats = feats.permute(2, 0, 1)
        feats = F.relu_(self.fc(feats))
        feats = F.dropout(feats, p=0.2, training=self.training)
        feats = torch.relu(self.fc1(feats))
        return feats

    def decode(self, mem, tgt, input_mask=None, target_mask=None, target_padding_mask=None):
        """Run the Transformer decoder and project to vocabulary logits.

        Args:
            mem: memory from ``encode``, (T_mem, batch_size, nhid).
            tgt: token indices, (batch_size, T_out).
            input_mask: passed to the decoder as ``memory_mask``.
            target_mask: causal mask; regenerated when missing or when its
                size does not match the target length.
            target_padding_mask: passed as ``tgt_key_padding_mask``.

        Returns:
            Logits of shape (T_out, batch_size, ntoken).
        """
        tgt = tgt.transpose(0, 1)  # (T_out,batch_size)
        if target_mask is None or target_mask.size(0) != len(tgt):
            device = tgt.device
            target_mask = self.generate_square_subsequent_mask(len(tgt)).to(device)

        tgt = self.dropout(self.word_emb(tgt)) * math.sqrt(self.nhid)
        tgt = self.pos_encoder(tgt)
        # Positional encoding is applied to the target only, not to mem.
        output = self.transformer_decoder(tgt, mem, memory_mask=input_mask, tgt_mask=target_mask,
                                          tgt_key_padding_mask=target_padding_mask)
        output = self.dec_fc(output)
        return output

    def forward(self, src, tgt, input_mask=None, target_mask=None, target_padding_mask=None):
        """Encode ``src`` and decode ``tgt`` against it; returns the logits of ``decode``."""
        mem = self.encode(src)
        output = self.decode(mem, tgt, input_mask=input_mask, target_mask=target_mask,
                             target_padding_mask=target_padding_mask)
        return output


In [16]:
import torch
import torch.nn as nn
import time

from data_handling import get_clotho_loader, get_test_data_loader
#from model import TransformerModel  # , RNNModel, RNNModelSmall
import itertools
import numpy as np
import os
import sys
import logging
import csv

from util import get_file_list, get_padding, print_hparams, greedy_decode, \
    calculate_bleu, calculate_spider, LabelSmoothingLoss, beam_search, align_word_embedding, gen_str
from hparams import hparams
from torch.utils.tensorboard import SummaryWriter

import argparse

# Instantiate the hyper-parameter object used by the rest of the notebook.
hp = hparams()
parser = argparse.ArgumentParser(description='hparams for model')

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Seed NumPy and PyTorch from the same configured value.
np.random.seed(hp.seed)
torch.manual_seed(hp.seed)

<torch._C.Generator at 0x7ff6d4036960>

In [17]:
# Pretrained word embedding for the decoder: built by align_word_embedding
# when hp.load_pretrain_emb is truthy, otherwise None.
pretrain_emb = align_word_embedding(hp.word_dict_pickle_path, hp.pretrain_emb_path, hp.ntoken,
                                        hp.nhid) if hp.load_pretrain_emb else None

In [18]:
# Build the captioning model and move it to `device`. The positional 8 is the
# constructor's batch_size argument; the encoder is created frozen
# (freeze_cnn=True) and dropout is set to 0.2.
model = TransformerModel(hp.ntoken, hp.ninp, hp.nhead, hp.nhid, hp.nlayers,8, dropout=0.2,
                             pretrain_cnn="/home/hj20/dcase_2020_T6/models/ResNet54_mAP=0.429.pth", pretrain_emb=pretrain_emb, freeze_cnn=True).to(device)

In [19]:
model

TransformerModel(
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (linear1): Linear(in_features=192, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=192, bias=True)
        (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
        (dropout3): Dropout(p=0.2, inplace=False)
      )
  

In [20]:
# Check the model parameters after freezing: only the encoder's parameters
# should report requires_grad False.
for name, param in model.named_parameters():
    print(f'name:{name}',
          type(param),
          f'param.shape:{param.shape}',
          f'param.requries_grad:{param.requires_grad}',
          '=====',
          sep='\n')

name:transformer_decoder.layers.0.self_attn.in_proj_weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576, 192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.self_attn.in_proj_bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.self_attn.out_proj.weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([192, 192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.self_attn.out_proj.bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.multihead_attn.in_proj_weight
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576, 192])
param.requries_grad:True
=====
name:transformer_decoder.layers.0.multihead_attn.in_proj_bias
<class 'torch.nn.parameter.Parameter'>
param.shape:torch.Size([576])
param.requries_grad:True
=====
name:transformer_decoder.

In [21]:
# Adam over the trainable (requires_grad=True) parameters only, so frozen
# parameters are not handed to the optimizer; the learning rate is decayed
# exponentially with factor hp.scheduler_decay.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=hp.lr, weight_decay=1e-6)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, hp.scheduler_decay)

In [24]:
# Alternative setup when using SWA (stochastic weight averaging) from
# torchcontrib. Running this cell rebinds `optimizer` and `scheduler`.
from torchcontrib.optim import SWA
import torchcontrib

# Same Adam configuration as the plain optimizer, wrapped by SWA.
base_opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=hp.lr, weight_decay=1e-6)
optimizer = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.0001)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, hp.scheduler_decay)

In [23]:
!pip install torchcontrib

Collecting torchcontrib
  Using cached torchcontrib-0.0.2-py3-none-any.whl
Installing collected packages: torchcontrib
Successfully installed torchcontrib-0.0.2


In [25]:
# Set the data paths: copy them from the hyper-parameter object into
# module-level names.
data_dir = hp.data_dir
eval_data_dir = hp.eval_data_dir
train_data_dir = hp.train_data_dir
word_dict_pickle_path = hp.word_dict_pickle_path
word_freq_pickle_path = hp.word_freq_pickle_path
test_data_dir = hp.test_data_dir

In [26]:
# Training loader over the 'development' split: batches of 8, references
# returned alongside each batch, augmentation controlled by
# hp.spec_augmentation.
training_data = get_clotho_loader(data_dir=data_dir, split='development',
                                      input_field_name='features',
                                      output_field_name='words_ind',
                                      load_into_memory=False,
                                      batch_size=8,
                                      nb_t_steps_pad='max',
                                      num_workers=4, return_reference=True, augment=hp.spec_augmentation)

In [27]:
# Evaluation loader over the 'evaluation' split: batches of 32, unshuffled,
# with references. Despite the name it feeds both the greedy and the
# beam-search evaluation in the training loop.
evaluation_beam = get_clotho_loader(data_dir=data_dir, split='evaluation',
                                        input_field_name='features',
                                        output_field_name='words_ind',
                                        load_into_memory=False,
                                        batch_size=32,
                                        nb_t_steps_pad='max',
                                        shuffle=False,
                                        return_reference=True)

In [29]:
# Test loader: unshuffled, keeps the last partial batch (drop_last=False) and
# uses twice the configured batch size.
test_data = get_test_data_loader(data_dir=test_data_dir,
                                     batch_size=hp.batch_size * 2,
                                     nb_t_steps_pad='max',
                                     shuffle=False,
                                     drop_last=False,
                                     input_pad_at='start',
                                     num_workers=8)

In [30]:
def train():
    """Run one training epoch over ``training_data``.

    Relies on module-level state: ``model``, ``training_data``, ``device``,
    ``optimizer``, ``criterion``, ``writer``, ``epoch`` and ``hp``. Each batch
    is trained with teacher forcing (decoder input is the target without its
    last token, the label is the target without its first token). The batch
    loss is written to TensorBoard every step, and a summary line is logged
    every ``hp.log_interval`` batches.
    """
    model.train()
    running_loss = 0.
    window_start = time.time()

    for batch_idx, (src, tgt, tgt_len, ref) in enumerate(training_data):
        src, tgt = src.to(device), tgt.to(device)

        # Shifted views of the target for teacher forcing; the padding mask
        # is trimmed to line up with the decoder input.
        pad_mask = get_padding(tgt, tgt_len)[:, :-1]
        decoder_in = tgt[:, :-1]
        decoder_target = tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_in, target_padding_mask=pad_mask)

        # The model output is time-major, so the labels are transposed before
        # both are flattened.
        flat_logits = logits.contiguous().view(-1, hp.ntoken)
        flat_labels = decoder_target.transpose(0, 1).contiguous().view(-1)
        loss_text = criterion(flat_logits, flat_labels)
        loss_text.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), hp.clip_grad)
        optimizer.step()

        batch_loss = loss_text.item()
        running_loss += batch_loss
        writer.add_scalar('Loss/train-text', batch_loss,
                          (epoch - 1) * len(training_data) + batch_idx)

        batches_done = batch_idx + 1
        if batches_done % hp.log_interval != 0:
            continue

        # Periodic progress line, then reset the running window.
        mean_text_loss = running_loss / hp.log_interval
        elapsed = time.time() - window_start
        current_lr = optimizer.param_groups[0]['lr']
        logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                     'loss-text {:5.4f}'.format(
            epoch, batches_done, len(training_data), current_lr,
            elapsed * 1000 / hp.log_interval, mean_text_loss))
        running_loss = 0
        window_start = time.time()

    # NOTE: the SWA weight swap (optimizer.swap_swa_sgd()) is not performed here.

def eval_all(evaluation_data, max_len=30, eos_ind=9, word_dict_pickle_path=None):
    """Score greedy decoding on ``evaluation_data`` with SPIDEr.

    Uses the module-level ``model``, ``device``, ``writer`` and ``epoch``.
    The score is written to TensorBoard and logged; nothing is returned.

    Args:
        evaluation_data: iterable yielding (src, tgt, _, ref) batches.
        max_len: maximum decoding length passed to ``greedy_decode``.
        eos_ind: token index at which a decoded sentence is cut.
        word_dict_pickle_path: forwarded to ``calculate_spider``.
    """
    model.eval()
    with torch.no_grad():
        predicted = []
        references = []
        for src, tgt, _, ref in evaluation_data:
            decoded = greedy_decode(model, src.to(device), max_len=max_len)

            for row in decoded:
                # Position 0 is skipped (presumably the start symbol --
                # confirm against greedy_decode); keep tokens up to, but not
                # including, the first end-of-sentence index.
                kept = itertools.takewhile(lambda sym: not (sym == eos_ind), row[1:])
                predicted.append([sym.item() for sym in kept])
            references.extend(ref)

        spider, output_str, ref_str = calculate_spider(predicted, references, word_dict_pickle_path)

        writer.add_scalar(f'Loss/eval_greddy', spider, epoch)
        logging.info(f'eval_greddy SPIDEr: {spider:2.4f}')


def eval_with_beam(evaluation_data, max_len=30, eos_ind=9, word_dict_pickle_path=None, beam_size=3):
    """Score beam-search decoding on ``evaluation_data`` with SPIDEr.

    Uses the module-level ``model``, ``device``, ``writer`` and ``epoch``.
    The score is written to TensorBoard and logged; nothing is returned.

    Args:
        evaluation_data: iterable yielding (src, tgt, _, ref) batches.
        max_len: maximum decoding length passed to ``beam_search``.
        eos_ind: token index at which a decoded sentence is cut.
        word_dict_pickle_path: forwarded to ``calculate_spider``.
        beam_size: beam width passed to ``beam_search``.
    """
    model.eval()
    with torch.no_grad():
        predicted = []
        references = []
        for src, tgt, _, ref in evaluation_data:
            hypotheses = beam_search(model, src.to(device), max_len,
                                     start_symbol_ind=0, beam_size=beam_size)

            for hypothesis in hypotheses:
                # Keep tokens up to, but not including, the first
                # end-of-sentence index.
                kept = itertools.takewhile(lambda sym: not (sym == eos_ind), hypothesis)
                predicted.append([sym.item() for sym in kept])
            references.extend(ref)

        spider, output_str, ref_str = calculate_spider(predicted, references, word_dict_pickle_path)

        writer.add_scalar(f'Loss/eval_beam', spider, epoch)
        logging.info(f'eval_beam_{beam_size} SPIDEr: {spider:2.4f}')


def test_with_beam(test_data, max_len=30, eos_ind=9, beam_size=3):
    """Caption the test set with beam search and write ``test_out.csv``.

    Uses the module-level ``model``, ``device`` and ``hp``. The CSV has a
    header row (file_name, caption_predicted) followed by one row per clip,
    with ``.wav`` appended to each file name.

    Args:
        test_data: iterable yielding (src, filename) batches.
        max_len: maximum decoding length passed to ``beam_search``.
        eos_ind: token index at which a decoded sentence is cut.
        beam_size: beam width passed to ``beam_search``.
    """
    model.eval()

    with torch.no_grad():
        # newline='' lets the csv module control line endings itself, which
        # avoids blank rows on platforms that translate '\n' on write.
        with open("test_out.csv", "w", newline='') as f:
            # Named csv_writer so it does not shadow the module-level
            # TensorBoard `writer`.
            csv_writer = csv.writer(f)
            csv_writer.writerow(['file_name', 'caption_predicted'])
            for src, filename in test_data:
                src = src.to(device)
                output = beam_search(model, src, max_len, start_symbol_ind=0, beam_size=beam_size)

                output_sentence_ind_batch = []
                for single_sample in output:
                    output_sentence_ind = []
                    for sym in single_sample:
                        # Stop at the first end-of-sentence index.
                        if sym == eos_ind:
                            break
                        output_sentence_ind.append(sym.item())
                    output_sentence_ind_batch.append(output_sentence_ind)
                out_str = gen_str(output_sentence_ind_batch, hp.word_dict_pickle_path)
                for caption, fn in zip(out_str, filename):
                    csv_writer.writerow(['{}.wav'.format(fn), caption])


In [31]:
# Loss: label-smoothed loss when enabled, otherwise cross-entropy that
# ignores the last vocabulary index (hp.ntoken - 1).
if hp.label_smoothing:
    criterion = LabelSmoothingLoss(hp.ntoken, smoothing=0.1)
else:
    criterion = nn.CrossEntropyLoss(ignore_index=hp.ntoken - 1)

# Timestamp string of the current local time.
now_time = str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time())))
# Run directory shared by TensorBoard events, the log file and checkpoints.
log_dir = 'models/{name}'.format(name=hp.name)

# Constructed before the file handler below opens train.log in the same
# directory.
writer = SummaryWriter(log_dir=log_dir)

log_path = os.path.join(log_dir, 'train.log')

# Send DEBUG-and-above records both to train.log and to stdout.
logging.basicConfig(level=logging.DEBUG,
                        format=
                        '%(asctime)s - %(levelname)s: %(message)s',
                        handlers=[
                            logging.FileHandler(log_path),
                            logging.StreamHandler(sys.stdout)]
                        )


In [34]:
# Log the model architecture and the hyper-parameters before training.
logging.info(str(model))
logging.info(str(print_hparams(hp)))

# Report how much training data was loaded.
logging.info('Data loaded!')
logging.info('Data size: {}'.format(len(training_data)))

# Only parameters that require gradients are counted.
logging.info('Total Model parameters: {}'.format(
    sum(param.numel() for param in model.parameters() if param.requires_grad)))

2021-06-21 12:06:18,433 - INFO: TransformerModel(
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (linear1): Linear(in_features=192, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=192, bias=True)
        (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
        (dropout3): Dropout(

In [35]:
# Main training driver: one pass of train() per epoch, then checkpoint,
# scheduler update and evaluation (greedy plus beam widths 2, 3 and 4).
# `epoch` stays a module-level counter advanced by hand; the evaluation
# helpers appear to read it as a global (e.g. as the TensorBoard step), so
# its value after the loop is kept exactly as before.
epoch = 1
if hp.mode == 'train':
    while epoch < hp.training_epochs + 1:
        epoch_start_time = time.time()
        train()

        # Save one checkpoint per epoch, named after the epoch number.
        torch.save(
            model.state_dict(),
            '{log_dir}/{num_epoch}.pt'.format(log_dir=log_dir, num_epoch=epoch),
        )
        scheduler.step(epoch)

        eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)
        for _beam_width in (2, 3, 4):
            eval_with_beam(
                evaluation_beam,
                max_len=30,
                eos_ind=9,
                word_dict_pickle_path=word_dict_pickle_path,
                beam_size=_beam_width,
            )

        epoch += 1

2021-06-21 12:06:54,233 - INFO: | epoch   1 |   100/ 3051 batches | lr 1.00e-04 | ms/batch 246.76 | loss-text 6.2814
2021-06-21 12:07:19,165 - INFO: | epoch   1 |   200/ 3051 batches | lr 1.00e-04 | ms/batch 249.31 | loss-text 5.1971
2021-06-21 12:07:44,404 - INFO: | epoch   1 |   300/ 3051 batches | lr 1.00e-04 | ms/batch 252.38 | loss-text 4.9006
2021-06-21 12:08:09,216 - INFO: | epoch   1 |   400/ 3051 batches | lr 1.00e-04 | ms/batch 248.11 | loss-text 4.7772
2021-06-21 12:08:34,032 - INFO: | epoch   1 |   500/ 3051 batches | lr 1.00e-04 | ms/batch 248.15 | loss-text 4.6201
2021-06-21 12:08:59,295 - INFO: | epoch   1 |   600/ 3051 batches | lr 1.00e-04 | ms/batch 252.62 | loss-text 4.5796
2021-06-21 12:09:24,662 - INFO: | epoch   1 |   700/ 3051 batches | lr 1.00e-04 | ms/batch 253.66 | loss-text 4.5534
2021-06-21 12:09:49,912 - INFO: | epoch   1 |   800/ 3051 batches | lr 1.00e-04 | ms/batch 252.49 | loss-text 4.4171
2021-06-21 12:10:15,097 - INFO: | epoch   1 |   900/ 3051 batche



loading annotations into memory...
0:00:00.004194
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 12439, 'reflen': 11493, 'guess': [12439, 11415, 10391, 9367], 'correct': [4937, 1433, 455, 107]}
ratio: 1.0823109718958424
Bleu_1: 0.397
Bleu_2: 0.223
Bleu_3: 0.130
Bleu_4: 0.071
computing METEOR score...
METEOR: 0.124
computing Rouge score...
ROUGE_L: 0.300
computing CIDEr score...
CIDEr: 0.134
computing SPICE score...
SPICE: 0.075
computing SPIDEr score...
SPIDEr: 0.105
2021-06-21 12:20:05,224 - INFO: eval_greddy SPIDEr: 0.1045
loading annotations into memory...
0:00:00.004107
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8335, 'reflen': 9546, 'guess': [8335, 7311, 6287, 5263], 'correct': [4275, 1308,



loading annotations into memory...
0:00:00.004034
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10920, 'reflen': 10587, 'guess': [10920, 9896, 8872, 7848], 'correct': [4961, 1494, 488, 110]}
ratio: 1.0314536695946885
Bleu_1: 0.454
Bleu_2: 0.262
Bleu_3: 0.156
Bleu_4: 0.085
computing METEOR score...
METEOR: 0.130
computing Rouge score...
ROUGE_L: 0.318
computing CIDEr score...
CIDEr: 0.164
computing SPICE score...
SPICE: 0.083
computing SPIDEr score...
SPIDEr: 0.123
2021-06-21 12:36:09,505 - INFO: eval_greddy SPIDEr: 0.1235
loading annotations into memory...
0:00:00.004563
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8528, 'reflen': 9486, 'guess': [8528, 7504, 6480, 5456], 'correct': [4356, 1358, 4



loading annotations into memory...
0:00:00.004132
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 11549, 'reflen': 11158, 'guess': [11549, 10525, 9501, 8477], 'correct': [5550, 1702, 584, 142]}
ratio: 1.035042122244037
Bleu_1: 0.481
Bleu_2: 0.279
Bleu_3: 0.168
Bleu_4: 0.095
computing METEOR score...
METEOR: 0.142
computing Rouge score...
ROUGE_L: 0.335
computing CIDEr score...
CIDEr: 0.205
computing SPICE score...
SPICE: 0.094
computing SPIDEr score...
SPIDEr: 0.150
2021-06-21 12:52:18,781 - INFO: eval_greddy SPIDEr: 0.1497
loading annotations into memory...
0:00:00.003925
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8716, 'reflen': 9589, 'guess': [8716, 7692, 6668, 5644], 'correct': [4789, 1604, 6



loading annotations into memory...
0:00:00.004050
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10860, 'reflen': 10708, 'guess': [10860, 9836, 8812, 7788], 'correct': [5225, 1740, 627, 167]}
ratio: 1.014194994396618
Bleu_1: 0.481
Bleu_2: 0.292
Bleu_3: 0.182
Bleu_4: 0.107
computing METEOR score...
METEOR: 0.140
computing Rouge score...
ROUGE_L: 0.336
computing CIDEr score...
CIDEr: 0.214
computing SPICE score...
SPICE: 0.093
computing SPIDEr score...
SPIDEr: 0.153
2021-06-21 13:08:11,268 - INFO: eval_greddy SPIDEr: 0.1534
loading annotations into memory...
0:00:00.004251
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8916, 'reflen': 9742, 'guess': [8916, 7892, 6868, 5844], 'correct': [4909, 1772, 71



loading annotations into memory...
0:00:00.004055
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 11212, 'reflen': 10941, 'guess': [11212, 10188, 9164, 8140], 'correct': [5319, 1743, 628, 180]}
ratio: 1.0247692167077027
Bleu_1: 0.474
Bleu_2: 0.285
Bleu_3: 0.177
Bleu_4: 0.105
computing METEOR score...
METEOR: 0.144
computing Rouge score...
ROUGE_L: 0.337
computing CIDEr score...
CIDEr: 0.245
computing SPICE score...
SPICE: 0.098
computing SPIDEr score...
SPIDEr: 0.171
2021-06-21 13:23:48,529 - INFO: eval_greddy SPIDEr: 0.1712
loading annotations into memory...
0:00:00.004052
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8243, 'reflen': 9534, 'guess': [8243, 7219, 6195, 5171], 'correct': [4447, 1551, 



loading annotations into memory...
0:00:00.003785
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10196, 'reflen': 10356, 'guess': [10196, 9172, 8148, 7124], 'correct': [5313, 1852, 688, 198]}
ratio: 0.9845500193123807
Bleu_1: 0.513
Bleu_2: 0.319
Bleu_3: 0.204
Bleu_4: 0.123
computing METEOR score...
METEOR: 0.150
computing Rouge score...
ROUGE_L: 0.351
computing CIDEr score...
CIDEr: 0.267
computing SPICE score...
SPICE: 0.099
computing SPIDEr score...
SPIDEr: 0.183
2021-06-21 13:39:07,004 - INFO: eval_greddy SPIDEr: 0.1829
loading annotations into memory...
0:00:00.003937
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8384, 'reflen': 9430, 'guess': [8384, 7360, 6336, 5312], 'correct': [4711, 1705, 6



loading annotations into memory...
0:00:00.003678
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10192, 'reflen': 10377, 'guess': [10192, 9168, 8144, 7120], 'correct': [5294, 1812, 681, 215]}
ratio: 0.9821721114001173
Bleu_1: 0.510
Bleu_2: 0.315
Bleu_3: 0.201
Bleu_4: 0.125
computing METEOR score...
METEOR: 0.150
computing Rouge score...
ROUGE_L: 0.353
computing CIDEr score...
CIDEr: 0.268
computing SPICE score...
SPICE: 0.105
computing SPIDEr score...
SPIDEr: 0.186
2021-06-21 13:53:09,861 - INFO: eval_greddy SPIDEr: 0.1864
loading annotations into memory...
0:00:00.004020
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8148, 'reflen': 9459, 'guess': [8148, 7124, 6100, 5076], 'correct': [4618, 1620, 6



loading annotations into memory...
0:00:00.003796
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 11302, 'reflen': 11000, 'guess': [11302, 10278, 9254, 8230], 'correct': [5751, 1989, 743, 214]}
ratio: 1.027454545454452
Bleu_1: 0.509
Bleu_2: 0.314
Bleu_3: 0.199
Bleu_4: 0.120
computing METEOR score...
METEOR: 0.156
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.274
computing SPICE score...
SPICE: 0.103
computing SPIDEr score...
SPIDEr: 0.188
2021-06-21 14:07:16,594 - INFO: eval_greddy SPIDEr: 0.1881
loading annotations into memory...
0:00:00.003826
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8812, 'reflen': 9680, 'guess': [8812, 7788, 6764, 5740], 'correct': [4958, 1817, 7



loading annotations into memory...
0:00:00.003794
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9955, 'reflen': 10203, 'guess': [9955, 8931, 7907, 6883], 'correct': [5334, 1788, 657, 194]}
ratio: 0.9756934235027956
Bleu_1: 0.523
Bleu_2: 0.319
Bleu_3: 0.202
Bleu_4: 0.123
computing METEOR score...
METEOR: 0.151
computing Rouge score...
ROUGE_L: 0.353
computing CIDEr score...
CIDEr: 0.273
computing SPICE score...
SPICE: 0.101
computing SPIDEr score...
SPIDEr: 0.187
2021-06-21 14:21:23,641 - INFO: eval_greddy SPIDEr: 0.1872
loading annotations into memory...
0:00:00.004017
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8207, 'reflen': 9506, 'guess': [8207, 7183, 6159, 5135], 'correct': [4696, 1636, 651



loading annotations into memory...
0:00:00.003998
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10302, 'reflen': 10496, 'guess': [10302, 9278, 8254, 7230], 'correct': [5400, 1877, 685, 200]}
ratio: 0.9815167682925894
Bleu_1: 0.514
Bleu_2: 0.320
Bleu_3: 0.203
Bleu_4: 0.123
computing METEOR score...
METEOR: 0.154
computing Rouge score...
ROUGE_L: 0.358
computing CIDEr score...
CIDEr: 0.296
computing SPICE score...
SPICE: 0.102
computing SPIDEr score...
SPIDEr: 0.199
2021-06-21 14:36:13,872 - INFO: eval_greddy SPIDEr: 0.1988
loading annotations into memory...
0:00:00.004042
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8201, 'reflen': 9535, 'guess': [8201, 7177, 6153, 5129], 'correct': [4709, 1683, 6



loading annotations into memory...
0:00:00.003990
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 11148, 'reflen': 10908, 'guess': [11148, 10124, 9100, 8076], 'correct': [5724, 2005, 743, 220]}
ratio: 1.0220022002199283
Bleu_1: 0.513
Bleu_2: 0.319
Bleu_3: 0.202
Bleu_4: 0.123
computing METEOR score...
METEOR: 0.158
computing Rouge score...
ROUGE_L: 0.356
computing CIDEr score...
CIDEr: 0.290
computing SPICE score...
SPICE: 0.105
computing SPIDEr score...
SPIDEr: 0.197
2021-06-21 14:51:33,537 - INFO: eval_greddy SPIDEr: 0.1974
loading annotations into memory...
0:00:00.003963
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8436, 'reflen': 9580, 'guess': [8436, 7412, 6388, 5364], 'correct': [4880, 1786, 



loading annotations into memory...
0:00:00.004048
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10770, 'reflen': 10711, 'guess': [10770, 9746, 8722, 7698], 'correct': [5613, 1882, 704, 210]}
ratio: 1.005508355895714
Bleu_1: 0.521
Bleu_2: 0.317
Bleu_3: 0.201
Bleu_4: 0.122
computing METEOR score...
METEOR: 0.154
computing Rouge score...
ROUGE_L: 0.354
computing CIDEr score...
CIDEr: 0.278
computing SPICE score...
SPICE: 0.104
computing SPIDEr score...
SPIDEr: 0.191
2021-06-21 15:07:21,411 - INFO: eval_greddy SPIDEr: 0.1906
loading annotations into memory...
0:00:00.004083
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8754, 'reflen': 9630, 'guess': [8754, 7730, 6706, 5682], 'correct': [5060, 1868, 74



loading annotations into memory...
0:00:00.003922
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10243, 'reflen': 10390, 'guess': [10243, 9219, 8195, 7171], 'correct': [5493, 1971, 745, 215]}
ratio: 0.9858517805581342
Bleu_1: 0.529
Bleu_2: 0.334
Bleu_3: 0.215
Bleu_4: 0.131
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.358
computing CIDEr score...
CIDEr: 0.300
computing SPICE score...
SPICE: 0.105
computing SPIDEr score...
SPIDEr: 0.203
2021-06-21 15:23:17,478 - INFO: eval_greddy SPIDEr: 0.2027
loading annotations into memory...
0:00:00.003965
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8630, 'reflen': 9612, 'guess': [8630, 7606, 6582, 5558], 'correct': [5004, 1843, 7



loading annotations into memory...
0:00:00.004100
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10916, 'reflen': 10796, 'guess': [10916, 9892, 8868, 7844], 'correct': [5783, 2036, 768, 254]}
ratio: 1.0111152278620774
Bleu_1: 0.530
Bleu_2: 0.330
Bleu_3: 0.211
Bleu_4: 0.132
computing METEOR score...
METEOR: 0.159
computing Rouge score...
ROUGE_L: 0.360
computing CIDEr score...
CIDEr: 0.315
computing SPICE score...
SPICE: 0.105
computing SPIDEr score...
SPIDEr: 0.210
2021-06-21 15:39:31,081 - INFO: eval_greddy SPIDEr: 0.2099
loading annotations into memory...
0:00:00.004027
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9400, 'reflen': 9978, 'guess': [9400, 8376, 7352, 6328], 'correct': [5336, 1936, 7



loading annotations into memory...
0:00:00.004251
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10493, 'reflen': 10522, 'guess': [10493, 9469, 8445, 7421], 'correct': [5576, 1952, 713, 223]}
ratio: 0.9972438699865998
Bleu_1: 0.530
Bleu_2: 0.330
Bleu_3: 0.209
Bleu_4: 0.129
computing METEOR score...
METEOR: 0.158
computing Rouge score...
ROUGE_L: 0.361
computing CIDEr score...
CIDEr: 0.310
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.209
2021-06-21 15:55:53,892 - INFO: eval_greddy SPIDEr: 0.2086
loading annotations into memory...
0:00:00.004056
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8875, 'reflen': 9671, 'guess': [8875, 7851, 6827, 5803], 'correct': [5186, 1952, 7



loading annotations into memory...
0:00:00.004568
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10180, 'reflen': 10294, 'guess': [10180, 9156, 8132, 7108], 'correct': [5510, 1892, 713, 229]}
ratio: 0.9889255877209064
Bleu_1: 0.535
Bleu_2: 0.331
Bleu_3: 0.212
Bleu_4: 0.132
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.360
computing CIDEr score...
CIDEr: 0.314
computing SPICE score...
SPICE: 0.102
computing SPIDEr score...
SPIDEr: 0.208
2021-06-21 16:12:08,210 - INFO: eval_greddy SPIDEr: 0.2084
loading annotations into memory...
0:00:00.004307
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8826, 'reflen': 9634, 'guess': [8826, 7802, 6778, 5754], 'correct': [5118, 1812, 7



loading annotations into memory...
0:00:00.003776
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10684, 'reflen': 10703, 'guess': [10684, 9660, 8636, 7612], 'correct': [5660, 1989, 729, 210]}
ratio: 0.9982247967858545
Bleu_1: 0.529
Bleu_2: 0.330
Bleu_3: 0.209
Bleu_4: 0.126
computing METEOR score...
METEOR: 0.161
computing Rouge score...
ROUGE_L: 0.359
computing CIDEr score...
CIDEr: 0.308
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.208
2021-06-21 16:27:06,640 - INFO: eval_greddy SPIDEr: 0.2075
loading annotations into memory...
0:00:00.003881
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9104, 'reflen': 9899, 'guess': [9104, 8080, 7056, 6032], 'correct': [5225, 1932, 7



loading annotations into memory...
0:00:00.004016
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10742, 'reflen': 10743, 'guess': [10742, 9718, 8694, 7670], 'correct': [5598, 1972, 755, 226]}
ratio: 0.9999069161313413
Bleu_1: 0.521
Bleu_2: 0.325
Bleu_3: 0.209
Bleu_4: 0.128
computing METEOR score...
METEOR: 0.159
computing Rouge score...
ROUGE_L: 0.356
computing CIDEr score...
CIDEr: 0.320
computing SPICE score...
SPICE: 0.105
computing SPIDEr score...
SPIDEr: 0.213
2021-06-21 16:41:56,282 - INFO: eval_greddy SPIDEr: 0.2129
loading annotations into memory...
0:00:00.004115
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8820, 'reflen': 9663, 'guess': [8820, 7796, 6772, 5748], 'correct': [5035, 1818, 7



loading annotations into memory...
0:00:00.004172
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10233, 'reflen': 10402, 'guess': [10233, 9209, 8185, 7161], 'correct': [5468, 1873, 668, 200]}
ratio: 0.9837531243990594
Bleu_1: 0.526
Bleu_2: 0.324
Bleu_3: 0.204
Bleu_4: 0.123
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.356
computing CIDEr score...
CIDEr: 0.306
computing SPICE score...
SPICE: 0.104
computing SPIDEr score...
SPIDEr: 0.205
2021-06-21 16:56:06,126 - INFO: eval_greddy SPIDEr: 0.2049
loading annotations into memory...
0:00:00.004195
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8720, 'reflen': 9635, 'guess': [8720, 7696, 6672, 5648], 'correct': [5019, 1784, 6



loading annotations into memory...
0:00:00.003886
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10500, 'reflen': 10496, 'guess': [10500, 9476, 8452, 7428], 'correct': [5639, 1969, 740, 238]}
ratio: 1.0003810975608802
Bleu_1: 0.537
Bleu_2: 0.334
Bleu_3: 0.214
Bleu_4: 0.133
computing METEOR score...
METEOR: 0.159
computing Rouge score...
ROUGE_L: 0.360
computing CIDEr score...
CIDEr: 0.325
computing SPICE score...
SPICE: 0.108
computing SPIDEr score...
SPIDEr: 0.217
2021-06-21 17:11:15,365 - INFO: eval_greddy SPIDEr: 0.2166
loading annotations into memory...
0:00:00.003907
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8904, 'reflen': 9696, 'guess': [8904, 7880, 6856, 5832], 'correct': [5145, 1863, 7



loading annotations into memory...
0:00:00.003777
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10387, 'reflen': 10474, 'guess': [10387, 9363, 8339, 7315], 'correct': [5402, 1829, 660, 195]}
ratio: 0.9916937177772587
Bleu_1: 0.516
Bleu_2: 0.316
Bleu_3: 0.199
Bleu_4: 0.120
computing METEOR score...
METEOR: 0.153
computing Rouge score...
ROUGE_L: 0.351
computing CIDEr score...
CIDEr: 0.302
computing SPICE score...
SPICE: 0.102
computing SPIDEr score...
SPIDEr: 0.202
2021-06-21 17:26:14,575 - INFO: eval_greddy SPIDEr: 0.2020
loading annotations into memory...
0:00:00.004034
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8559, 'reflen': 9569, 'guess': [8559, 7535, 6511, 5487], 'correct': [4924, 1782, 6



loading annotations into memory...
0:00:00.003993
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10497, 'reflen': 10516, 'guess': [10497, 9473, 8449, 7425], 'correct': [5585, 1965, 735, 228]}
ratio: 0.9981932293646825
Bleu_1: 0.531
Bleu_2: 0.332
Bleu_3: 0.212
Bleu_4: 0.131
computing METEOR score...
METEOR: 0.161
computing Rouge score...
ROUGE_L: 0.360
computing CIDEr score...
CIDEr: 0.330
computing SPICE score...
SPICE: 0.110
computing SPIDEr score...
SPIDEr: 0.220
2021-06-21 17:40:50,787 - INFO: eval_greddy SPIDEr: 0.2203
loading annotations into memory...
0:00:00.003791
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8931, 'reflen': 9691, 'guess': [8931, 7907, 6883, 5859], 'correct': [5205, 1927, 7



loading annotations into memory...
0:00:00.004407
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10468, 'reflen': 10512, 'guess': [10468, 9444, 8420, 7396], 'correct': [5728, 2044, 774, 245]}
ratio: 0.9958143074580483
Bleu_1: 0.545
Bleu_2: 0.343
Bleu_3: 0.221
Bleu_4: 0.137
computing METEOR score...
METEOR: 0.162
computing Rouge score...
ROUGE_L: 0.364
computing CIDEr score...
CIDEr: 0.339
computing SPICE score...
SPICE: 0.111
computing SPIDEr score...
SPIDEr: 0.225
2021-06-21 17:56:12,230 - INFO: eval_greddy SPIDEr: 0.2252
loading annotations into memory...
0:00:00.004738
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8737, 'reflen': 9580, 'guess': [8737, 7713, 6689, 5665], 'correct': [5163, 1904, 7



loading annotations into memory...
0:00:00.003941
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10584, 'reflen': 10634, 'guess': [10584, 9560, 8536, 7512], 'correct': [5555, 1921, 725, 238]}
ratio: 0.9952981004324811
Bleu_1: 0.522
Bleu_2: 0.323
Bleu_3: 0.207
Bleu_4: 0.129
computing METEOR score...
METEOR: 0.159
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.318
computing SPICE score...
SPICE: 0.109
computing SPIDEr score...
SPIDEr: 0.213
2021-06-21 18:10:43,613 - INFO: eval_greddy SPIDEr: 0.2134
loading annotations into memory...
0:00:00.004497
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8726, 'reflen': 9586, 'guess': [8726, 7702, 6678, 5654], 'correct': [5101, 1844, 7



loading annotations into memory...
0:00:00.003923
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10546, 'reflen': 10576, 'guess': [10546, 9522, 8498, 7474], 'correct': [5525, 1860, 694, 222]}
ratio: 0.9971633888047469
Bleu_1: 0.522
Bleu_2: 0.319
Bleu_3: 0.202
Bleu_4: 0.125
computing METEOR score...
METEOR: 0.157
computing Rouge score...
ROUGE_L: 0.352
computing CIDEr score...
CIDEr: 0.311
computing SPICE score...
SPICE: 0.107
computing SPIDEr score...
SPIDEr: 0.209
2021-06-21 18:25:01,568 - INFO: eval_greddy SPIDEr: 0.2090
loading annotations into memory...
0:00:00.003850
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8750, 'reflen': 9645, 'guess': [8750, 7726, 6702, 5678], 'correct': [5050, 1848, 7



loading annotations into memory...
0:00:00.003793
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10755, 'reflen': 10684, 'guess': [10755, 9731, 8707, 7683], 'correct': [5697, 1949, 718, 215]}
ratio: 1.0066454511418002
Bleu_1: 0.530
Bleu_2: 0.326
Bleu_3: 0.206
Bleu_4: 0.125
computing METEOR score...
METEOR: 0.160
computing Rouge score...
ROUGE_L: 0.357
computing CIDEr score...
CIDEr: 0.315
computing SPICE score...
SPICE: 0.109
computing SPIDEr score...
SPIDEr: 0.212
2021-06-21 18:39:07,450 - INFO: eval_greddy SPIDEr: 0.2119
loading annotations into memory...
0:00:00.003982
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8917, 'reflen': 9728, 'guess': [8917, 7893, 6869, 5845], 'correct': [5171, 1846, 7



loading annotations into memory...
0:00:00.003834
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10712, 'reflen': 10651, 'guess': [10712, 9688, 8664, 7640], 'correct': [5706, 1951, 713, 219]}
ratio: 1.0057271617687535
Bleu_1: 0.533
Bleu_2: 0.328
Bleu_3: 0.207
Bleu_4: 0.126
computing METEOR score...
METEOR: 0.161
computing Rouge score...
ROUGE_L: 0.358
computing CIDEr score...
CIDEr: 0.334
computing SPICE score...
SPICE: 0.112
computing SPIDEr score...
SPIDEr: 0.223
2021-06-21 18:54:18,053 - INFO: eval_greddy SPIDEr: 0.2230
loading annotations into memory...
0:00:00.003996
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9001, 'reflen': 9738, 'guess': [9001, 7977, 6953, 5929], 'correct': [5259, 1878, 7



loading annotations into memory...
0:00:00.004103
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10878, 'reflen': 10775, 'guess': [10878, 9854, 8830, 7806], 'correct': [5772, 2021, 737, 232]}
ratio: 1.0095591647330848
Bleu_1: 0.531
Bleu_2: 0.330
Bleu_3: 0.209
Bleu_4: 0.128
computing METEOR score...
METEOR: 0.162
computing Rouge score...
ROUGE_L: 0.360
computing CIDEr score...
CIDEr: 0.338
computing SPICE score...
SPICE: 0.109
computing SPIDEr score...
SPIDEr: 0.224
2021-06-21 19:08:32,032 - INFO: eval_greddy SPIDEr: 0.2236
loading annotations into memory...
0:00:00.004138
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9242, 'reflen': 9862, 'guess': [9242, 8218, 7194, 6170], 'correct': [5343, 1979, 7



loading annotations into memory...
0:00:00.003880
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 11282, 'reflen': 11027, 'guess': [11282, 10258, 9234, 8210], 'correct': [5873, 2029, 774, 244]}
ratio: 1.0231250566789676
Bleu_1: 0.521
Bleu_2: 0.321
Bleu_3: 0.205
Bleu_4: 0.127
computing METEOR score...
METEOR: 0.164
computing Rouge score...
ROUGE_L: 0.360
computing CIDEr score...
CIDEr: 0.327
computing SPICE score...
SPICE: 0.110
computing SPIDEr score...
SPIDEr: 0.219
2021-06-21 19:23:15,511 - INFO: eval_greddy SPIDEr: 0.2186
loading annotations into memory...
0:00:00.003889
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9356, 'reflen': 9907, 'guess': [9356, 8332, 7308, 6284], 'correct': [5305, 1932, 



loading annotations into memory...
0:00:00.004375
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10823, 'reflen': 10750, 'guess': [10823, 9799, 8775, 7751], 'correct': [5590, 1831, 690, 223]}
ratio: 1.0067906976743248
Bleu_1: 0.516
Bleu_2: 0.311
Bleu_3: 0.197
Bleu_4: 0.122
computing METEOR score...
METEOR: 0.155
computing Rouge score...
ROUGE_L: 0.344
computing CIDEr score...
CIDEr: 0.299
computing SPICE score...
SPICE: 0.103
computing SPIDEr score...
SPIDEr: 0.201
2021-06-21 19:38:28,401 - INFO: eval_greddy SPIDEr: 0.2005
loading annotations into memory...
0:00:00.004096
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9191, 'reflen': 9850, 'guess': [9191, 8167, 7143, 6119], 'correct': [5260, 1908, 7

model score check (eval)

In [36]:
# The score peaks at epoch 23, so that checkpoint is the one evaluated here.
# NOTE: the `hp.mode == 'eval'` guard is commented out, so this cell runs
# unconditionally.
best_ckpt = "./models/0621/23.pt"
model.load_state_dict(torch.load(best_ckpt))

# Evaluation model score: first through eval_all ...
eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)

# ... then through eval_with_beam at beam sizes 2, 3 and 4 (in that order).
# Every call shares the same decoding settings; only beam_size varies.
beam_eval_kwargs = dict(
    max_len=30,
    eos_ind=9,
    word_dict_pickle_path=word_dict_pickle_path,
)
eval_with_beam(evaluation_beam, beam_size=2, **beam_eval_kwargs)
eval_with_beam(evaluation_beam, beam_size=3, **beam_eval_kwargs)
eval_with_beam(evaluation_beam, beam_size=4, **beam_eval_kwargs)

loading annotations into memory...
0:00:00.004555
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 10468, 'reflen': 10512, 'guess': [10468, 9444, 8420, 7396], 'correct': [5728, 2044, 774, 245]}
ratio: 0.9958143074580483
Bleu_1: 0.545
Bleu_2: 0.343
Bleu_3: 0.221
Bleu_4: 0.137
computing METEOR score...
METEOR: 0.162
computing Rouge score...
ROUGE_L: 0.364
computing CIDEr score...
CIDEr: 0.339
computing SPICE score...
SPICE: 0.111
computing SPIDEr score...
SPIDEr: 0.225
2021-06-21 19:43:50,185 - INFO: eval_greddy SPIDEr: 0.2252
loading annotations into memory...
0:00:00.003916
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 8737, 'reflen': 9580, 'guess': [8737, 7713, 6689, 5665], 'correct': [5163, 1904, 7

In [38]:
# Generate captions for the test data (per the original note they end up in
# test_out.csv). The epoch-23 checkpoint is the one behind the final score.
final_ckpt = "./models/0621/23.pt"
model.load_state_dict(torch.load(final_ckpt))
test_with_beam(test_data, beam_size=3)

In [43]:
# Read the generated captions back in to check them.
import pandas as pd

captions_csv = '/home/hj20/test_out.csv'
data = pd.read_csv(captions_csv)

In [44]:
# Bare expression: the notebook renders the loaded captions table as this
# cell's output.
data

Unnamed: 0,file_name,caption_predicted
0,test_0567.wav,a machine is running at a pretty constant rate
1,test_0195.wav,a large truck is driving and accelerating rapidly
2,test_0798.wav,a it traffic is being started and stopped
3,test_0383.wav,a door creaks as it is opened and closed
4,test_0898.wav,birds are chirping and singing in the background
...,...,...
1038,test_0869.wav,a machine is running at a constant rate
1039,test_0225.wav,a lot and birds are chirping in the background
1040,test_0377.wav,a it traffic is being busy in the background
1041,test_0260.wav,a door opens and closes while a large machine ...
