In [1]:
import os
import random
from io import open
import unicodedata
import string
import re

import torch
import torchaudio
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from pathlib import Path
import kaldi_io
import sys
import gc
import json
import time
from data_4 import AudioDataLoader, AudioDataset, pad_list

%matplotlib inline

print_use = False

In [19]:
train_json = "/home/meichaoyang/workspace/Listen-Attend-Spell/egs/aishell/dump/train/deltatrue/data.json"
test_json = "/home/meichaoyang/workspace/Listen-Attend-Spell/egs/aishell/dump/test/deltatrue/data.json"
batch_size = 32
maxlen_in = 100000
maxlen_out = 30
num_workers = 4

## 加载数据

In [3]:
tr_dataset = AudioDataset(train_json, batch_size,
                              maxlen_in, maxlen_out)

tr_loader = AudioDataLoader(tr_dataset, batch_size=1, num_workers=num_workers)


In [20]:
te_dataset = AudioDataset(test_json, batch_size,
                              maxlen_in, maxlen_out)
te_loader = AudioDataLoader(te_dataset, batch_size=1, num_workers=num_workers)

In [4]:
char_list = []
char_list_path = "/home/meichaoyang/workspace/Listen-Attend-Spell/egs/aishell/data/lang_1char/train_chars.txt"
with open(char_list_path, "r") as f:
    for line in f:
        data = line.split()
        char_list.append(data[0])

In [5]:
with open(train_json, 'r') as f:
    json_data = json.load(f)

# 模型搭建

In [6]:
MAX_LENGTH= 200
SOS_token = 0
EOS_token = 1
os.environ["CUDA_VISIBLE_DEVICES"]="2"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Attention

In [7]:
class DotProductAttention(nn.Module):
    r"""Dot product attention.
    Given a set of vector values, and a vector query, attention is a technique
    to compute a weighted sum of the values, dependent on the query.

    NOTE: Here we use the terminology in Stanford cs224n-2018-lecture11.
    """

    def __init__(self):
        super(DotProductAttention, self).__init__()
        # TODO: move this out of this class?
        # self.linear_out = nn.Linear(dim*2, dim)

    def forward(self, queries, values):
        """
        Args:
            queries: N x To x H
            values : N x Ti x H

        Returns:
            output: N x To x H
            attention_distribution: N x To x Ti
        """
        batch_size = queries.size(0)
        hidden_size = queries.size(2)
        input_lengths = values.size(1)
        # (N, To, H) * (N, H, Ti) -> (N, To, Ti)
        attention_scores = torch.bmm(queries, values.transpose(1, 2))
        attention_distribution = F.softmax(
            attention_scores.view(-1, input_lengths), dim=1).view(batch_size, -1, input_lengths)
        # (N, To, Ti) * (N, Ti, H) -> (N, To, H)
        attention_output = torch.bmm(attention_distribution, values)
        # # concat -> (N, To, 2*H)
        # concated = torch.cat((attention_output, queries), dim=2)
        # # TODO: Move this out of this class?
        # # output -> (N, To, H)
        # output = torch.tanh(self.linear_out(
        #     concated.view(-1, 2*hidden_size))).view(batch_size, -1, hidden_size)

        return attention_output, attention_distribution

### 金字塔BLSTM

In [8]:
class pyramidalBLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout=0.0, bidirectional=True):
        super(pyramidalBLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.lstm1 = nn.LSTM(input_size, hidden_size, 
                           batch_first=True,
                           dropout=dropout,
                           bidirectional=bidirectional)
        self.lstm2 = nn.LSTM(hidden_size*4, hidden_size,
                           batch_first=True,
                           dropout=dropout,
                           bidirectional=bidirectional)
        self.lstm3 = nn.LSTM(hidden_size*4, hidden_size,
                           batch_first=True,
                           dropout=dropout,
                           bidirectional=bidirectional)

    def forward(self, padded_input, input_lengths):
        
#         print("pyramidalBLSTM.padded_input.shape:", padded_input.shape)
        total_length = padded_input.size(1)
#         print("total_length:",total_length)
        packed_input = pack_padded_sequence(padded_input, input_lengths,
                                            batch_first=True)


#         print("packed_input.shape",packed_input.data.shape)
        packed_output1, hidden1 = self.lstm1(packed_input)
#         print("packed_output1.shape",packed_output1.data.shape)

        #两步拼一步

        padded_output1, _ = pad_packed_sequence(packed_output1,
                                        batch_first=True,
                                        total_length=total_length)
#         print("padded_output1.shape:",padded_output1.shape)
        padded_input2 = padded_output1.reshape(padded_output1.shape[0],padded_output1.shape[1]//2,padded_output1.shape[2]*2)
        
        packed_input2 = pack_padded_sequence(padded_input2, input_lengths/2,
                                            batch_first=True)
        packed_output2, hidden2 = self.lstm2(packed_input2)
        padded_output2, _ = pad_packed_sequence(packed_output2,
                                        batch_first=True,
                                        total_length=total_length//2)
        
        padded_input3 = padded_output2.reshape(padded_output2.shape[0],padded_output2.shape[1]//2,padded_output2.shape[2]*2)
        packed_input3 = pack_padded_sequence(padded_input3, input_lengths/4,
                                            batch_first=True)
        
        packed_output3, hidden3 = self.lstm2(packed_input3)  
        output, _ = pad_packed_sequence(packed_output3,
                                        batch_first=True,
                                        total_length=total_length//4)

        
        
        return output, (hidden1, hidden2, hidden3)
        

In [9]:
a = [[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]]
a = torch.tensor(a)

In [10]:
a = torch.rand([32, 2236, 512])
a.shape

torch.Size([32, 2236, 512])

In [None]:
# a.reshape(a.shape[0],a.shape[1]//2,a.shape[2]*2)

## 模型构建

In [12]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0.0):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.first = True

        self.pyramidalBLSTM = pyramidalBLSTM(input_size, hidden_size, 1, dropout=dropout)

    def forward(self, input, input_lengths):

        output, hidden = self.pyramidalBLSTM(input, input_lengths)

        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [13]:
class Decoder(nn.Module):
    """
    """

    def __init__(self, vocab_size, embedding_dim, sos_id, eos_id, hidden_size,
                 num_layers, bidirectional_encoder=True):
        super(Decoder, self).__init__()
        # Hyper parameters
        # embedding + output
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.sos_id = sos_id  # Start of Sentence
        self.eos_id = eos_id  # End of Sentence
        # rnn
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional_encoder = bidirectional_encoder  # useless now
        self.encoder_hidden_size = hidden_size  # must be equal now
        # Components
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.rnn = nn.ModuleList()
        self.rnn += [nn.LSTMCell(self.embedding_dim +
                                 self.encoder_hidden_size, self.hidden_size)]
        for l in range(1, self.num_layers):
            self.rnn += [nn.LSTMCell(self.hidden_size, self.hidden_size)]
        self.attention = DotProductAttention()
        self.mlp = nn.Sequential(
            nn.Linear(self.encoder_hidden_size + self.hidden_size,
                      self.hidden_size),
            nn.Tanh(),
            nn.Linear(self.hidden_size, self.vocab_size))

    def zero_state(self, encoder_padded_outputs, H=None):
        N = encoder_padded_outputs.size(0)
        H = self.hidden_size if H == None else H
        return encoder_padded_outputs.new_zeros(N, H)

    def forward(self, padded_input, encoder_padded_outputs):
        """
        Args:
            padded_input: N x To
            # encoder_hidden: (num_layers * num_directions) x N x H
            encoder_padded_outputs: N x Ti x H

        Returns:
        """
        # *********Get Input and Output
        # from espnet/Decoder.forward()
        # TODO: need to make more smart way
        ys = [y[y != IGNORE_ID] for y in padded_input]  # parse padded ys
        # prepare input and output word sequences with sos/eos IDs
        eos = ys[0].new([self.eos_id])
        sos = ys[0].new([self.sos_id])
        ys_in = [torch.cat([sos, y], dim=0) for y in ys]
        ys_out = [torch.cat([y, eos], dim=0) for y in ys]
        # padding for ys with -1
        # pys: utt x olen
        ys_in_pad = pad_list(ys_in, self.eos_id)
        ys_out_pad = pad_list(ys_out, IGNORE_ID)
        # print("ys_in_pad", ys_in_pad.size())
        assert ys_in_pad.size() == ys_out_pad.size()
        batch_size = ys_in_pad.size(0)
        output_length = ys_in_pad.size(1)
        # max_length = ys_in_pad.size(1) - 1  # TODO: should minus 1(sos)?

        # *********Init decoder rnn
        h_list = [self.zero_state(encoder_padded_outputs)]
        c_list = [self.zero_state(encoder_padded_outputs)]
        for l in range(1, self.num_layers):
            h_list.append(self.zero_state(encoder_padded_outputs))
            c_list.append(self.zero_state(encoder_padded_outputs))
        att_c = self.zero_state(encoder_padded_outputs,
                                H=encoder_padded_outputs.size(2))
        y_all = []

        # **********LAS: 1. decoder rnn 2. attention 3. concate and MLP
        embedded = self.embedding(ys_in_pad)
        for t in range(output_length):
            # step 1. decoder RNN: s_i = RNN(s_i−1,y_i−1,c_i−1)
            rnn_input = torch.cat((embedded[:, t, :], att_c), dim=1)
            h_list[0], c_list[0] = self.rnn[0](
                rnn_input, (h_list[0], c_list[0]))
            for l in range(1, self.num_layers):
                h_list[l], c_list[l] = self.rnn[l](
                    h_list[l-1], (h_list[l], c_list[l]))
            rnn_output = h_list[-1]  # below unsqueeze: (N x H) -> (N x 1 x H)
            # step 2. attention: c_i = AttentionContext(s_i,h)
            att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
                                          encoder_padded_outputs)
            att_c = att_c.squeeze(dim=1)
            # step 3. concate s_i and c_i, and input to MLP
            mlp_input = torch.cat((rnn_output, att_c), dim=1)
            predicted_y_t = self.mlp(mlp_input)
            y_all.append(predicted_y_t)

        y_all = torch.stack(y_all, dim=1)  # N x To x C
        # **********Cross Entropy Loss
        # F.cross_entropy = NLL(log_softmax(input), target))
        y_all = y_all.view(batch_size * output_length, self.vocab_size)
        ce_loss = F.cross_entropy(y_all, ys_out_pad.view(-1),
                                  ignore_index=IGNORE_ID,
                                  reduction='mean')

        return ce_loss

       

    def recognize_beam(self, encoder_outputs, char_list, args):
        """Beam search, decode one utterence now.
        Args:
            encoder_outputs: T x H
            char_list: list of character
            args: args.beam

        Returns:
            nbest_hyps:
        """
        # search params
        beam = args.beam_size
        nbest = args.nbest
        if args.decode_max_len == 0:
            maxlen = encoder_outputs.size(0)
        else:
            maxlen = args.decode_max_len

        # *********Init decoder rnn
        h_list = [self.zero_state(encoder_outputs.unsqueeze(0))]
        c_list = [self.zero_state(encoder_outputs.unsqueeze(0))]
        for l in range(1, self.num_layers):
            h_list.append(self.zero_state(encoder_outputs.unsqueeze(0)))
            c_list.append(self.zero_state(encoder_outputs.unsqueeze(0)))
        att_c = self.zero_state(encoder_outputs.unsqueeze(0),
                                H=encoder_outputs.unsqueeze(0).size(2))
        # prepare sos
        y = self.sos_id
        vy = encoder_outputs.new_zeros(1).long()

        hyp = {'score': 0.0, 'yseq': [y], 'c_prev': c_list, 'h_prev': h_list,
               'a_prev': att_c}
        hyps = [hyp]
        ended_hyps = []

        for i in range(maxlen):
            hyps_best_kept = []
            for hyp in hyps:
                # vy.unsqueeze(1)
                vy[0] = hyp['yseq'][i]
                embedded = self.embedding(vy)
                # embedded.unsqueeze(0)
                # step 1. decoder RNN: s_i = RNN(s_i−1,y_i−1,c_i−1)
                rnn_input = torch.cat((embedded, hyp['a_prev']), dim=1)
                h_list[0], c_list[0] = self.rnn[0](
                    rnn_input, (hyp['h_prev'][0], hyp['c_prev'][0]))
                for l in range(1, self.num_layers):
                    h_list[l], c_list[l] = self.rnn[l](
                        h_list[l-1], (hyp['h_prev'][l], hyp['c_prev'][l]))
                rnn_output = h_list[-1]
                # step 2. attention: c_i = AttentionContext(s_i,h)
                # below unsqueeze: (N x H) -> (N x 1 x H)
                att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
                                              encoder_outputs.unsqueeze(0))
                att_c = att_c.squeeze(dim=1)
                # step 3. concate s_i and c_i, and input to MLP
                mlp_input = torch.cat((rnn_output, att_c), dim=1)
                predicted_y_t = self.mlp(mlp_input)
                local_scores = F.log_softmax(predicted_y_t, dim=1)
                # topk scores
                local_best_scores, local_best_ids = torch.topk(
                    local_scores, beam, dim=1)

                for j in range(beam):
                    new_hyp = {}
                    new_hyp['h_prev'] = h_list[:]
                    new_hyp['c_prev'] = c_list[:]
                    new_hyp['a_prev'] = att_c[:]
                    new_hyp['score'] = hyp['score'] + local_best_scores[0, j]
                    new_hyp['yseq'] = [0] * (1 + len(hyp['yseq']))
                    new_hyp['yseq'][:len(hyp['yseq'])] = hyp['yseq']
                    new_hyp['yseq'][len(hyp['yseq'])] = int(
                        local_best_ids[0, j])
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(hyps_best_kept,
                                        key=lambda x: x['score'],
                                        reverse=True)[:beam]
            # end for hyp in hyps
            hyps = hyps_best_kept

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                for hyp in hyps:
                    hyp['yseq'].append(self.eos_id)

            # add ended hypothes to a final list, and removed them from current hypothes
            # (this will be a probmlem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp['yseq'][-1] == self.eos_id:
                    # hyp['score'] += (i + 1) * penalty
                    ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            hyps = remained_hyps
            if len(hyps) > 0:
                print('remeined hypothes: ' + str(len(hyps)))
            else:
                print('no hypothesis. Finish decoding.')
                break

            for hyp in hyps:
                print('hypo: ' + ''.join([char_list[int(x)]
                                          for x in hyp['yseq'][1:]]))
        # end for i in range(maxlen)
        nbest_hyps = sorted(ended_hyps, key=lambda x: x['score'], reverse=True)[
            :min(len(ended_hyps), nbest)]
        return nbest_hyps

In [14]:
class Seq2Seq(nn.Module):
    """Sequence-to-Sequence architecture with configurable encoder and decoder.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, padded_input, input_lengths, padded_target):
        """
        Args:
            padded_input: N x Ti x D
            padded_targets: N x To
        """
        encoder_padded_outputs, _ = self.encoder(padded_input , input_lengths)
        loss = self.decoder(padded_target, encoder_padded_outputs)
        return loss
    
    def recognize(self, input, input_lengths, char_list, args):
        """Sequence-to-Sequence beam search, decode one utterence now.
        Args:
            input: T x D
            char_list: list of characters
            args: args.beam

        Returns:
            nbest_hyps:
        """
        encoder_outputs, _ = self.encoder(input, input_lengths)
#         print("encoder_outputs", encoder_outputs.squeeze(1).shape)
        
        nbest_hyps = self.decoder.recognize_beam(encoder_outputs.squeeze(0), char_list, args)
        return nbest_hyps


## 单步训练

In [15]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

## 训练迭代

In [16]:
# def trainIters(model, optimizier, print_every=5, plot_every=100, learning_rate=0.01):
#     start = time.time()
#     n_iters = len(tr_dataset)
#     plot_losses = []
#     print_loss_total = 0  # Reset every print_every
#     plot_loss_total = 0  # Reset every plot_every

#     encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
#     decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
# #     training_pairs = random.choices(a, k=n_iters)
    
#     criterion = nn.NLLLoss()

# #     for utt in training_pairs:
#     for i, (data) in enumerate(tr_loader):
#         padded_input, input_lengths, padded_target = data
#         padded_input, input_lengths, padded_target = data
#         padded_input = padded_input.cuda()
#         input_lengths = input_lengths.cuda()
#         padded_target = padded_target.cuda()
# #         print("padded_input:",padded_input.shape)
#         loss = model(padded_input, input_lengths, padded_target)
# #         print(loss) #.requires_grad
#         print_loss_total += float(loss)
#         plot_loss_total += float(loss)
        
#         optimizier.zero_grad()
#         loss.backward()
        
#         optimizier.step()

#         if (i+1) % print_every == 0:
#             print_loss_avg = print_loss_total / print_every
#             print_loss_total = 0
#             print('%s (%d %d%%) %.4f' % (timeSince(start, (i+1) / n_iters),
#                                          (i+1), (i+1) / n_iters * 100, print_loss_avg))

#         if i+1 % plot_every == 0:
#             plot_loss_avg = plot_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_total = 0


#     showPlot(plot_losses)

In [17]:
def trainIters(model, epoch, optimizier, print_every=10, plot_every=10, learning_rate=0.01):
    log = open('train.log', 'w')
    start = time.time()
    n_iters = len(tr_dataset)
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
#     training_pairs = random.choices(a, k=n_iters)
    
    criterion = nn.NLLLoss()

    for e in range(epoch):
        for i, (data) in enumerate(tr_loader):
            padded_input, input_lengths, padded_target = data
            padded_input, input_lengths, padded_target = data
            padded_input = padded_input.cuda()
            input_lengths = input_lengths.cuda()
            padded_target = padded_target.cuda()
    #         print("padded_input:",padded_input.shape)
            loss = model(padded_input, input_lengths, padded_target)
    #         print(loss) #.requires_grad
            print_loss_total += float(loss)
            plot_loss_total += float(loss)

            optimizier.zero_grad()
            loss.backward()

            optimizier.step()

            if (i+1) % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                txt = 'Epoch %d | Iter %d | %s (%d %d%%) %.4f' % (e+1, i+1, timeSince(start, (e *n_iters +i+1) / (n_iters*epoch)),
                                             (i+1), (e *n_iters +i+1) / (n_iters*epoch) * 100, print_loss_avg)
                print(txt)
                log.write(txt + "\n")
                log.flush()
            if i+1 % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

    log.close()

In [18]:
input_size = 240

hidden_size = 256
vocab_size = len(char_list)
embedding_dim = 512
sos_id = 0
eos_id = 1
learning_rate = 1e-3
momentum = 0
l2 = 1e-5

IGNORE_ID=-1

encoder = Encoder(input_size, hidden_size, dropout=0.0)
decoder = Decoder(vocab_size, embedding_dim, sos_id, eos_id, hidden_size*2,
                 num_layers=1, bidirectional_encoder=True)

model = Seq2Seq(encoder, decoder)
print(model)
model.cuda()

optimizier = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
#                                      momentum=momentum,
                                     weight_decay=l2)
trainIters(model, 15,optimizier, print_every=20)

Seq2Seq(
  (encoder): Encoder(
    (pyramidalBLSTM): pyramidalBLSTM(
      (lstm1): LSTM(240, 256, batch_first=True, bidirectional=True)
      (lstm2): LSTM(1024, 256, batch_first=True, bidirectional=True)
      (lstm3): LSTM(1024, 256, batch_first=True, bidirectional=True)
    )
  )
  (decoder): Decoder(
    (embedding): Embedding(4233, 512)
    (rnn): ModuleList(
      (0): LSTMCell(1024, 512)
    )
    (attention): DotProductAttention()
    (mlp): Sequential(
      (0): Linear(in_features=1024, out_features=512, bias=True)
      (1): Tanh()
      (2): Linear(in_features=512, out_features=4233, bias=True)
    )
  )
)




Epoch 1 | Iter 20 | 0m 5s (- 262m 4s) (20 0%) 7.2119
Epoch 1 | Iter 40 | 0m 9s (- 228m 31s) (40 0%) 6.8208
Epoch 1 | Iter 60 | 0m 13s (- 214m 57s) (60 0%) 6.7195
Epoch 1 | Iter 80 | 0m 17s (- 204m 48s) (80 0%) 6.6013
Epoch 1 | Iter 100 | 0m 21s (- 198m 40s) (100 0%) 6.4842
Epoch 1 | Iter 120 | 0m 24s (- 194m 37s) (120 0%) 6.4215
Epoch 1 | Iter 140 | 0m 28s (- 190m 29s) (140 0%) 6.1952
Epoch 1 | Iter 160 | 0m 32s (- 187m 47s) (160 0%) 6.1438
Epoch 1 | Iter 180 | 0m 35s (- 185m 42s) (180 0%) 6.0543
Epoch 1 | Iter 200 | 0m 39s (- 183m 45s) (200 0%) 5.9537
Epoch 1 | Iter 220 | 0m 43s (- 182m 52s) (220 0%) 5.7614
Epoch 1 | Iter 240 | 0m 46s (- 181m 12s) (240 0%) 5.7289
Epoch 1 | Iter 260 | 0m 50s (- 179m 54s) (260 0%) 5.6083
Epoch 1 | Iter 280 | 0m 53s (- 178m 23s) (280 0%) 5.5556
Epoch 1 | Iter 300 | 0m 56s (- 176m 57s) (300 0%) 5.4295
Epoch 1 | Iter 320 | 1m 0s (- 175m 59s) (320 0%) 5.4165
Epoch 1 | Iter 340 | 1m 3s (- 174m 51s) (340 0%) 5.2709
Epoch 1 | Iter 360 | 1m 7s (- 173m 35s) (360

Epoch 1 | Iter 2860 | 6m 33s (- 122m 31s) (2860 5%) 1.3843
Epoch 1 | Iter 2880 | 6m 35s (- 122m 15s) (2880 5%) 1.4017
Epoch 1 | Iter 2900 | 6m 37s (- 121m 59s) (2900 5%) 1.3817
Epoch 1 | Iter 2920 | 6m 39s (- 121m 43s) (2920 5%) 1.4224
Epoch 1 | Iter 2940 | 6m 41s (- 121m 28s) (2940 5%) 1.3607
Epoch 1 | Iter 2960 | 6m 43s (- 121m 14s) (2960 5%) 1.3777
Epoch 1 | Iter 2980 | 6m 45s (- 120m 59s) (2980 5%) 1.4754
Epoch 1 | Iter 3000 | 6m 47s (- 120m 44s) (3000 5%) 1.4172
Epoch 1 | Iter 3020 | 6m 49s (- 120m 29s) (3020 5%) 1.3696
Epoch 1 | Iter 3040 | 6m 51s (- 120m 14s) (3040 5%) 1.3388
Epoch 1 | Iter 3060 | 6m 53s (- 119m 58s) (3060 5%) 1.2472
Epoch 1 | Iter 3080 | 6m 55s (- 119m 44s) (3080 5%) 1.3166
Epoch 1 | Iter 3100 | 6m 57s (- 119m 29s) (3100 5%) 1.3112
Epoch 1 | Iter 3120 | 6m 59s (- 119m 14s) (3120 5%) 1.3295
Epoch 1 | Iter 3140 | 7m 1s (- 118m 58s) (3140 5%) 1.2698
Epoch 1 | Iter 3160 | 7m 3s (- 118m 44s) (3160 5%) 1.2969
Epoch 1 | Iter 3180 | 7m 5s (- 118m 28s) (3180 5%) 1.2521


Epoch 2 | Iter 1940 | 12m 46s (- 113m 36s) (1940 10%) 0.7544
Epoch 2 | Iter 1960 | 12m 49s (- 113m 30s) (1960 10%) 0.7731
Epoch 2 | Iter 1980 | 12m 51s (- 113m 25s) (1980 10%) 0.7389
Epoch 2 | Iter 2000 | 12m 54s (- 113m 20s) (2000 10%) 0.7501
Epoch 2 | Iter 2020 | 12m 56s (- 113m 15s) (2020 10%) 0.7303
Epoch 2 | Iter 2040 | 12m 58s (- 113m 9s) (2040 10%) 0.7396
Epoch 2 | Iter 2060 | 13m 1s (- 113m 3s) (2060 10%) 0.7736
Epoch 2 | Iter 2080 | 13m 3s (- 112m 58s) (2080 10%) 0.7774
Epoch 2 | Iter 2100 | 13m 5s (- 112m 52s) (2100 10%) 0.7484
Epoch 2 | Iter 2120 | 13m 8s (- 112m 47s) (2120 10%) 0.7566
Epoch 2 | Iter 2140 | 13m 10s (- 112m 42s) (2140 10%) 0.7080
Epoch 2 | Iter 2160 | 13m 12s (- 112m 36s) (2160 10%) 0.7675
Epoch 2 | Iter 2180 | 13m 15s (- 112m 30s) (2180 10%) 0.7186
Epoch 2 | Iter 2200 | 13m 17s (- 112m 24s) (2200 10%) 0.7796
Epoch 2 | Iter 2220 | 13m 19s (- 112m 20s) (2220 10%) 0.7395
Epoch 2 | Iter 2240 | 13m 22s (- 112m 14s) (2240 10%) 0.7204
Epoch 2 | Iter 2260 | 13m 24s 

Epoch 3 | Iter 940 | 18m 30s (- 104m 50s) (940 15%) 0.5582
Epoch 3 | Iter 960 | 18m 33s (- 104m 49s) (960 15%) 0.5292
Epoch 3 | Iter 980 | 18m 36s (- 104m 49s) (980 15%) 0.5681
Epoch 3 | Iter 1000 | 18m 39s (- 104m 48s) (1000 15%) 0.5276
Epoch 3 | Iter 1020 | 18m 42s (- 104m 47s) (1020 15%) 0.5368
Epoch 3 | Iter 1040 | 18m 45s (- 104m 46s) (1040 15%) 0.5014
Epoch 3 | Iter 1060 | 18m 48s (- 104m 45s) (1060 15%) 0.5135
Epoch 3 | Iter 1080 | 18m 51s (- 104m 44s) (1080 15%) 0.5271
Epoch 3 | Iter 1100 | 18m 53s (- 104m 43s) (1100 15%) 0.5348
Epoch 3 | Iter 1120 | 18m 56s (- 104m 41s) (1120 15%) 0.5188
Epoch 3 | Iter 1140 | 18m 59s (- 104m 39s) (1140 15%) 0.5246
Epoch 3 | Iter 1160 | 19m 2s (- 104m 38s) (1160 15%) 0.4865
Epoch 3 | Iter 1180 | 19m 4s (- 104m 36s) (1180 15%) 0.5094
Epoch 3 | Iter 1200 | 19m 7s (- 104m 33s) (1200 15%) 0.5043
Epoch 3 | Iter 1220 | 19m 10s (- 104m 31s) (1220 15%) 0.4909
Epoch 3 | Iter 1240 | 19m 13s (- 104m 29s) (1240 15%) 0.4977
Epoch 3 | Iter 1260 | 19m 15s (- 

Epoch 3 | Iter 3660 | 23m 31s (- 95m 7s) (3660 19%) 0.5005
Epoch 3 | Iter 3680 | 23m 33s (- 94m 59s) (3680 19%) 0.4901
Epoch 3 | Iter 3700 | 23m 34s (- 94m 52s) (3700 19%) 0.4742
Epoch 3 | Iter 3720 | 23m 36s (- 94m 45s) (3720 19%) 0.4780
Epoch 3 | Iter 3740 | 23m 37s (- 94m 39s) (3740 19%) 0.5155
Epoch 4 | Iter 20 | 23m 44s (- 94m 44s) (20 20%) 1.6225
Epoch 4 | Iter 40 | 23m 48s (- 94m 48s) (40 20%) 0.9001
Epoch 4 | Iter 60 | 23m 52s (- 94m 52s) (60 20%) 0.7271
Epoch 4 | Iter 80 | 23m 56s (- 94m 55s) (80 20%) 0.6711
Epoch 4 | Iter 100 | 24m 0s (- 94m 58s) (100 20%) 0.6541
Epoch 4 | Iter 120 | 24m 4s (- 95m 0s) (120 20%) 0.5933
Epoch 4 | Iter 140 | 24m 7s (- 95m 2s) (140 20%) 0.5575
Epoch 4 | Iter 160 | 24m 11s (- 95m 3s) (160 20%) 0.5495
Epoch 4 | Iter 180 | 24m 14s (- 95m 5s) (180 20%) 0.5757
Epoch 4 | Iter 200 | 24m 18s (- 95m 6s) (200 20%) 0.5376
Epoch 4 | Iter 220 | 24m 21s (- 95m 7s) (220 20%) 0.5036
Epoch 4 | Iter 240 | 24m 25s (- 95m 8s) (240 20%) 0.5204
Epoch 4 | Iter 260 | 24

Epoch 4 | Iter 2720 | 29m 46s (- 90m 7s) (2720 24%) 0.3419
Epoch 4 | Iter 2740 | 29m 48s (- 90m 3s) (2740 24%) 0.3301
Epoch 4 | Iter 2760 | 29m 50s (- 89m 58s) (2760 24%) 0.3458
Epoch 4 | Iter 2780 | 29m 51s (- 89m 53s) (2780 24%) 0.3274
Epoch 4 | Iter 2800 | 29m 53s (- 89m 49s) (2800 24%) 0.3484
Epoch 4 | Iter 2820 | 29m 55s (- 89m 45s) (2820 25%) 0.3408
Epoch 4 | Iter 2840 | 29m 57s (- 89m 41s) (2840 25%) 0.3742
Epoch 4 | Iter 2860 | 29m 59s (- 89m 37s) (2860 25%) 0.3405
Epoch 4 | Iter 2880 | 30m 1s (- 89m 32s) (2880 25%) 0.3398
Epoch 4 | Iter 2900 | 30m 3s (- 89m 28s) (2900 25%) 0.3373
Epoch 4 | Iter 2920 | 30m 5s (- 89m 24s) (2920 25%) 0.3596
Epoch 4 | Iter 2940 | 30m 7s (- 89m 20s) (2940 25%) 0.3605
Epoch 4 | Iter 2960 | 30m 9s (- 89m 16s) (2960 25%) 0.3546
Epoch 4 | Iter 2980 | 30m 11s (- 89m 11s) (2980 25%) 0.3691
Epoch 4 | Iter 3000 | 30m 13s (- 89m 7s) (3000 25%) 0.3388
Epoch 4 | Iter 3020 | 30m 15s (- 89m 2s) (3020 25%) 0.3342
Epoch 4 | Iter 3040 | 30m 17s (- 88m 58s) (3040 2

Epoch 5 | Iter 1780 | 35m 33s (- 83m 38s) (1780 29%) 0.2911
Epoch 5 | Iter 1800 | 35m 35s (- 83m 35s) (1800 29%) 0.3024
Epoch 5 | Iter 1820 | 35m 37s (- 83m 32s) (1820 29%) 0.3118
Epoch 5 | Iter 1840 | 35m 40s (- 83m 29s) (1840 29%) 0.2754
Epoch 5 | Iter 1860 | 35m 42s (- 83m 26s) (1860 29%) 0.2795
Epoch 5 | Iter 1880 | 35m 44s (- 83m 23s) (1880 30%) 0.2790
Epoch 5 | Iter 1900 | 35m 47s (- 83m 19s) (1900 30%) 0.2693
Epoch 5 | Iter 1920 | 35m 49s (- 83m 16s) (1920 30%) 0.2679
Epoch 5 | Iter 1940 | 35m 51s (- 83m 13s) (1940 30%) 0.2652
Epoch 5 | Iter 1960 | 35m 53s (- 83m 10s) (1960 30%) 0.2903
Epoch 5 | Iter 1980 | 35m 55s (- 83m 6s) (1980 30%) 0.2798
Epoch 5 | Iter 2000 | 35m 58s (- 83m 3s) (2000 30%) 0.2826
Epoch 5 | Iter 2020 | 36m 0s (- 83m 0s) (2020 30%) 0.2879
Epoch 5 | Iter 2040 | 36m 2s (- 82m 57s) (2040 30%) 0.3286
Epoch 5 | Iter 2060 | 36m 4s (- 82m 54s) (2060 30%) 0.3257
Epoch 5 | Iter 2080 | 36m 6s (- 82m 50s) (2080 30%) 0.3349
Epoch 5 | Iter 2100 | 36m 9s (- 82m 46s) (2100 

Epoch 6 | Iter 820 | 41m 4s (- 76m 59s) (820 34%) 0.2931
Epoch 6 | Iter 840 | 41m 7s (- 76m 57s) (840 34%) 0.2791
Epoch 6 | Iter 860 | 41m 10s (- 76m 55s) (860 34%) 0.2816
Epoch 6 | Iter 880 | 41m 13s (- 76m 54s) (880 34%) 0.2767
Epoch 6 | Iter 900 | 41m 16s (- 76m 52s) (900 34%) 0.2891
Epoch 6 | Iter 920 | 41m 18s (- 76m 50s) (920 34%) 0.2875
Epoch 6 | Iter 940 | 41m 21s (- 76m 48s) (940 35%) 0.2961
Epoch 6 | Iter 960 | 41m 24s (- 76m 46s) (960 35%) 0.2751
Epoch 6 | Iter 980 | 41m 27s (- 76m 44s) (980 35%) 0.3014
Epoch 6 | Iter 1000 | 41m 30s (- 76m 43s) (1000 35%) 0.2759
Epoch 6 | Iter 1020 | 41m 33s (- 76m 41s) (1020 35%) 0.2850
Epoch 6 | Iter 1040 | 41m 36s (- 76m 39s) (1040 35%) 0.2487
Epoch 6 | Iter 1060 | 41m 39s (- 76m 37s) (1060 35%) 0.2571
Epoch 6 | Iter 1080 | 41m 41s (- 76m 35s) (1080 35%) 0.2606
Epoch 6 | Iter 1100 | 41m 44s (- 76m 32s) (1100 35%) 0.2782
Epoch 6 | Iter 1120 | 41m 47s (- 76m 30s) (1120 35%) 0.2752
Epoch 6 | Iter 1140 | 41m 49s (- 76m 28s) (1140 35%) 0.2595


Epoch 6 | Iter 3580 | 46m 23s (- 70m 29s) (3580 39%) 0.2432
Epoch 6 | Iter 3600 | 46m 25s (- 70m 26s) (3600 39%) 0.2669
Epoch 6 | Iter 3620 | 46m 27s (- 70m 22s) (3620 39%) 0.2598
Epoch 6 | Iter 3640 | 46m 29s (- 70m 19s) (3640 39%) 0.2651
Epoch 6 | Iter 3660 | 46m 30s (- 70m 15s) (3660 39%) 0.2831
Epoch 6 | Iter 3680 | 46m 32s (- 70m 11s) (3680 39%) 0.2849
Epoch 6 | Iter 3700 | 46m 34s (- 70m 7s) (3700 39%) 0.2785
Epoch 6 | Iter 3720 | 46m 35s (- 70m 4s) (3720 39%) 0.2544
Epoch 6 | Iter 3740 | 46m 37s (- 70m 0s) (3740 39%) 0.2945
Epoch 7 | Iter 20 | 46m 43s (- 69m 59s) (20 40%) 0.8932
Epoch 7 | Iter 40 | 46m 48s (- 69m 59s) (40 40%) 0.5986
Epoch 7 | Iter 60 | 46m 52s (- 69m 59s) (60 40%) 0.5101
Epoch 7 | Iter 80 | 46m 55s (- 69m 58s) (80 40%) 0.4688
Epoch 7 | Iter 100 | 46m 59s (- 69m 58s) (100 40%) 0.4470
Epoch 7 | Iter 120 | 47m 3s (- 69m 57s) (120 40%) 0.4368
Epoch 7 | Iter 140 | 47m 7s (- 69m 56s) (140 40%) 0.4102
Epoch 7 | Iter 160 | 47m 10s (- 69m 56s) (160 40%) 0.3814
Epoch 7 |

Epoch 7 | Iter 2620 | 52m 43s (- 65m 21s) (2620 44%) 0.2081
Epoch 7 | Iter 2640 | 52m 46s (- 65m 18s) (2640 44%) 0.2172
Epoch 7 | Iter 2660 | 52m 48s (- 65m 15s) (2660 44%) 0.2005
Epoch 7 | Iter 2680 | 52m 50s (- 65m 12s) (2680 44%) 0.1916
Epoch 7 | Iter 2700 | 52m 52s (- 65m 9s) (2700 44%) 0.1954
Epoch 7 | Iter 2720 | 52m 54s (- 65m 6s) (2720 44%) 0.2089
Epoch 7 | Iter 2740 | 52m 56s (- 65m 3s) (2740 44%) 0.1940
Epoch 7 | Iter 2760 | 52m 58s (- 65m 0s) (2760 44%) 0.2072
Epoch 7 | Iter 2780 | 53m 0s (- 64m 57s) (2780 44%) 0.1961
Epoch 7 | Iter 2800 | 53m 2s (- 64m 54s) (2800 44%) 0.2210
Epoch 7 | Iter 2820 | 53m 4s (- 64m 51s) (2820 45%) 0.2102
Epoch 7 | Iter 2840 | 53m 6s (- 64m 48s) (2840 45%) 0.2397
Epoch 7 | Iter 2860 | 53m 8s (- 64m 44s) (2860 45%) 0.2261
Epoch 7 | Iter 2880 | 53m 10s (- 64m 41s) (2880 45%) 0.2231
Epoch 7 | Iter 2900 | 53m 12s (- 64m 38s) (2900 45%) 0.1977
Epoch 7 | Iter 2920 | 53m 14s (- 64m 35s) (2920 45%) 0.2183
Epoch 7 | Iter 2940 | 53m 16s (- 64m 32s) (2940 4

Epoch 8 | Iter 1680 | 58m 48s (- 59m 38s) (1680 49%) 0.1975
Epoch 8 | Iter 1700 | 58m 50s (- 59m 35s) (1700 49%) 0.2124
Epoch 8 | Iter 1720 | 58m 53s (- 59m 33s) (1720 49%) 0.2296
Epoch 8 | Iter 1740 | 58m 55s (- 59m 30s) (1740 49%) 0.2115
Epoch 8 | Iter 1760 | 58m 58s (- 59m 28s) (1760 49%) 0.2143
Epoch 8 | Iter 1780 | 59m 1s (- 59m 25s) (1780 49%) 0.2020
Epoch 8 | Iter 1800 | 59m 3s (- 59m 22s) (1800 49%) 0.1856
Epoch 8 | Iter 1820 | 59m 5s (- 59m 20s) (1820 49%) 0.2086
Epoch 8 | Iter 1840 | 59m 8s (- 59m 17s) (1840 49%) 0.1822
Epoch 8 | Iter 1860 | 59m 10s (- 59m 15s) (1860 49%) 0.1870
Epoch 8 | Iter 1880 | 59m 13s (- 59m 12s) (1880 50%) 0.1710
Epoch 8 | Iter 1900 | 59m 15s (- 59m 10s) (1900 50%) 0.1758
Epoch 8 | Iter 1920 | 59m 18s (- 59m 7s) (1920 50%) 0.1896
Epoch 8 | Iter 1940 | 59m 20s (- 59m 4s) (1940 50%) 0.1813
Epoch 8 | Iter 1960 | 59m 22s (- 59m 1s) (1960 50%) 0.1916
Epoch 8 | Iter 1980 | 59m 25s (- 58m 59s) (1980 50%) 0.1764
Epoch 8 | Iter 2000 | 59m 27s (- 58m 56s) (2000

Epoch 9 | Iter 720 | 64m 26s (- 53m 33s) (720 54%) 0.2764
Epoch 9 | Iter 740 | 64m 29s (- 53m 31s) (740 54%) 0.2893
Epoch 9 | Iter 760 | 64m 32s (- 53m 29s) (760 54%) 0.2983
Epoch 9 | Iter 780 | 64m 36s (- 53m 27s) (780 54%) 0.2772
Epoch 9 | Iter 800 | 64m 38s (- 53m 25s) (800 54%) 0.2872
Epoch 9 | Iter 820 | 64m 41s (- 53m 23s) (820 54%) 0.2577
Epoch 9 | Iter 840 | 64m 44s (- 53m 21s) (840 54%) 0.2518
Epoch 9 | Iter 860 | 64m 47s (- 53m 18s) (860 54%) 0.2570
Epoch 9 | Iter 880 | 64m 50s (- 53m 16s) (880 54%) 0.2446
Epoch 9 | Iter 900 | 64m 53s (- 53m 14s) (900 54%) 0.2423
Epoch 9 | Iter 920 | 64m 56s (- 53m 12s) (920 54%) 0.2289
Epoch 9 | Iter 940 | 64m 59s (- 53m 10s) (940 55%) 0.2371
Epoch 9 | Iter 960 | 65m 2s (- 53m 7s) (960 55%) 0.2374
Epoch 9 | Iter 980 | 65m 5s (- 53m 5s) (980 55%) 0.2633
Epoch 9 | Iter 1000 | 65m 8s (- 53m 3s) (1000 55%) 0.2590
Epoch 9 | Iter 1020 | 65m 11s (- 53m 1s) (1020 55%) 0.2457
Epoch 9 | Iter 1040 | 65m 13s (- 52m 58s) (1040 55%) 0.2173
Epoch 9 | Iter 

Epoch 9 | Iter 3480 | 69m 50s (- 47m 30s) (3480 59%) 0.1964
Epoch 9 | Iter 3500 | 69m 52s (- 47m 27s) (3500 59%) 0.2125
Epoch 9 | Iter 3520 | 69m 54s (- 47m 24s) (3520 59%) 0.2070
Epoch 9 | Iter 3540 | 69m 56s (- 47m 21s) (3540 59%) 0.1969
Epoch 9 | Iter 3560 | 69m 57s (- 47m 18s) (3560 59%) 0.1982
Epoch 9 | Iter 3580 | 69m 59s (- 47m 15s) (3580 59%) 0.1958
Epoch 9 | Iter 3600 | 70m 1s (- 47m 12s) (3600 59%) 0.1975
Epoch 9 | Iter 3620 | 70m 2s (- 47m 9s) (3620 59%) 0.1959
Epoch 9 | Iter 3640 | 70m 4s (- 47m 6s) (3640 59%) 0.2072
Epoch 9 | Iter 3660 | 70m 6s (- 47m 3s) (3660 59%) 0.2213
Epoch 9 | Iter 3680 | 70m 7s (- 47m 0s) (3680 59%) 0.2204
Epoch 9 | Iter 3700 | 70m 9s (- 46m 57s) (3700 59%) 0.2121
Epoch 9 | Iter 3720 | 70m 11s (- 46m 54s) (3720 59%) 0.1900
Epoch 9 | Iter 3740 | 70m 12s (- 46m 51s) (3740 59%) 0.2340
Epoch 10 | Iter 20 | 70m 19s (- 46m 48s) (20 60%) 0.8103
Epoch 10 | Iter 40 | 70m 23s (- 46m 47s) (40 60%) 0.5938
Epoch 10 | Iter 60 | 70m 27s (- 46m 45s) (60 60%) 0.4827

Epoch 10 | Iter 2500 | 76m 3s (- 41m 58s) (2500 64%) 0.1578
Epoch 10 | Iter 2520 | 76m 6s (- 41m 55s) (2520 64%) 0.1536
Epoch 10 | Iter 2540 | 76m 8s (- 41m 53s) (2540 64%) 0.1738
Epoch 10 | Iter 2560 | 76m 10s (- 41m 50s) (2560 64%) 0.1679
Epoch 10 | Iter 2580 | 76m 12s (- 41m 47s) (2580 64%) 0.1619
Epoch 10 | Iter 2600 | 76m 14s (- 41m 45s) (2600 64%) 0.1604
Epoch 10 | Iter 2620 | 76m 17s (- 41m 42s) (2620 64%) 0.1639
Epoch 10 | Iter 2640 | 76m 19s (- 41m 39s) (2640 64%) 0.1861
Epoch 10 | Iter 2660 | 76m 21s (- 41m 36s) (2660 64%) 0.1679
Epoch 10 | Iter 2680 | 76m 23s (- 41m 34s) (2680 64%) 0.1714
Epoch 10 | Iter 2700 | 76m 25s (- 41m 31s) (2700 64%) 0.1727
Epoch 10 | Iter 2720 | 76m 27s (- 41m 28s) (2720 64%) 0.1814
Epoch 10 | Iter 2740 | 76m 29s (- 41m 26s) (2740 64%) 0.1764
Epoch 10 | Iter 2760 | 76m 31s (- 41m 23s) (2760 64%) 0.1830
Epoch 10 | Iter 2780 | 76m 33s (- 41m 20s) (2780 64%) 0.1682
Epoch 10 | Iter 2800 | 76m 35s (- 41m 17s) (2800 64%) 0.1821
Epoch 10 | Iter 2820 | 76m 

Epoch 11 | Iter 1500 | 81m 55s (- 36m 14s) (1500 69%) 0.1999
Epoch 11 | Iter 1520 | 81m 57s (- 36m 11s) (1520 69%) 0.1871
Epoch 11 | Iter 1540 | 82m 0s (- 36m 9s) (1540 69%) 0.1919
Epoch 11 | Iter 1560 | 82m 2s (- 36m 6s) (1560 69%) 0.1887
Epoch 11 | Iter 1580 | 82m 5s (- 36m 4s) (1580 69%) 0.1757
Epoch 11 | Iter 1600 | 82m 7s (- 36m 1s) (1600 69%) 0.1955
Epoch 11 | Iter 1620 | 82m 10s (- 35m 59s) (1620 69%) 0.1642
Epoch 11 | Iter 1640 | 82m 12s (- 35m 56s) (1640 69%) 0.1606
Epoch 11 | Iter 1660 | 82m 15s (- 35m 54s) (1660 69%) 0.1699
Epoch 11 | Iter 1680 | 82m 17s (- 35m 51s) (1680 69%) 0.1744
Epoch 11 | Iter 1700 | 82m 20s (- 35m 49s) (1700 69%) 0.1910
Epoch 11 | Iter 1720 | 82m 22s (- 35m 46s) (1720 69%) 0.2075
Epoch 11 | Iter 1740 | 82m 24s (- 35m 43s) (1740 69%) 0.1911
Epoch 11 | Iter 1760 | 82m 27s (- 35m 41s) (1760 69%) 0.1918
Epoch 11 | Iter 1780 | 82m 29s (- 35m 38s) (1780 69%) 0.1746
Epoch 11 | Iter 1800 | 82m 32s (- 35m 36s) (1800 69%) 0.1689
Epoch 11 | Iter 1820 | 82m 34s (

Epoch 12 | Iter 480 | 87m 19s (- 30m 23s) (480 74%) 0.2289
Epoch 12 | Iter 500 | 87m 22s (- 30m 20s) (500 74%) 0.2990
Epoch 12 | Iter 520 | 87m 26s (- 30m 18s) (520 74%) 0.3508
Epoch 12 | Iter 540 | 87m 29s (- 30m 16s) (540 74%) 0.2994
Epoch 12 | Iter 560 | 87m 32s (- 30m 14s) (560 74%) 0.2861
Epoch 12 | Iter 580 | 87m 35s (- 30m 11s) (580 74%) 0.2501
Epoch 12 | Iter 600 | 87m 38s (- 30m 9s) (600 74%) 0.2503
Epoch 12 | Iter 620 | 87m 41s (- 30m 7s) (620 74%) 0.2361
Epoch 12 | Iter 640 | 87m 44s (- 30m 4s) (640 74%) 0.2157
Epoch 12 | Iter 660 | 87m 47s (- 30m 2s) (660 74%) 0.2254
Epoch 12 | Iter 680 | 87m 50s (- 30m 0s) (680 74%) 0.2185
Epoch 12 | Iter 700 | 87m 53s (- 29m 57s) (700 74%) 0.2257
Epoch 12 | Iter 720 | 87m 56s (- 29m 55s) (720 74%) 0.2231
Epoch 12 | Iter 740 | 87m 59s (- 29m 53s) (740 74%) 0.2318
Epoch 12 | Iter 760 | 88m 2s (- 29m 50s) (760 74%) 0.2131
Epoch 12 | Iter 780 | 88m 5s (- 29m 48s) (780 74%) 0.2303
Epoch 12 | Iter 800 | 88m 8s (- 29m 46s) (800 74%) 0.2203
Epoch

Epoch 12 | Iter 3200 | 92m 56s (- 24m 40s) (3200 79%) 0.1752
Epoch 12 | Iter 3220 | 92m 58s (- 24m 38s) (3220 79%) 0.1613
Epoch 12 | Iter 3240 | 92m 59s (- 24m 35s) (3240 79%) 0.1616
Epoch 12 | Iter 3260 | 93m 1s (- 24m 32s) (3260 79%) 0.1651
Epoch 12 | Iter 3280 | 93m 3s (- 24m 30s) (3280 79%) 0.1495
Epoch 12 | Iter 3300 | 93m 5s (- 24m 27s) (3300 79%) 0.1636
Epoch 12 | Iter 3320 | 93m 7s (- 24m 24s) (3320 79%) 0.1550
Epoch 12 | Iter 3340 | 93m 9s (- 24m 22s) (3340 79%) 0.1578
Epoch 12 | Iter 3360 | 93m 11s (- 24m 19s) (3360 79%) 0.1540
Epoch 12 | Iter 3380 | 93m 13s (- 24m 16s) (3380 79%) 0.1544
Epoch 12 | Iter 3400 | 93m 14s (- 24m 14s) (3400 79%) 0.1618
Epoch 12 | Iter 3420 | 93m 16s (- 24m 11s) (3420 79%) 0.1580
Epoch 12 | Iter 3440 | 93m 18s (- 24m 8s) (3440 79%) 0.1440
Epoch 12 | Iter 3460 | 93m 20s (- 24m 6s) (3460 79%) 0.1623
Epoch 12 | Iter 3480 | 93m 22s (- 24m 3s) (3480 79%) 0.1567
Epoch 12 | Iter 3500 | 93m 24s (- 24m 0s) (3500 79%) 0.1743
Epoch 12 | Iter 3520 | 93m 25s (-

Epoch 13 | Iter 2200 | 99m 2s (- 18m 59s) (2200 83%) 0.2192
Epoch 13 | Iter 2220 | 99m 4s (- 18m 57s) (2220 83%) 0.1938
Epoch 13 | Iter 2240 | 99m 7s (- 18m 54s) (2240 83%) 0.1716
Epoch 13 | Iter 2260 | 99m 9s (- 18m 52s) (2260 84%) 0.1849
Epoch 13 | Iter 2280 | 99m 11s (- 18m 49s) (2280 84%) 0.1754
Epoch 13 | Iter 2300 | 99m 14s (- 18m 46s) (2300 84%) 0.1777
Epoch 13 | Iter 2320 | 99m 16s (- 18m 44s) (2320 84%) 0.1729
Epoch 13 | Iter 2340 | 99m 18s (- 18m 41s) (2340 84%) 0.1745
Epoch 13 | Iter 2360 | 99m 20s (- 18m 39s) (2360 84%) 0.1531
Epoch 13 | Iter 2380 | 99m 23s (- 18m 36s) (2380 84%) 0.1531
Epoch 13 | Iter 2400 | 99m 25s (- 18m 34s) (2400 84%) 0.1698
Epoch 13 | Iter 2420 | 99m 28s (- 18m 31s) (2420 84%) 0.1542
Epoch 13 | Iter 2440 | 99m 30s (- 18m 29s) (2440 84%) 0.1595
Epoch 13 | Iter 2460 | 99m 32s (- 18m 26s) (2460 84%) 0.1596
Epoch 13 | Iter 2480 | 99m 34s (- 18m 24s) (2480 84%) 0.1552
Epoch 13 | Iter 2500 | 99m 37s (- 18m 21s) (2500 84%) 0.1570
Epoch 13 | Iter 2520 | 99m 3

Epoch 14 | Iter 1160 | 104m 46s (- 13m 18s) (1160 88%) 0.1497
Epoch 14 | Iter 1180 | 104m 48s (- 13m 16s) (1180 88%) 0.1587
Epoch 14 | Iter 1200 | 104m 51s (- 13m 13s) (1200 88%) 0.1466
Epoch 14 | Iter 1220 | 104m 54s (- 13m 11s) (1220 88%) 0.1504
Epoch 14 | Iter 1240 | 104m 57s (- 13m 8s) (1240 88%) 0.1515
Epoch 14 | Iter 1260 | 104m 59s (- 13m 6s) (1260 88%) 0.1602
Epoch 14 | Iter 1280 | 105m 2s (- 13m 3s) (1280 88%) 0.1487
Epoch 14 | Iter 1300 | 105m 5s (- 13m 1s) (1300 88%) 0.1485
Epoch 14 | Iter 1320 | 105m 7s (- 12m 58s) (1320 89%) 0.1535
Epoch 14 | Iter 1340 | 105m 10s (- 12m 56s) (1340 89%) 0.1428
Epoch 14 | Iter 1360 | 105m 13s (- 12m 53s) (1360 89%) 0.1466
Epoch 14 | Iter 1380 | 105m 16s (- 12m 51s) (1380 89%) 0.1525
Epoch 14 | Iter 1400 | 105m 18s (- 12m 48s) (1400 89%) 0.1406
Epoch 14 | Iter 1420 | 105m 21s (- 12m 46s) (1420 89%) 0.1518
Epoch 14 | Iter 1440 | 105m 24s (- 12m 43s) (1440 89%) 0.1593
Epoch 14 | Iter 1460 | 105m 27s (- 12m 41s) (1460 89%) 0.1547
Epoch 14 | Iter

Epoch 15 | Iter 120 | 109m 58s (- 7m 35s) (120 93%) 0.3237
Epoch 15 | Iter 140 | 110m 2s (- 7m 32s) (140 93%) 0.2981
Epoch 15 | Iter 160 | 110m 6s (- 7m 30s) (160 93%) 0.2892
Epoch 15 | Iter 180 | 110m 9s (- 7m 27s) (180 93%) 0.2856
Epoch 15 | Iter 200 | 110m 13s (- 7m 25s) (200 93%) 0.2711
Epoch 15 | Iter 220 | 110m 16s (- 7m 23s) (220 93%) 0.2490
Epoch 15 | Iter 240 | 110m 20s (- 7m 20s) (240 93%) 0.2579
Epoch 15 | Iter 260 | 110m 23s (- 7m 18s) (260 93%) 0.2495
Epoch 15 | Iter 280 | 110m 26s (- 7m 15s) (280 93%) 0.2499
Epoch 15 | Iter 300 | 110m 30s (- 7m 13s) (300 93%) 0.2192
Epoch 15 | Iter 320 | 110m 33s (- 7m 10s) (320 93%) 0.2352
Epoch 15 | Iter 340 | 110m 36s (- 7m 8s) (340 93%) 0.2170
Epoch 15 | Iter 360 | 110m 39s (- 7m 5s) (360 93%) 0.2242
Epoch 15 | Iter 380 | 110m 43s (- 7m 3s) (380 94%) 0.2225
Epoch 15 | Iter 400 | 110m 46s (- 7m 0s) (400 94%) 0.2105
Epoch 15 | Iter 420 | 110m 49s (- 6m 58s) (420 94%) 0.2161
Epoch 15 | Iter 440 | 110m 52s (- 6m 56s) (440 94%) 0.1929
Epoc

Epoch 15 | Iter 2860 | 115m 58s (- 1m 52s) (2860 98%) 0.1543
Epoch 15 | Iter 2880 | 116m 0s (- 1m 49s) (2880 98%) 0.1407
Epoch 15 | Iter 2900 | 116m 2s (- 1m 47s) (2900 98%) 0.1322
Epoch 15 | Iter 2920 | 116m 4s (- 1m 44s) (2920 98%) 0.1452
Epoch 15 | Iter 2940 | 116m 6s (- 1m 42s) (2940 98%) 0.1343
Epoch 15 | Iter 2960 | 116m 8s (- 1m 39s) (2960 98%) 0.1446
Epoch 15 | Iter 2980 | 116m 10s (- 1m 37s) (2980 98%) 0.1509
Epoch 15 | Iter 3000 | 116m 12s (- 1m 34s) (3000 98%) 0.1441
Epoch 15 | Iter 3020 | 116m 14s (- 1m 32s) (3020 98%) 0.1411
Epoch 15 | Iter 3040 | 116m 16s (- 1m 29s) (3040 98%) 0.1449
Epoch 15 | Iter 3060 | 116m 18s (- 1m 27s) (3060 98%) 0.1335
Epoch 15 | Iter 3080 | 116m 20s (- 1m 24s) (3080 98%) 0.1425
Epoch 15 | Iter 3100 | 116m 22s (- 1m 22s) (3100 98%) 0.1287
Epoch 15 | Iter 3120 | 116m 24s (- 1m 19s) (3120 98%) 0.1416
Epoch 15 | Iter 3140 | 116m 26s (- 1m 17s) (3140 98%) 0.1382
Epoch 15 | Iter 3160 | 116m 28s (- 1m 14s) (3160 98%) 0.1791
Epoch 15 | Iter 3180 | 116m 3

In [21]:
class Args(object):
    def __init__(self, beam_size, nbest, decode_max_len):
        self.beam_size = beam_size
        self.nbest = nbest
        self.decode_max_len = decode_max_len

In [22]:
args = Args(30, 1, 15)

In [26]:
args = Args(30, 1, 21)
b = 2
n = 1
sample = te_dataset[b][n][1]
input_tensor = torch.tensor(kaldi_io.read_mat(sample["input"][0]["feat"]))
a = input_tensor.shape[0]//4*4
input_tensor = input_tensor[0:(a if a < MAX_LENGTH*4 else MAX_LENGTH*4 ),:]

g = input_tensor.shape[0]
print(torch.tensor([g]))

input_tensor = input_tensor.unsqueeze(0)
print("input_tensor:",input_tensor.shape)
b = model.recognize(input_tensor.to(device),torch.tensor([g]), char_list, args)
print("正解：", sample["output"][0]["text"])

tensor([800])
input_tensor: torch.Size([1, 800, 240])
remeined hypothes: 30
hypo: 温
hypo: 跟
hypo: 分
hypo: 奔
hypo: 文
hypo: 纷
hypo: 很
hypo: 问
hypo: 芬
hypo: 郑
hypo: 根
hypo: 尊
hypo: 微
hypo: 昆
hypo: 目
hypo: 奋
hypo: 封
hypo: 更
hypo: 深
hypo: 稳
hypo: 翁
hypo: 伦
hypo: 恩
hypo: 愤
hypo: 荆
hypo: 惠
hypo: 闷
hypo: 闻
hypo: 喷
hypo: 樊
remeined hypothes: 30
hypo: 温州
hypo: 跟周
hypo: 跟踪
hypo: 跟州
hypo: 分钟
hypo: 奔周
hypo: 温周
hypo: 跟洲
hypo: 跟着
hypo: 跟征
hypo: 文章
hypo: 跟真
hypo: 很多
hypo: 跟妆
hypo: 纷周
hypo: 芬州
hypo: 分州
hypo: 问真
hypo: 郑州
hypo: 温洲
hypo: 根州
hypo: 文周
hypo: 跟植
hypo: 跟卓
hypo: 尊州
hypo: 跟诸
hypo: 跟求
hypo: 跟筹
hypo: 分支
hypo: 分征
remeined hypothes: 30
hypo: 温州光
hypo: 跟周公
hypo: 跟踪光
hypo: 跟周光
hypo: 跟州光
hypo: 跟踪公
hypo: 分钟关
hypo: 跟踪关
hypo: 温州公
hypo: 温州高
hypo: 跟着光
hypo: 跟踪高
hypo: 跟洲光
hypo: 奔周公
hypo: 分钟光
hypo: 温周光
hypo: 跟征光
hypo: 奔周高
hypo: 温周公
hypo: 奔周光
hypo: 跟州公
hypo: 分钟高
hypo: 分钟公
hypo: 温州钢
hypo: 跟洲公
hypo: 跟周高
hypo: 跟真光
hypo: 跟周康
hypo: 跟周瓜
hypo: 跟真公
remeined hypothes: 30
hypo: 温州光速
hypo: 跟周公诉
hypo: 温州光塑
hypo: 温州光束
hypo

remeined hypothes: 24
hypo: 温州光速车火九二八三名遭虹桥市域遭疯抢续
hypo: 温州光速车火九二八三米遭疯抢续一遭疯抢续
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭疯抢续
hypo: 跟周公诉车祸九二八三名遭虹桥市域遭疯抢续
hypo: 温州光速车或九二八三米遭疯抢续一遭疯抢续
hypo: 温州光速车火九二八三名遭疯抢续一遭疯抢续
hypo: 跟周公诉车祸九二八三名遭虹桥市域遭公抢续
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭昏抢续
hypo: 温州光速车或九二八三名招呼将陷入二包大米
hypo: 温州光速车或九二八三名招呼将陷入二包大名
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭弹强续
hypo: 温州光速车或九二八三名招呼将陷入二包大鸣
hypo: 跟周公诉车祸九二八三名遭虹桥市域遭铜枪示
hypo: 温州光速车火九二八三鸣河西边路边家路一遭
hypo: 温州光速车火九二八三名遭虹桥市域遭弹强续
hypo: 温州光速车火九二八三鸣河西边路边家西路边
hypo: 跟周公诉车祸九二八三名遭虹桥市域遭公顷续
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭弹强蓄
hypo: 温州光速车或九二八三名招呼将陷入二包当名
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭弹枪续
hypo: 温州光速车火九二八三名遭虹桥市域遭疯抢市
hypo: 温州光速车火九二八三名遭虹桥市域遭弹强市
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭遇到虹
hypo: 温州光速车火九二八三鸣河西边路边家西安邦
no hypothesis. Finish decoding.
正解： 温州高速车祸九二包大米遭哄抢续带头者被拘


In [None]:
torch.cuda.memory_cached()/1000000

In [None]:
dataset[2]

In [None]:
data_list[1]

In [None]:
for ch in data_list[1]["corpus"]:
    print(ch.item())

In [None]:
d = torch.tensor([[[1,2,3],[4,5,6],[7,8,9],[10,11,12]]])

In [None]:
lang.index2word[1]

In [None]:
d.view((4,1,3))

In [None]:
d[0]