In [1]:
import os
import random
from io import open
import unicodedata
import string
import re

import torch
import torchaudio
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from pathlib import Path
import kaldi_io
import sys
import gc
import json
import time
from data_4 import AudioDataLoader, AudioDataset, pad_list

%matplotlib inline

# Module-level flag. No visible cell reads it.
# NOTE(review): presumably a leftover debug switch — confirm before removing.
print_use = False

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



In [2]:
# Paths and loader settings consumed by the dataset cells below.
train_json = "data.json"  # training manifest; its 'utts' entry is read in the next cell
test_json = "data_test.json"  # test manifest handed to AudioDataset
batch_size = 32  # second positional argument of AudioDataset
maxlen_in = 100000  # third AudioDataset argument — presumably an input-length cap; TODO confirm in data_4
maxlen_out = 30  # fourth AudioDataset argument — presumably an output-length cap; TODO confirm in data_4
num_workers = 4  # forwarded to AudioDataLoader

## 加载数据

In [3]:
# Parse the training manifest and keep only the mapping stored under 'utts'.
with open(train_json, 'rb') as f:
    data = json.load(f)['utts']

In [4]:
# Utterances ordered longest-first by the first entry of input.shape.
# NOTE(review): the decoding cell indexes sample["input"][0][...], i.e. treats
# 'input' as a list — verify that the train manifest really uses a dict here.
sorted_data = sorted(
    data.items(),
    key=lambda utt: int(utt[1]['input']['shape'][0]),
    reverse=True,
)

In [5]:
# Training data. batch_size goes to the dataset while the loader itself uses
# batch_size=1 — presumably AudioDataset already groups utterances into
# minibatches; TODO confirm in data_4.
tr_dataset = AudioDataset(train_json, batch_size, maxlen_in, maxlen_out)
tr_loader = AudioDataLoader(
    tr_dataset,
    batch_size=1,
    num_workers=num_workers,
)


In [6]:
# Test data, built exactly like the training set but from test_json.
te_dataset = AudioDataset(test_json, batch_size, maxlen_in, maxlen_out)
te_loader = AudioDataLoader(
    te_dataset,
    batch_size=1,
    num_workers=num_workers,
)

In [7]:
# Build the id -> character table used to render decoded hypotheses.
# The first whitespace-separated field of every non-empty line is kept, in
# file order, so the list index is the line's position in the file.
char_list = []
char_list_path = "train_chars.txt"
# Explicit encoding so the result does not depend on the platform locale.
# Assumes the vocabulary file is UTF-8 — TODO confirm.
with open(char_list_path, "r", encoding="utf-8") as f:
    for line in f:
        # A dedicated name keeps the manifest dict `data` loaded earlier intact.
        fields = line.split()
        if not fields:
            # A blank (e.g. trailing) line used to raise IndexError.
            continue
        char_list.append(fields[0])

# 模型搭建

In [8]:
# The decoding cell truncates inputs to at most MAX_LENGTH*4 frames.
MAX_LENGTH= 200
# Not referenced by any visible cell; the model setup cell defines its own
# sos_id / eos_id with the same values.
SOS_token = 0
EOS_token = 1
# Restrict this process to physical GPU "2".
# NOTE(review): hard-coded device index; this has to run before CUDA is first
# initialised to take effect — confirm on the target machine.
os.environ["CUDA_VISIBLE_DEVICES"]="2"
# Falls back to the CPU when CUDA is unavailable.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Attention

In [9]:
class DotProductAttention(nn.Module):
    r"""Parameter-free dot-product attention.

    For every query vector, scores each value vector by their dot product,
    normalises the scores with a softmax over the value positions, and returns
    the resulting weighted sum of the values.

    There is no learned output projection here; the caller is expected to
    combine the returned context with the query itself. Value positions are
    not masked, so every one of the Ti positions receives some weight.

    NOTE: terminology follows Stanford cs224n-2018-lecture11.
    """

    def __init__(self):
        super(DotProductAttention, self).__init__()

    def forward(self, queries, values):
        """
        Args:
            queries: N x To x H
            values : N x Ti x H

        Returns:
            attention_output: N x To x H
            attention_distribution: N x To x Ti (each row sums to 1)
        """
        # (N, To, H) * (N, H, Ti) -> (N, To, Ti)
        attention_scores = torch.bmm(queries, values.transpose(1, 2))
        # Normalise over the value positions (last axis) directly; no need to
        # flatten to 2-D and reshape back.
        attention_distribution = F.softmax(attention_scores, dim=-1)
        # (N, To, Ti) * (N, Ti, H) -> (N, To, H)
        attention_output = torch.bmm(attention_distribution, values)

        return attention_output, attention_distribution

### 金字塔BLSTM

In [33]:
class pyramidalBLSTM(nn.Module):
    """Three-layer pyramidal (B)LSTM encoder.

    Between consecutive layers every two neighbouring time steps are
    concatenated along the feature axis, so each layer sees half as many steps
    as the one below and the final output has ``T // 4`` steps.

    Args:
        input_size: feature width of the input frames.
        hidden_size: hidden width of each LSTM, per direction.
        num_layers: stored on the instance but not used; the pyramid always
            has three single-layer LSTMs.
        dropout: forwarded to each ``nn.LSTM``.
        bidirectional: whether each LSTM is bidirectional.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout=0.0, bidirectional=True):
        super(pyramidalBLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        # Width of two stacked output frames. For the bidirectional default
        # this is hidden_size * 4, exactly as before; deriving it from the
        # direction count also makes bidirectional=False work.
        num_directions = 2 if bidirectional else 1
        stacked_size = hidden_size * num_directions * 2
        self.lstm1 = nn.LSTM(input_size, hidden_size,
                             batch_first=True,
                             dropout=dropout,
                             bidirectional=bidirectional)
        self.lstm2 = nn.LSTM(stacked_size, hidden_size,
                             batch_first=True,
                             dropout=dropout,
                             bidirectional=bidirectional)
        self.lstm3 = nn.LSTM(stacked_size, hidden_size,
                             batch_first=True,
                             dropout=dropout,
                             bidirectional=bidirectional)

    @staticmethod
    def _stack_frame_pairs(padded):
        """Merge every two consecutive steps: (N, T, D) -> (N, T // 2, 2 * D).

        A trailing odd step is dropped so the reshape is always valid.
        """
        n, t, d = padded.shape
        even_t = t - (t % 2)
        return padded[:, :even_t, :].reshape(n, even_t // 2, d * 2)

    @staticmethod
    def _run_layer(lstm, padded, lengths, total_length):
        """Run one LSTM over a padded batch via pack / unpack."""
        packed = pack_padded_sequence(padded, lengths, batch_first=True)
        packed_out, hidden = lstm(packed)
        padded_out, _ = pad_packed_sequence(packed_out,
                                            batch_first=True,
                                            total_length=total_length)
        return padded_out, hidden

    def forward(self, padded_input, input_lengths):
        """
        Args:
            padded_input: N x T x input_size, ordered by decreasing length
                (pack_padded_sequence is called with its default
                enforce_sorted=True).
            input_lengths: valid length of each sequence (tensor or sequence
                of ints). Every ``length // 4`` must be at least 1, otherwise
                pack_padded_sequence rejects the batch.

        Returns:
            output: N x (T // 4) x (hidden_size * num_directions)
            (hidden1, hidden2, hidden3): final (h_n, c_n) of each LSTM.
        """
        total_length = padded_input.size(1)
        # pack_padded_sequence wants the lengths on the CPU, wherever the
        # features live.
        lengths = torch.as_tensor(input_lengths).cpu()

        padded_output1, hidden1 = self._run_layer(
            self.lstm1, padded_input, lengths, total_length)

        # Two steps become one.
        padded_input2 = self._stack_frame_pairs(padded_output1)
        padded_output2, hidden2 = self._run_layer(
            self.lstm2, padded_input2, lengths // 2, total_length // 2)

        padded_input3 = self._stack_frame_pairs(padded_output2)
        # The third level must go through lstm3; it was previously routed
        # through lstm2 again, leaving lstm3 unused and untrained.
        output, hidden3 = self._run_layer(
            self.lstm3, padded_input3, lengths // 4, total_length // 4)

        return output, (hidden1, hidden2, hidden3)
        

In [34]:
# Scratch: a small 2 x 2 x 3 integer tensor for trying out reshapes.
a = torch.tensor([
    [[1, 2, 3], [4, 5, 6]],
    [[7, 8, 9], [10, 11, 12]],
])

In [35]:
# Scratch: random 32 x 2236 x 512 tensor; rebinds `a` from the previous cell.
a = torch.rand([32, 2236, 512])
# Displayed as the cell's output.
a.shape

torch.Size([32, 2236, 512])

In [36]:
# a.reshape(a.shape[0],a.shape[1]//2,a.shape[2]*2)

## 模型构建

In [37]:
class Encoder(nn.Module):
    """Thin wrapper that delegates encoding to a pyramidalBLSTM."""

    def __init__(self, input_size, hidden_size, dropout=0.0):
        super().__init__()
        self.hidden_size = hidden_size
        # Not read anywhere in the visible code.
        self.first = True

        # The third positional argument (num_layers) is fixed to 1.
        self.pyramidalBLSTM = pyramidalBLSTM(input_size, hidden_size, 1, dropout=dropout)

    def forward(self, input, input_lengths):
        """Return whatever the wrapped network yields: (output, hidden)."""
        encoded, states = self.pyramidalBLSTM(input, input_lengths)
        return encoded, states

    def initHidden(self):
        """Return a zero tensor of shape (1, 1, hidden_size) on the global `device`."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape, device=device)

In [38]:
class Decoder(nn.Module):
    """Attention-based LSTMCell decoder.

    Each step feeds the previous token embedding concatenated with the
    previous attention context into a stack of LSTMCells, attends over the
    encoder outputs with the top layer's hidden state, and maps
    [hidden state; context] to vocabulary logits with a small MLP.

    ``forward`` relies on two names from the surrounding notebook: the
    module-level constant ``IGNORE_ID`` and the ``pad_list`` helper.
    """

    def __init__(self, vocab_size, embedding_dim, sos_id, eos_id, hidden_size,
                 num_layers, bidirectional_encoder=True):
        super(Decoder, self).__init__()
        # Embedding / output sizes.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.sos_id = sos_id  # Start of Sentence
        self.eos_id = eos_id  # End of Sentence
        # Recurrent stack.
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional_encoder = bidirectional_encoder  # stored, not used
        # Dot-product attention multiplies decoder states with encoder
        # outputs, so both widths have to be the same.
        self.encoder_hidden_size = hidden_size
        # Components (creation order kept: embedding, rnn, attention, mlp).
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.rnn = nn.ModuleList()
        self.rnn += [nn.LSTMCell(self.embedding_dim +
                                 self.encoder_hidden_size, self.hidden_size)]
        for _ in range(1, self.num_layers):
            self.rnn += [nn.LSTMCell(self.hidden_size, self.hidden_size)]
        self.attention = DotProductAttention()
        self.mlp = nn.Sequential(
            nn.Linear(self.encoder_hidden_size + self.hidden_size,
                      self.hidden_size),
            nn.Tanh(),
            nn.Linear(self.hidden_size, self.vocab_size))

    def zero_state(self, encoder_padded_outputs, H=None):
        """Return an N x H zero tensor with the dtype/device of the encoder outputs.

        Args:
            encoder_padded_outputs: tensor whose first dimension is N.
            H: width of the state; defaults to ``self.hidden_size``.
        """
        N = encoder_padded_outputs.size(0)
        H = self.hidden_size if H is None else H
        return encoder_padded_outputs.new_zeros(N, H)

    def forward(self, padded_input, encoder_padded_outputs):
        """Teacher-forced pass that returns the cross-entropy loss.

        Args:
            padded_input: N x To target ids, padded with IGNORE_ID
            encoder_padded_outputs: N x Ti x H

        Returns:
            ce_loss: scalar mean cross-entropy over all non-ignored targets.
        """
        # *********Get Input and Output
        # Strip the padding from every target sequence.
        ys = [y[y != IGNORE_ID] for y in padded_input]
        # Decoder inputs start with sos; reference outputs end with eos.
        eos = ys[0].new([self.eos_id])
        sos = ys[0].new([self.sos_id])
        ys_in = [torch.cat([sos, y], dim=0) for y in ys]
        ys_out = [torch.cat([y, eos], dim=0) for y in ys]
        # Inputs are padded with eos, references with IGNORE_ID so the padded
        # positions are skipped by the loss.
        ys_in_pad = pad_list(ys_in, self.eos_id)
        ys_out_pad = pad_list(ys_out, IGNORE_ID)
        assert ys_in_pad.size() == ys_out_pad.size()
        batch_size = ys_in_pad.size(0)
        output_length = ys_in_pad.size(1)

        # *********Init decoder rnn
        h_list = [self.zero_state(encoder_padded_outputs)
                  for _ in range(self.num_layers)]
        c_list = [self.zero_state(encoder_padded_outputs)
                  for _ in range(self.num_layers)]
        att_c = self.zero_state(encoder_padded_outputs,
                                H=encoder_padded_outputs.size(2))
        y_all = []

        # **********LAS: 1. decoder rnn 2. attention 3. concate and MLP
        embedded = self.embedding(ys_in_pad)
        for t in range(output_length):
            # step 1. decoder RNN: s_i = RNN(s_i-1, y_i-1, c_i-1)
            rnn_input = torch.cat((embedded[:, t, :], att_c), dim=1)
            h_list[0], c_list[0] = self.rnn[0](
                rnn_input, (h_list[0], c_list[0]))
            for l in range(1, self.num_layers):
                h_list[l], c_list[l] = self.rnn[l](
                    h_list[l-1], (h_list[l], c_list[l]))
            rnn_output = h_list[-1]
            # step 2. attention: c_i = AttentionContext(s_i, h)
            # unsqueeze: (N x H) -> (N x 1 x H)
            att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
                                          encoder_padded_outputs)
            att_c = att_c.squeeze(dim=1)
            # step 3. concatenate s_i and c_i, and feed the MLP
            mlp_input = torch.cat((rnn_output, att_c), dim=1)
            predicted_y_t = self.mlp(mlp_input)
            y_all.append(predicted_y_t)

        y_all = torch.stack(y_all, dim=1)  # N x To x C
        # **********Cross Entropy Loss
        # F.cross_entropy = NLL(log_softmax(input), target)
        y_all = y_all.view(batch_size * output_length, self.vocab_size)
        ce_loss = F.cross_entropy(y_all, ys_out_pad.view(-1),
                                  ignore_index=IGNORE_ID,
                                  reduction='mean')

        return ce_loss

    @torch.no_grad()  # inference only: do not build an autograd graph per step
    def recognize_beam(self, encoder_outputs, char_list, args):
        """Beam search over a single utterance.

        Progress and the current partial hypotheses are printed at every step.

        Args:
            encoder_outputs: T x H
            char_list: list of characters, indexed by token id
            args: object with ``beam_size``, ``nbest`` and ``decode_max_len``
                (0 means "use T as the maximum length")

        Returns:
            nbest_hyps: up to ``nbest`` finished hypotheses, best first. Each
                is a dict with keys 'score', 'yseq', 'h_prev', 'c_prev' and
                'a_prev'; 'yseq' starts with sos_id and ends with eos_id.
        """
        # search params
        beam = args.beam_size
        nbest = args.nbest
        if args.decode_max_len == 0:
            maxlen = encoder_outputs.size(0)
        else:
            maxlen = args.decode_max_len

        # *********Init decoder rnn
        encoder_batch = encoder_outputs.unsqueeze(0)  # 1 x T x H
        h_list = [self.zero_state(encoder_batch)
                  for _ in range(self.num_layers)]
        c_list = [self.zero_state(encoder_batch)
                  for _ in range(self.num_layers)]
        att_c = self.zero_state(encoder_batch, H=encoder_batch.size(2))
        # One-element id tensor reused as the embedding input of every step.
        vy = encoder_outputs.new_zeros(1).long()

        hyp = {'score': 0.0, 'yseq': [self.sos_id],
               'c_prev': list(c_list), 'h_prev': list(h_list),
               'a_prev': att_c}
        hyps = [hyp]
        ended_hyps = []

        for i in range(maxlen):
            hyps_best_kept = []
            for hyp in hyps:
                vy[0] = hyp['yseq'][i]
                embedded = self.embedding(vy)
                # step 1. decoder RNN: s_i = RNN(s_i-1, y_i-1, c_i-1)
                rnn_input = torch.cat((embedded, hyp['a_prev']), dim=1)
                h_list[0], c_list[0] = self.rnn[0](
                    rnn_input, (hyp['h_prev'][0], hyp['c_prev'][0]))
                for l in range(1, self.num_layers):
                    h_list[l], c_list[l] = self.rnn[l](
                        h_list[l-1], (hyp['h_prev'][l], hyp['c_prev'][l]))
                rnn_output = h_list[-1]
                # step 2. attention: c_i = AttentionContext(s_i, h)
                # unsqueeze: (N x H) -> (N x 1 x H)
                att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
                                              encoder_batch)
                att_c = att_c.squeeze(dim=1)
                # step 3. concatenate s_i and c_i, and feed the MLP
                mlp_input = torch.cat((rnn_output, att_c), dim=1)
                predicted_y_t = self.mlp(mlp_input)
                local_scores = F.log_softmax(predicted_y_t, dim=1)
                # topk cannot return more candidates than there are classes.
                n_candidates = min(beam, local_scores.size(1))
                local_best_scores, local_best_ids = torch.topk(
                    local_scores, n_candidates, dim=1)

                for j in range(n_candidates):
                    new_hyp = {
                        # Copy the lists: h_list / c_list are overwritten
                        # while expanding the next hypothesis.
                        'h_prev': h_list[:],
                        'c_prev': c_list[:],
                        'a_prev': att_c,
                        'score': hyp['score'] + local_best_scores[0, j],
                        'yseq': hyp['yseq'] + [int(local_best_ids[0, j])],
                    }
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(hyps_best_kept,
                                        key=lambda x: x['score'],
                                        reverse=True)[:beam]
            # end for hyp in hyps
            hyps = hyps_best_kept

            # Force eos in the final loop so that at least one hypothesis ends.
            if i == maxlen - 1:
                for hyp in hyps:
                    hyp['yseq'].append(self.eos_id)

            # Move finished hypotheses to the final list; the live beam may
            # therefore shrink below `beam`.
            remained_hyps = []
            for hyp in hyps:
                if hyp['yseq'][-1] == self.eos_id:
                    ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            hyps = remained_hyps
            if len(hyps) > 0:
                print('remeined hypothes: ' + str(len(hyps)))
            else:
                print('no hypothesis. Finish decoding.')
                break

            for hyp in hyps:
                print('hypo: ' + ''.join([char_list[int(x)]
                                          for x in hyp['yseq'][1:]]))
        # end for i in range(maxlen)
        nbest_hyps = sorted(ended_hyps, key=lambda x: x['score'], reverse=True)[
            :min(len(ended_hyps), nbest)]
        return nbest_hyps

In [39]:
class Seq2Seq(nn.Module):
    """Sequence-to-Sequence architecture with configurable encoder and decoder.

    The encoder is called as ``encoder(padded_input, input_lengths)`` and must
    return ``(outputs, hidden)``; the decoder is called as
    ``decoder(padded_target, encoder_outputs)`` for training and through
    ``decoder.recognize_beam(...)`` for decoding.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, padded_input, input_lengths, padded_target):
        """Compute the training loss for one batch.

        Args:
            padded_input: N x Ti x D
            input_lengths: valid length of each input sequence
            padded_target: N x To

        Returns:
            The value returned by the decoder (its loss).
        """
        encoder_padded_outputs, _ = self.encoder(padded_input, input_lengths)
        loss = self.decoder(padded_target, encoder_padded_outputs)
        return loss

    def recognize(self, input, input_lengths, char_list, args):
        """Sequence-to-Sequence beam search, one utterance at a time.

        Args:
            input: 1 x T x D (a batch holding a single utterance; the leading
                dimension is squeezed away after encoding)
            input_lengths: length tensor for that single utterance
            char_list: list of characters
            args: beam-search options forwarded to the decoder

        Returns:
            nbest_hyps: whatever ``decoder.recognize_beam`` returns.
        """
        # Decoding never needs gradients; skipping autograd keeps memory flat
        # for long utterances and wide beams.
        with torch.no_grad():
            encoder_outputs, _ = self.encoder(input, input_lengths)
            nbest_hyps = self.decoder.recognize_beam(
                encoder_outputs.squeeze(0), char_list, args)
        return nbest_hyps


## 单步训练辅助：计时函数

In [40]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'.

    Both parts are rendered with %d, so fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' for a job started at `since`.

    Args:
        since: start timestamp as returned by time.time().
        percent: fraction of the work already done (must be non-zero); the
            total duration is extrapolated as elapsed / percent.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

## 训练迭代

In [42]:
def trainIters(model, epoch, optimizier, print_every=10, plot_every=10, learning_rate=0.01):
    """Train `model` for `epoch` passes over the global `tr_loader`.

    Progress lines are printed and also written to 'train.log' (overwritten on
    every call). Relies on the notebook globals `tr_dataset`, `tr_loader` and
    `timeSince`.

    Args:
        model: module called as model(padded_input, input_lengths,
            padded_target) and returning a scalar loss.
        epoch: number of passes over the training loader.
        optimizier: optimizer that updates the model's parameters.
        print_every: report the running mean loss every this many iterations.
        plot_every: record the running mean loss every this many iterations.
        learning_rate: unused; kept so existing calls keep working (the
            learning rate is owned by `optimizier`).

    Returns:
        List of mean losses, one per `plot_every` iterations.
    """
    n_iters = len(tr_dataset)
    total_iters = n_iters * epoch
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    # Follow the model's own device instead of hard-coding .cuda().
    model_device = next(model.parameters()).device

    # The context manager closes the log even if training raises.
    with open('train.log', 'w') as log:
        start = time.time()
        for e in range(epoch):
            for i, data in enumerate(tr_loader):
                padded_input, input_lengths, padded_target = data
                padded_input = padded_input.to(model_device)
                padded_target = padded_target.to(model_device)
                # input_lengths is left where the loader put it: the encoder
                # only feeds it to pack_padded_sequence, which takes CPU
                # lengths.
                loss = model(padded_input, input_lengths, padded_target)
                loss_value = float(loss)
                print_loss_total += loss_value
                plot_loss_total += loss_value

                optimizier.zero_grad()
                loss.backward()
                optimizier.step()

                if (i + 1) % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    progress = (e * n_iters + i + 1) / total_iters
                    txt = 'Epoch %d | Iter %d | %s (%d %d%%) %.4f' % (
                        e + 1, i + 1, timeSince(start, progress),
                        (i + 1), progress * 100, print_loss_avg)
                    print(txt)
                    log.write(txt + "\n")
                    log.flush()
                # Parenthesised on purpose: `i+1 % plot_every` parses as
                # `i + (1 % plot_every)` and was never zero.
                if (i + 1) % plot_every == 0:
                    plot_loss_avg = plot_loss_total / plot_every
                    plot_losses.append(plot_loss_avg)
                    plot_loss_total = 0

    return plot_losses

In [None]:
# Feature width handed to the encoder's first LSTM.
# NOTE(review): the recorded output of the decoding cell further down shows
# 240 features per frame — confirm which value matches the data.
input_size = 40

# Per-direction hidden width of the encoder LSTMs.
hidden_size = 256
vocab_size = len(char_list)
embedding_dim = 512
sos_id = 0
eos_id = 1
# Learning rate and weight decay given to Adam below; `momentum` only appears
# in a commented-out keyword argument.
learning_rate = 1e-3
momentum = 0
l2 = 1e-5

# Read as a global by Decoder.forward (target padding / ignored loss index),
# so it has to exist before training starts.
IGNORE_ID=-1

encoder = Encoder(input_size, hidden_size, dropout=0.0)
# The decoder width is hidden_size*2 so that it equals the width of the
# bidirectional encoder output (required by the dot-product attention).
decoder = Decoder(vocab_size, embedding_dim, sos_id, eos_id, hidden_size*2,
                 num_layers=1, bidirectional_encoder=True)

model = Seq2Seq(encoder, decoder)
print(model)
model.cuda()

optimizier = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
#                                      momentum=momentum,
                                     weight_decay=l2)
# 30 epochs, progress line every 20 iterations.
trainIters(model, 30,optimizier, print_every=20)

Seq2Seq(
  (encoder): Encoder(
    (pyramidalBLSTM): pyramidalBLSTM(
      (lstm1): LSTM(40, 256, batch_first=True, bidirectional=True)
      (lstm2): LSTM(1024, 256, batch_first=True, bidirectional=True)
      (lstm3): LSTM(1024, 256, batch_first=True, bidirectional=True)
    )
  )
  (decoder): Decoder(
    (embedding): Embedding(4520, 512)
    (rnn): ModuleList(
      (0): LSTMCell(1024, 512)
    )
    (attention): DotProductAttention()
    (mlp): Sequential(
      (0): Linear(in_features=1024, out_features=512, bias=True)
      (1): Tanh()
      (2): Linear(in_features=512, out_features=4520, bias=True)
    )
  )
)
Epoch 1 | Iter 20 | 0m 13s (- 6268m 11s) (20 0%) 6.9429
Epoch 1 | Iter 40 | 0m 25s (- 5678m 34s) (40 0%) 6.3361
Epoch 1 | Iter 60 | 0m 35s (- 5365m 26s) (60 0%) 6.2632
Epoch 1 | Iter 80 | 0m 45s (- 5153m 45s) (80 0%) 6.3083
Epoch 1 | Iter 100 | 0m 54s (- 4889m 12s) (100 0%) 6.3281
Epoch 1 | Iter 120 | 1m 2s (- 4717m 53s) (120 0%) 6.2642
Epoch 1 | Iter 140 | 1m 10s (- 4542

Epoch 1 | Iter 2600 | 11m 57s (- 2478m 53s) (2600 0%) 3.1231
Epoch 1 | Iter 2620 | 12m 1s (- 2474m 35s) (2620 0%) 3.2000
Epoch 1 | Iter 2640 | 12m 5s (- 2470m 5s) (2640 0%) 3.2016
Epoch 1 | Iter 2660 | 12m 9s (- 2465m 51s) (2660 0%) 3.2841
Epoch 1 | Iter 2680 | 12m 14s (- 2461m 52s) (2680 0%) 3.2082
Epoch 1 | Iter 2700 | 12m 18s (- 2458m 8s) (2700 0%) 3.1423
Epoch 1 | Iter 2720 | 12m 22s (- 2453m 53s) (2720 0%) 3.2390
Epoch 1 | Iter 2740 | 12m 26s (- 2449m 15s) (2740 0%) 2.9885
Epoch 1 | Iter 2760 | 12m 31s (- 2446m 7s) (2760 0%) 3.1719
Epoch 1 | Iter 2780 | 12m 35s (- 2442m 53s) (2780 0%) 3.1430
Epoch 1 | Iter 2800 | 12m 40s (- 2439m 39s) (2800 0%) 3.1908
Epoch 1 | Iter 2820 | 12m 44s (- 2434m 56s) (2820 0%) 3.0825
Epoch 1 | Iter 2840 | 12m 48s (- 2430m 9s) (2840 0%) 3.2142


In [None]:
class Args:
    """Plain holder for the options read by beam-search decoding.

    Attributes are stored exactly as given: beam_size, nbest, decode_max_len.
    """

    def __init__(self, beam_size, nbest, decode_max_len):
        (self.beam_size,
         self.nbest,
         self.decode_max_len) = (beam_size, nbest, decode_max_len)

In [None]:
# Beam-search options: beam_size=30, nbest=1, decode_max_len=15.
# The decoding cell below rebinds `args` with decode_max_len=21.
args = Args(30, 1, 15)

In [26]:
# Decode one test utterance with beam search and print the reference text.
# NOTE(review): the recorded output below reports 240 features per frame while
# the setup cell builds the encoder with input_size = 40 — looks stale; confirm.
args = Args(30, 1, 21)
# Pick entry `n` of item `b` of the test set; element [1] of that entry is the
# dict that holds "input" and "output".
b = 2
n = 1
sample = te_dataset[b][n][1]
input_tensor = torch.tensor(kaldi_io.read_mat(sample["input"][0]["feat"]))
# Largest multiple of 4 not exceeding the frame count (the encoder halves the
# time axis twice).
a = input_tensor.shape[0]//4*4
# Keep at most MAX_LENGTH*4 frames.
input_tensor = input_tensor[0:(a if a < MAX_LENGTH*4 else MAX_LENGTH*4 ),:]

g = input_tensor.shape[0]
print(torch.tensor([g]))

# Add the batch dimension expected by the encoder: (T, D) -> (1, T, D).
input_tensor = input_tensor.unsqueeze(0)
print("input_tensor:",input_tensor.shape)
# `b` is rebound here from the index above to the list of n-best hypotheses.
b = model.recognize(input_tensor.to(device),torch.tensor([g]), char_list, args)
print("正解：", sample["output"][0]["text"])

tensor([800])
input_tensor: torch.Size([1, 800, 240])
remeined hypothes: 30
hypo: 温
hypo: 跟
hypo: 分
hypo: 奔
hypo: 文
hypo: 纷
hypo: 很
hypo: 问
hypo: 芬
hypo: 郑
hypo: 根
hypo: 尊
hypo: 微
hypo: 昆
hypo: 目
hypo: 奋
hypo: 封
hypo: 更
hypo: 深
hypo: 稳
hypo: 翁
hypo: 伦
hypo: 恩
hypo: 愤
hypo: 荆
hypo: 惠
hypo: 闷
hypo: 闻
hypo: 喷
hypo: 樊
remeined hypothes: 30
hypo: 温州
hypo: 跟周
hypo: 跟踪
hypo: 跟州
hypo: 分钟
hypo: 奔周
hypo: 温周
hypo: 跟洲
hypo: 跟着
hypo: 跟征
hypo: 文章
hypo: 跟真
hypo: 很多
hypo: 跟妆
hypo: 纷周
hypo: 芬州
hypo: 分州
hypo: 问真
hypo: 郑州
hypo: 温洲
hypo: 根州
hypo: 文周
hypo: 跟植
hypo: 跟卓
hypo: 尊州
hypo: 跟诸
hypo: 跟求
hypo: 跟筹
hypo: 分支
hypo: 分征
remeined hypothes: 30
hypo: 温州光
hypo: 跟周公
hypo: 跟踪光
hypo: 跟周光
hypo: 跟州光
hypo: 跟踪公
hypo: 分钟关
hypo: 跟踪关
hypo: 温州公
hypo: 温州高
hypo: 跟着光
hypo: 跟踪高
hypo: 跟洲光
hypo: 奔周公
hypo: 分钟光
hypo: 温周光
hypo: 跟征光
hypo: 奔周高
hypo: 温周公
hypo: 奔周光
hypo: 跟州公
hypo: 分钟高
hypo: 分钟公
hypo: 温州钢
hypo: 跟洲公
hypo: 跟周高
hypo: 跟真光
hypo: 跟周康
hypo: 跟周瓜
hypo: 跟真公
remeined hypothes: 30
hypo: 温州光速
hypo: 跟周公诉
hypo: 温州光塑
hypo: 温州光束
hypo

remeined hypothes: 24
hypo: 温州光速车火九二八三名遭虹桥市域遭疯抢续
hypo: 温州光速车火九二八三米遭疯抢续一遭疯抢续
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭疯抢续
hypo: 跟周公诉车祸九二八三名遭虹桥市域遭疯抢续
hypo: 温州光速车或九二八三米遭疯抢续一遭疯抢续
hypo: 温州光速车火九二八三名遭疯抢续一遭疯抢续
hypo: 跟周公诉车祸九二八三名遭虹桥市域遭公抢续
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭昏抢续
hypo: 温州光速车或九二八三名招呼将陷入二包大米
hypo: 温州光速车或九二八三名招呼将陷入二包大名
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭弹强续
hypo: 温州光速车或九二八三名招呼将陷入二包大鸣
hypo: 跟周公诉车祸九二八三名遭虹桥市域遭铜枪示
hypo: 温州光速车火九二八三鸣河西边路边家路一遭
hypo: 温州光速车火九二八三名遭虹桥市域遭弹强续
hypo: 温州光速车火九二八三鸣河西边路边家西路边
hypo: 跟周公诉车祸九二八三名遭虹桥市域遭公顷续
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭弹强蓄
hypo: 温州光速车或九二八三名招呼将陷入二包当名
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭弹枪续
hypo: 温州光速车火九二八三名遭虹桥市域遭疯抢市
hypo: 温州光速车火九二八三名遭虹桥市域遭弹强市
hypo: 跟周公诉车祸九二八三名遭虹抢市域遭遇到虹
hypo: 温州光速车火九二八三鸣河西边路边家西安邦
no hypothesis. Finish decoding.
正解： 温州高速车祸九二包大米遭哄抢续带头者被拘


In [None]:
# Value of torch.cuda.memory_cached() divided by 1e6.
# NOTE(review): newer PyTorch releases expose this as
# torch.cuda.memory_reserved() — confirm against the installed version.
torch.cuda.memory_cached()/1000000

In [None]:
# NOTE(review): `dataset` is not defined in any visible cell; this fails on a
# fresh kernel unless it is defined elsewhere — confirm or remove.
dataset[2]

In [None]:
# NOTE(review): `data_list` is not defined in any visible cell — confirm or remove.
data_list[1]

In [None]:
# Print every element of data_list[1]["corpus"] via .item().
# NOTE(review): `data_list` is not defined in any visible cell — confirm or remove.
for ch in data_list[1]["corpus"]:
    print(ch.item())

In [None]:
# Scratch 1 x 4 x 3 integer tensor used by the view / indexing cells below.
d = torch.tensor([[[1,2,3],[4,5,6],[7,8,9],[10,11,12]]])

In [None]:
# NOTE(review): `lang` is not defined in any visible cell — confirm or remove.
lang.index2word[1]

In [None]:
# The same 12 elements of `d` viewed with shape 4 x 1 x 3.
d.view((4,1,3))

In [None]:
# First (and only) entry along dim 0 of `d`: the 4 x 3 matrix.
d[0]