In [1]:
import os
import random
from io import open
import unicodedata
import string
import re

import torch
import torchaudio
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from pathlib import Path
import kaldi_io
import sys
import gc
import json
import time
from data import AudioDataLoader, AudioDataset, pad_list

%matplotlib inline

print_use = False

In [2]:
# JSON manifests for the training and test splits.
# NOTE(review): these are absolute paths on the original author's machine;
# they must be adjusted before the notebook can run anywhere else.
train_json = "/home/meichaoyang/workspace/Listen-Attend-Spell/egs/aishell/dump/train/deltatrue/data.json"
test_json = "/home/meichaoyang/workspace/Listen-Attend-Spell/egs/aishell/dump/test/deltatrue/data.json"
# The next three values are passed positionally to AudioDataset
# (2nd, 3rd and 4th argument respectively).
batch_size = 32
maxlen_in = 100000
maxlen_out = 30
# Worker count handed to AudioDataLoader.
num_workers = 4

## 加载数据

In [3]:
# Training set built from the JSON manifest. AudioDataset / AudioDataLoader
# come from the project-local `data` module, so their exact semantics are not
# visible in this notebook.
tr_dataset = AudioDataset(train_json, batch_size,
                              maxlen_in, maxlen_out)


# batch_size=1 on the loader: presumably each dataset item is already a whole
# minibatch of `batch_size` utterances -- TODO confirm against data.py.
tr_loader = AudioDataLoader(tr_dataset, batch_size=1, num_workers=num_workers)



In [4]:
# Test set and loader, built with the same settings as the training ones.
# The inference cell further down indexes te_dataset as
# te_dataset[batch][utterance][1] to obtain a single sample dict.
te_dataset = AudioDataset(test_json, batch_size,
                              maxlen_in, maxlen_out)
te_loader = AudioDataLoader(te_dataset, batch_size=1, num_workers=num_workers)

In [5]:
# Load the raw training manifest for manual inspection.
# NOTE(review): json_data is not referenced by any other visible cell.
with open(train_json, 'rb') as f:
    json_data = json.load(f)

In [6]:
# Output vocabulary: one token per line of the dictionary file, where the
# token is the first whitespace-separated field. A token's position in
# char_list is the id used to map decoded ids back to characters.
char_list = []
# NOTE(review): absolute path on the original author's machine.
char_list_path = "/home/meichaoyang/workspace/Listen-Attend-Spell/egs/aishell/data/lang_1char/train_chars.txt"
# The dictionary contains Chinese characters, so decode it explicitly as
# UTF-8 instead of relying on the platform's default locale encoding.
with open(char_list_path, "r", encoding="utf-8") as f:
    for line in f:
        data = line.split()
        if not data:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing with IndexError on data[0].
            continue
        char_list.append(data[0])

In [7]:
# Sanity check: show the first dictionary entry (id 0).
char_list[0]

'<unk>'

# 模型搭建

In [8]:
# The inference cell truncates an utterance to at most MAX_LENGTH*4 frames.
MAX_LENGTH= 200
# NOTE(review): SOS_token / EOS_token are not referenced in the visible cells;
# the model set-up cell defines its own sos_id / eos_id.
SOS_token = 0
EOS_token = 1
# Restrict this process to physical GPU 2; it is then addressed as "cuda:0".
# NOTE(review): this only takes effect if CUDA has not been initialised yet
# in this kernel -- confirm when re-running cells out of order.
os.environ["CUDA_VISIBLE_DEVICES"]="2"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Attention

In [9]:
class DotProductAttention(nn.Module):
    r"""Parameter-free dot-product attention.

    Each query is scored against every value vector with an inner product,
    the scores are normalised over the value positions with a softmax, and
    the values are averaged with those weights.

    NOTE: Here we use the terminology in Stanford cs224n-2018-lecture11.
    """

    def __init__(self):
        super(DotProductAttention, self).__init__()
        # TODO: an optional output projection (Linear(2*H, H) + tanh over the
        # concatenation of context and query) could live here or in the caller.

    def forward(self, queries, values):
        """
        Args:
            queries: N x To x H
            values : N x Ti x H

        Returns:
            output: N x To x H
            attention_distribution: N x To x Ti
        """
        # (N, To, H) * (N, H, Ti) -> (N, To, Ti)
        attention_scores = torch.bmm(queries, values.transpose(1, 2))
        # Normalise over the input (Ti) axis. Taking the softmax on the last
        # dimension directly is equivalent to flattening to (N*To, Ti) first.
        attention_distribution = F.softmax(attention_scores, dim=-1)
        # (N, To, Ti) * (N, Ti, H) -> (N, To, H)
        attention_output = torch.bmm(attention_distribution, values)
        return attention_output, attention_distribution

## 模型构建

In [10]:
class Encoder(nn.Module):
    r"""Multi-layer LSTM encoder for padded, variable-length input sequences.

    Args:
        input_size: feature dimension D of each input frame.
        hidden_size: number of hidden units H per direction.
        num_layers: number of stacked recurrent layers.
        dropout: dropout value forwarded to ``nn.LSTM``.
        bidirectional: run the LSTM in both directions when True.
        rnn_type: recurrent cell type; only ``'lstm'`` is implemented.

    Raises:
        ValueError: if ``rnn_type`` is not ``'lstm'``.
    """

    def __init__(self, input_size, hidden_size, num_layers,
                 dropout=0.0, bidirectional=True, rnn_type='lstm'):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.rnn_type = rnn_type
        self.dropout = dropout
        if self.rnn_type == 'lstm':
            self.rnn = nn.LSTM(input_size, hidden_size, num_layers,
                               batch_first=True,
                               dropout=dropout,
                               bidirectional=bidirectional)
        else:
            # Fail at construction time instead of leaving self.rnn undefined
            # and crashing later inside forward().
            raise ValueError(
                "unsupported rnn_type %r (only 'lstm' is implemented)" % (rnn_type,))

    def forward(self, padded_input, input_lengths):
        """
        Args:
            padded_input: N x T x D
            input_lengths: N (tensor or list); sequences are expected in order
                of decreasing length, as pack_padded_sequence requires by
                default.

        Returns: output, hidden
            - **output**: N x T x (num_directions * H)
            - **hidden**: the ``(h_n, c_n)`` tuple returned by ``nn.LSTM``,
              each (num_layers * num_directions) x N x H
        """
        # Add total_length for supportting nn.DataParallel() later
        # see https://pytorch.org/docs/stable/notes/faq.html#pack-rnn-unpack-with-data-parallelism
        total_length = padded_input.size(1)  # get the max sequence length
        # pack_padded_sequence wants the lengths on the CPU; callers in this
        # notebook move them to the GPU together with the features.
        if torch.is_tensor(input_lengths):
            input_lengths = input_lengths.cpu()
        packed_input = pack_padded_sequence(padded_input, input_lengths,
                                            batch_first=True)
        packed_output, hidden = self.rnn(packed_input)
        output, _ = pad_packed_sequence(packed_output,
                                        batch_first=True,
                                        total_length=total_length)
        return output, hidden

    def flatten_parameters(self):
        """Delegate to ``nn.LSTM.flatten_parameters``."""
        self.rnn.flatten_parameters()


In [11]:
class Decoder(nn.Module):
    """Attention-based LSTM decoder.

    At every output step the decoder (1) feeds the embedding of the previous
    token concatenated with the previous attention context through a stack of
    ``nn.LSTMCell`` layers, (2) attends over the encoder outputs with the top
    layer's hidden state, and (3) maps the concatenation of hidden state and
    context to vocabulary logits with a small MLP.

    Args:
        vocab_size: number of output classes.
        embedding_dim: size of the token embeddings.
        sos_id: id used as start-of-sentence token.
        eos_id: id used as end-of-sentence token.
        hidden_size: decoder hidden size; the encoder output width must be
            equal to it.
        num_layers: number of stacked LSTM cells (at least one cell is built).
        bidirectional_encoder: stored but currently unused.
    """

    def __init__(self, vocab_size, embedding_dim, sos_id, eos_id, hidden_size,
                 num_layers, bidirectional_encoder=True):
        super(Decoder, self).__init__()
        # Hyper parameters
        # embedding + output
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.sos_id = sos_id  # Start of Sentence
        self.eos_id = eos_id  # End of Sentence
        # rnn
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional_encoder = bidirectional_encoder  # useless now
        self.encoder_hidden_size = hidden_size  # must be equal now
        # Components
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.rnn = nn.ModuleList()
        # First cell consumes [token embedding ; previous attention context].
        self.rnn += [nn.LSTMCell(self.embedding_dim +
                                 self.encoder_hidden_size, self.hidden_size)]
        for l in range(1, self.num_layers):
            self.rnn += [nn.LSTMCell(self.hidden_size, self.hidden_size)]
        self.attention = DotProductAttention()
        self.mlp = nn.Sequential(
            nn.Linear(self.encoder_hidden_size + self.hidden_size,
                      self.hidden_size),
            nn.Tanh(),
            nn.Linear(self.hidden_size, self.vocab_size))

    def zero_state(self, encoder_padded_outputs, H=None):
        """Return an all-zero N x H tensor matching the encoder outputs.

        Args:
            encoder_padded_outputs: tensor whose first dimension is the batch
                size N; dtype and device are inherited from it.
            H: width of the state; defaults to ``self.hidden_size``.
        """
        N = encoder_padded_outputs.size(0)
        H = self.hidden_size if H is None else H
        return encoder_padded_outputs.new_zeros(N, H)

    def forward(self, padded_input, encoder_padded_outputs):
        """Teacher-forced pass over a batch; returns the training loss.

        Relies on two names from the enclosing module: ``IGNORE_ID`` (the
        padding label, looked up when this method runs) and ``pad_list``.

        Args:
            padded_input: N x To target ids, padded with IGNORE_ID
            encoder_padded_outputs: N x Ti x H

        Returns:
            ce_loss: mean cross-entropy over all non-ignored target positions
        """
        # *********Get Input and Output
        # from espnet/Decoder.forward()
        # TODO: need to make more smart way
        ys = [y[y != IGNORE_ID] for y in padded_input]  # parse padded ys
        # prepare input and output word sequences with sos/eos IDs
        eos = ys[0].new([self.eos_id])
        sos = ys[0].new([self.sos_id])
        ys_in = [torch.cat([sos, y], dim=0) for y in ys]
        ys_out = [torch.cat([y, eos], dim=0) for y in ys]
        # Inputs are padded with eos, targets with IGNORE_ID so that padded
        # positions do not contribute to the loss.
        # pys: utt x olen
        ys_in_pad = pad_list(ys_in, self.eos_id)
        ys_out_pad = pad_list(ys_out, IGNORE_ID)
        assert ys_in_pad.size() == ys_out_pad.size()
        batch_size = ys_in_pad.size(0)
        output_length = ys_in_pad.size(1)

        # *********Init decoder rnn
        # One (h, c) pair per LSTM cell that was actually built.
        n_layers = len(self.rnn)
        h_list = [self.zero_state(encoder_padded_outputs)
                  for _ in range(n_layers)]
        c_list = [self.zero_state(encoder_padded_outputs)
                  for _ in range(n_layers)]
        att_c = self.zero_state(encoder_padded_outputs,
                                H=encoder_padded_outputs.size(2))
        y_all = []

        # **********LAS: 1. decoder rnn 2. attention 3. concate and MLP
        embedded = self.embedding(ys_in_pad)
        for t in range(output_length):
            # step 1. decoder RNN: s_i = RNN(s_i-1, y_i-1, c_i-1)
            rnn_input = torch.cat((embedded[:, t, :], att_c), dim=1)
            h_list[0], c_list[0] = self.rnn[0](
                rnn_input, (h_list[0], c_list[0]))
            for l in range(1, n_layers):
                h_list[l], c_list[l] = self.rnn[l](
                    h_list[l-1], (h_list[l], c_list[l]))
            rnn_output = h_list[-1]  # below unsqueeze: (N x H) -> (N x 1 x H)
            # step 2. attention: c_i = AttentionContext(s_i, h)
            att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
                                          encoder_padded_outputs)
            att_c = att_c.squeeze(dim=1)
            # step 3. concate s_i and c_i, and input to MLP
            mlp_input = torch.cat((rnn_output, att_c), dim=1)
            predicted_y_t = self.mlp(mlp_input)
            y_all.append(predicted_y_t)

        y_all = torch.stack(y_all, dim=1)  # N x To x C
        # **********Cross Entropy Loss
        # F.cross_entropy = NLL(log_softmax(input), target))
        y_all = y_all.view(batch_size * output_length, self.vocab_size)
        ce_loss = F.cross_entropy(y_all, ys_out_pad.view(-1),
                                  ignore_index=IGNORE_ID,
                                  reduction='mean')

        return ce_loss

    def recognize_beam(self, encoder_outputs, char_list, args):
        """Beam search, decode one utterence now.

        Progress (number of live hypotheses and their current text) is
        printed at every step.

        Args:
            encoder_outputs: T x H
            char_list: list of character, indexed by token id
            args: object with ``beam_size``, ``nbest`` and ``decode_max_len``
                (0 means "use the number of encoder frames")

        Returns:
            nbest_hyps: up to ``args.nbest`` finished hypotheses sorted by
                descending score; each is a dict with keys 'score', 'yseq',
                'h_prev', 'c_prev' and 'a_prev'.
        """
        # search params
        beam = args.beam_size
        nbest = args.nbest
        if args.decode_max_len == 0:
            maxlen = encoder_outputs.size(0)
        else:
            maxlen = args.decode_max_len

        # *********Init decoder rnn
        batched_outputs = encoder_outputs.unsqueeze(0)  # 1 x T x H
        n_layers = len(self.rnn)
        init_h = [self.zero_state(batched_outputs) for _ in range(n_layers)]
        init_c = [self.zero_state(batched_outputs) for _ in range(n_layers)]
        init_att = self.zero_state(batched_outputs,
                                   H=batched_outputs.size(2))

        hyps = [{'score': 0.0, 'yseq': [self.sos_id], 'c_prev': init_c,
                 'h_prev': init_h, 'a_prev': init_att}]
        ended_hyps = []

        for i in range(maxlen):
            hyps_best_kept = []
            for hyp in hyps:
                # Last token of this hypothesis, as a fresh 1-element tensor.
                vy = torch.tensor([hyp['yseq'][i]], dtype=torch.long,
                                  device=encoder_outputs.device)
                embedded = self.embedding(vy)
                # step 1. decoder RNN: s_i = RNN(s_i-1, y_i-1, c_i-1)
                # Fresh state lists per hypothesis, so the states stored in
                # other hypotheses are never overwritten.
                h_list, c_list = [], []
                layer_input = torch.cat((embedded, hyp['a_prev']), dim=1)
                for l in range(n_layers):
                    h_new, c_new = self.rnn[l](
                        layer_input, (hyp['h_prev'][l], hyp['c_prev'][l]))
                    h_list.append(h_new)
                    c_list.append(c_new)
                    layer_input = h_new
                rnn_output = h_list[-1]
                # step 2. attention: c_i = AttentionContext(s_i, h)
                # below unsqueeze: (N x H) -> (N x 1 x H)
                att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
                                              batched_outputs)
                att_c = att_c.squeeze(dim=1)
                # step 3. concate s_i and c_i, and input to MLP
                mlp_input = torch.cat((rnn_output, att_c), dim=1)
                predicted_y_t = self.mlp(mlp_input)
                local_scores = F.log_softmax(predicted_y_t, dim=1)
                # topk scores; never ask for more candidates than there are
                # classes, otherwise torch.topk raises.
                n_candidates = min(beam, local_scores.size(1))
                local_best_scores, local_best_ids = torch.topk(
                    local_scores, n_candidates, dim=1)

                for j in range(n_candidates):
                    new_hyp = {
                        'h_prev': h_list[:],
                        'c_prev': c_list[:],
                        'a_prev': att_c,
                        'score': hyp['score'] + local_best_scores[0, j],
                        'yseq': hyp['yseq'] + [int(local_best_ids[0, j])],
                    }
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(hyps_best_kept,
                                        key=lambda x: x['score'],
                                        reverse=True)[:beam]
            # end for hyp in hyps
            hyps = hyps_best_kept

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                for hyp in hyps:
                    hyp['yseq'].append(self.eos_id)

            # add ended hypothes to a final list, and removed them from current hypothes
            # (this will be a probmlem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp['yseq'][-1] == self.eos_id:
                    ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            hyps = remained_hyps
            if len(hyps) > 0:
                print('remeined hypothes: ' + str(len(hyps)))
            else:
                print('no hypothesis. Finish decoding.')
                break

            for hyp in hyps:
                print('hypo: ' + ''.join([char_list[int(x)]
                                          for x in hyp['yseq'][1:]]))
        # end for i in range(maxlen)
        nbest_hyps = sorted(ended_hyps, key=lambda x: x['score'], reverse=True)[
            :min(len(ended_hyps), nbest)]
        return nbest_hyps

In [12]:
class Seq2Seq(nn.Module):
    """Sequence-to-Sequence architecture with configurable encoder and decoder.

    Args:
        encoder: module called as ``encoder(padded_input, input_lengths)``
            and returning ``(outputs, hidden)``.
        decoder: module called as ``decoder(padded_target, encoder_outputs)``
            for training and offering ``recognize_beam`` for decoding.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, padded_input, input_lengths, padded_target):
        """Encode a batch and return the decoder's training loss.

        Args:
            padded_input: N x Ti x D
            input_lengths: N
            padded_target: N x To

        Returns:
            loss: whatever the decoder's forward returns
        """
        encoder_padded_outputs, _ = self.encoder(padded_input, input_lengths)
        loss = self.decoder(padded_target, encoder_padded_outputs)
        return loss

    def recognize(self, input, input_lengths, char_list, args):
        """Sequence-to-Sequence beam search, decode one utterence now.

        Runs with gradient tracking disabled: decoding never back-propagates,
        and otherwise every hypothesis would keep an autograd graph alive.

        Args:
            input: 1 x T x D (a batch holding a single utterance; the batch
                dimension of the encoder output is squeezed away)
            input_lengths: length of the utterance, as the encoder expects it
            char_list: list of characters
            args: args.beam

        Returns:
            nbest_hyps:
        """
        with torch.no_grad():
            encoder_outputs, _ = self.encoder(input, input_lengths)
            nbest_hyps = self.decoder.recognize_beam(
                encoder_outputs.squeeze(0), char_list, args)
        return nbest_hyps


## 计时辅助函数（用于显示训练进度）

In [13]:
import time
import math


def asMinutes(s):
    """Render a duration in seconds as ``'<minutes>m <seconds>s'``.

    Both parts are truncated to integers for display.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '{}m {}s'.format(int(whole_minutes), int(leftover_seconds))


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work already done (must be non-zero); the
            remaining time is extrapolated linearly from it.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

## 训练迭代

In [14]:
# def trainIters(model, optimizier, print_every=5, plot_every=100, learning_rate=0.01):
#     start = time.time()
#     n_iters = len(tr_dataset)
#     plot_losses = []
#     print_loss_total = 0  # Reset every print_every
#     plot_loss_total = 0  # Reset every plot_every

#     encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
#     decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
# #     training_pairs = random.choices(a, k=n_iters)
    
#     criterion = nn.NLLLoss()

# #     for utt in training_pairs:
#     for i, (data) in enumerate(tr_loader):
#         padded_input, input_lengths, padded_target = data
#         padded_input, input_lengths, padded_target = data
#         padded_input = padded_input.cuda()
#         input_lengths = input_lengths.cuda()
#         padded_target = padded_target.cuda()
# #         print("padded_input:",padded_input.shape)
#         loss = model(padded_input, input_lengths, padded_target)
# #         print(loss) #.requires_grad
#         print_loss_total += float(loss)
#         plot_loss_total += float(loss)
        
#         optimizier.zero_grad()
#         loss.backward()
        
#         optimizier.step()

#         if (i+1) % print_every == 0:
#             print_loss_avg = print_loss_total / print_every
#             print_loss_total = 0
#             print('%s (%d %d%%) %.4f' % (timeSince(start, (i+1) / n_iters),
#                                          (i+1), (i+1) / n_iters * 100, print_loss_avg))

#         if i+1 % plot_every == 0:
#             plot_loss_avg = plot_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_total = 0



In [15]:
def trainIters(model, epoch, optimizier, print_every=10, plot_every=10, learning_rate=0.01):
    """Train ``model`` for ``epoch`` passes over the global ``tr_loader``.

    Progress lines are printed and also written to ``train_esp.log`` (the
    file is overwritten on every call).

    Args:
        model: module called as ``model(padded_input, input_lengths,
            padded_target)`` and returning a scalar loss.
        epoch: number of passes over ``tr_loader``.
        optimizier: optimizer that updates ``model``'s parameters.
        print_every: report the mean loss every this many iterations.
        plot_every: record the mean loss every this many iterations.
        learning_rate: unused; kept so existing calls keep working (the
            learning rate is the one configured on ``optimizier``).

    Returns:
        plot_losses: list with the mean loss of every ``plot_every`` window.
    """
    n_iters = len(tr_dataset)
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    # Feed batches to whatever device the model lives on instead of
    # assuming CUDA.
    model_device = next(model.parameters()).device

    start = time.time()
    # `with` guarantees the log is closed even if training is interrupted.
    with open('train_esp.log', 'w') as log:
        for e in range(epoch):
            for i, data in enumerate(tr_loader):
                padded_input, input_lengths, padded_target = data
                padded_input = padded_input.to(model_device)
                padded_target = padded_target.to(model_device)
                # input_lengths deliberately stays on the CPU: that is what
                # pack_padded_sequence expects.
                loss = model(padded_input, input_lengths, padded_target)
                print_loss_total += float(loss)
                plot_loss_total += float(loss)

                optimizier.zero_grad()
                loss.backward()
                optimizier.step()

                if (i + 1) % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    progress = (e * n_iters + i + 1) / (n_iters * epoch)
                    txt = 'Epoch %d | Iter %d | %s (%d %d%%) %.4f' % (
                        e + 1, i + 1, timeSince(start, progress),
                        (i + 1), progress * 100, print_loss_avg)
                    print(txt)
                    log.write(txt + "\n")
                    log.flush()
                # Parenthesised on purpose: `i+1 % plot_every` parses as
                # `i + (1 % plot_every)` and practically never equals 0.
                if (i + 1) % plot_every == 0:
                    plot_loss_avg = plot_loss_total / plot_every
                    plot_losses.append(plot_loss_avg)
                    plot_loss_total = 0

    return plot_losses

In [None]:
# --- Model / optimisation hyper-parameters ---
input_size = 240

hidden_size = 256
vocab_size = len(char_list)
embedding_dim = 512
# NOTE(review): ids 0 and 1 are also ordinary positions in char_list
# (char_list[0] is '<unk>'), so SOS/EOS share ids with real dictionary
# entries -- confirm this is intended.
sos_id = 0
eos_id = 1
learning_rate = 1e-3
momentum = 0  # unused: the momentum argument below is commented out
l2 = 1e-5

# Label value used for padded target positions. Decoder.forward reads this
# module-level name when it runs, so it must be defined before training starts.
IGNORE_ID=-1

encoder = Encoder(input_size, hidden_size, 3, dropout=0.0)
# The encoder is bidirectional, so its outputs are 2 * hidden_size wide and
# the decoder (which requires an equal width) is sized accordingly.
decoder = Decoder(vocab_size, embedding_dim, sos_id, eos_id, hidden_size*2,
                 num_layers=1, bidirectional_encoder=True)

model = Seq2Seq(encoder, decoder)
print(model)
# Use the device selected earlier in the notebook instead of assuming CUDA.
model.to(device)

optimizier = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
#                                      momentum=momentum,
                                     weight_decay=l2)
trainIters(model, 15,optimizier, print_every=20)

Seq2Seq(
  (encoder): Encoder(
    (rnn): LSTM(240, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(4233, 512)
    (rnn): ModuleList(
      (0): LSTMCell(1024, 512)
    )
    (attention): DotProductAttention()
    (mlp): Sequential(
      (0): Linear(in_features=1024, out_features=512, bias=True)
      (1): Tanh()
      (2): Linear(in_features=512, out_features=4233, bias=True)
    )
  )
)
Epoch 1 | Iter 20 | 0m 4s (- 214m 16s) (20 0%) 7.2342
Epoch 1 | Iter 40 | 0m 7s (- 183m 53s) (40 0%) 6.7838
Epoch 1 | Iter 60 | 0m 10s (- 168m 33s) (60 0%) 6.5869
Epoch 1 | Iter 80 | 0m 13s (- 160m 58s) (80 0%) 6.3944
Epoch 1 | Iter 100 | 0m 16s (- 156m 16s) (100 0%) 6.2635
Epoch 1 | Iter 120 | 0m 19s (- 152m 23s) (120 0%) 6.1941
Epoch 1 | Iter 140 | 0m 22s (- 149m 10s) (140 0%) 5.9539
Epoch 1 | Iter 160 | 0m 25s (- 146m 34s) (160 0%) 5.8947
Epoch 1 | Iter 180 | 0m 27s (- 144m 47s) (180 0%) 5.7971
Epoch 1 | Iter 200 | 0m 30s (- 142m 38s) (200 0%) 5.7075

In [23]:
class Args(object):
    """Plain holder for the beam-search options.

    Attributes are ``beam_size``, ``nbest`` and ``decode_max_len``, stored
    exactly as given.
    """

    def __init__(self, beam_size, nbest, decode_max_len):
        (self.beam_size,
         self.nbest,
         self.decode_max_len) = (beam_size, nbest, decode_max_len)

In [92]:
# te_dataset[1][1][1]#["input"][0]["feat"]

In [101]:
# Decode a single test utterance with beam search.
# Positional order follows Args.__init__: beam_size=30, nbest=1, decode_max_len=25.
args = Args(30, 1, 25)
# b selects an item of te_dataset, n an entry inside it; element [1] of that
# entry is the sample dict whose "input" / "output" fields are read below.
b = 10
n = 1
sample = te_dataset[b][n][1]
# Feature matrix of the utterance, read via kaldi_io from the sample's "feat" reference.
input_tensor = torch.tensor(kaldi_io.read_mat(sample["input"][0]["feat"]))
# Round the frame count down to a multiple of 4 and cap it at MAX_LENGTH*4 frames.
# NOTE(review): the reason for the multiple-of-4 constraint is not visible in
# this notebook -- confirm whether it is still needed.
a = input_tensor.shape[0]//4*4
input_tensor = input_tensor[0:(a if a < MAX_LENGTH*4 else MAX_LENGTH*4 ),:]

# g: number of frames actually fed to the model (used as the sequence length).
g = input_tensor.shape[0]
print(torch.tensor([g]))

# Add the batch dimension: (T, D) -> (1, T, D).
input_tensor = input_tensor.unsqueeze(0)
print("input_tensor:",input_tensor.shape)
# NOTE: `b` is rebound here from the dataset index to the n-best result list;
# the result itself is not displayed by this cell.
b = model.recognize(input_tensor.to(device),torch.tensor([g]), char_list, args)
# Print the reference transcript of the sample.
print("正解：", sample["output"][0]["text"])

tensor([800])
input_tensor: torch.Size([1, 800, 240])
remeined hypothes: 30
hypo: 虽
hypo: 最
hypo: 罪
hypo: 自
hypo: 非
hypo: 遂
hypo: 今
hypo: 这
hypo: 飞
hypo: 碎
hypo: 嘴
hypo: 岁
hypo: 黑
hypo: 昨
hypo: 醉
hypo: 却
hypo: 费
hypo: 作
hypo: 据
hypo: 俊
hypo: 近
hypo: 剧
hypo: 瑞
hypo: 随
hypo: 脆
hypo: 孙
hypo: 尊
hypo: 滋
hypo: 资
hypo: 日
remeined hypothes: 30
hypo: 虽然
hypo: 最人
hypo: 最难
hypo: 最年
hypo: 最终
hypo: 最让
hypo: 最大
hypo: 最完
hypo: 最近
hypo: 自然
hypo: 罪人
hypo: 非常
hypo: 最忍
hypo: 最爱
hypo: 最安
hypo: 最后
hypo: 今年
hypo: 遂然
hypo: 最低
hypo: 最引
hypo: 最严
hypo: 最简
hypo: 最繁
hypo: 最便
hypo: 最令
hypo: 最圆
hypo: 最远
hypo: 最然
hypo: 最文
hypo: 飞人
remeined hypothes: 30
hypo: 虽然投
hypo: 虽然头
hypo: 最人投
hypo: 虽然陶
hypo: 最难投
hypo: 最年投
hypo: 虽然淘
hypo: 最终投
hypo: 虽然图
hypo: 最终的
hypo: 虽然土
hypo: 最难头
hypo: 虽然他
hypo: 最让投
hypo: 虽然妥
hypo: 最人头
hypo: 虽然桃
hypo: 最完投
hypo: 最让头
hypo: 虽然讨
hypo: 虽然打
hypo: 最难的
hypo: 虽然坦
hypo: 虽然吐
hypo: 最近的
hypo: 虽然偷
hypo: 虽然倘
hypo: 虽然透
hypo: 最大投
hypo: 虽然的
remeined hypothes: 30
hypo: 虽然投入
hypo: 虽然头露
hypo: 虽然头颅
hypo: 最人投入
hypo

remeined hypothes: 16
hypo: 虽然投入产生问题继父亲儿后企业界瞩目录呢
hypo: 虽然投入产生问题继父亲儿后企业绩的目录呢
hypo: 虽然投入产生问题继父亲儿后企业绩的入户呢
hypo: 虽然投入产生问题继父亲儿后企业绩的危机界
hypo: 虽然投入产生问题继父亲儿后企业绩的危机会
hypo: 虽然投入产生问题继父亲儿后企业绩的危机机
hypo: 虽然投入产生问题继父亲儿后企业绩的危机或
hypo: 虽然投入产生问题继父亲儿后企业绩的危机期
hypo: 虽然投入产生问题继父亲儿后企业绩的入户了
hypo: 虽然投入产生问题继父亲儿后企业绩的危机乎
hypo: 虽然投入产生问题继父亲儿后企业绩的危机之
hypo: 虽然投入产生问题继父亲儿后企业绩无预户呢
hypo: 虽然投入产生问题继父亲儿后企业绩的危机记
hypo: 虽然投入产生问题继父亲儿后企业绩的危机下
hypo: 虽然投入产生问题继父亲儿后企业绩的危机味
hypo: 虽然投入产生问题继父亲儿后企业界瞩目录了
remeined hypothes: 18
hypo: 虽然投入产生问题继父亲儿后企业绩的危机机会
hypo: 虽然投入产生问题继父亲儿后企业绩的危机记住
hypo: 虽然投入产生问题继父亲儿后企业绩的危机界袭
hypo: 虽然投入产生问题继父亲儿后企业绩的危机机器
hypo: 虽然投入产生问题继父亲儿后企业绩的危机下降
hypo: 虽然投入产生问题继父亲儿后企业绩的危机期待
hypo: 虽然投入产生问题继父亲儿后企业绩的危机味地
hypo: 虽然投入产生问题继父亲儿后企业绩的危机界限
hypo: 虽然投入产生问题继父亲儿后企业绩的危机机制
hypo: 虽然投入产生问题继父亲儿后企业绩的危机机械
hypo: 虽然投入产生问题继父亲儿后企业绩的危机乎让
hypo: 虽然投入产生问题继父亲儿后企业绩的危机界面
hypo: 虽然投入产生问题继父亲儿后企业绩的危机界度
hypo: 虽然投入产生问题继父亲儿后企业绩的危机界惊
hypo: 虽然投入产生问题继父亲儿后企业绩的危机界依
hypo: 虽然投入产生问题继父亲儿后企业绩的危机味呢
hypo: 虽然投入产生问题继父亲儿后企业绩的危机之间
hypo: 虽然投入产生问题继父亲儿后企业绩的危机界也
remeined hypothes: 1

In [None]:
# CUDA memory held by PyTorch's caching allocator, in units of 10^6 bytes.
# NOTE(review): later PyTorch releases rename memory_cached() to memory_reserved().
torch.cuda.memory_cached()/1000000

In [None]:
# NOTE(review): scratch cell -- `dataset` is not defined in any visible cell,
# so this fails on a fresh kernel unless it is defined elsewhere.
dataset[2]

In [None]:
# NOTE(review): scratch cell -- `data_list` is not defined in any visible cell.
data_list[1]

In [None]:
# Print every element of data_list[1]["corpus"] as a Python scalar.
# NOTE(review): scratch cell -- `data_list` is not defined in any visible cell.
for ch in data_list[1]["corpus"]:
    print(ch.item())

In [None]:
# Scratch tensor of shape 1 x 4 x 3, used by the view / indexing experiments below.
d = torch.tensor([[[1,2,3],[4,5,6],[7,8,9],[10,11,12]]])

In [None]:
# NOTE(review): scratch cell -- `lang` is not defined in any visible cell.
lang.index2word[1]

In [None]:
# Reinterpret the 1 x 4 x 3 scratch tensor as 4 x 1 x 3.
d.view((4,1,3))

In [None]:
# First (and only) element along dim 0 of the scratch tensor: a 4 x 3 matrix.
d[0]