In [1]:
%config ZMQInteractiveShell.ast_node_interactivity = "all"
%pprint

Pretty printing has been turned OFF


In [2]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
import sys
import torch
import torch.nn as nn
from functools import reduce
sys.path.append("../d2l_func/")
from data_prepare import load_data_jay_song, data_iter_random, data_iter_consecutive, to_onehot
from sqdm import sqdm

In [4]:
import numpy as np
import random

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

# deep RNNs

普通的RNNs模型，如果从时间步来看，它是一个深度模型，而如果从纵向来看，它其实是一个浅层的模型。
- 因此，可以在纵向上添加多个隐藏层，使得模型变成深度RNN模型

![深度RNNs](./img/deep-rnn.svg)

第一层的运算和普通的RNN没有什么区别，而后面层的输入则是上一层隐层的输出
$$H^{(1)} = \delta(X_{t}W_{xh}^{(1)} + H_{t-1}^{(1)}W_{hh}^{(1)} + b_h^{(1)})$$
$$H^{(l)} = \delta(H_{t}^{(l-1)}W_{xh}^{(l)} + H_{t-1}^{(l)}W_{hh}^{(l)} + b_h^{(l)})$$

## Deep-RNNs

这里实现普通的深层RNNs模型

### 自定义实现

In [5]:
# 定义参数
def get_params(input_num, hidden_num, num_layers, device):
    # weight
    def _ones(shape):
        weight = nn.Parameter(torch.normal(0, 0.01, size=shape, device=device), requires_grad=True)
        return weight
    
    def _zeros(shape):
        bias = nn.Parameter(torch.zeros(shape, device=device), requires_grad=True)
        return bias
    
    def _three(input_num, hidden_num):
        return [
            _ones((input_num, hidden_num)),
            _ones((hidden_num, hidden_num)),
            _zeros(hidden_num)
        ]
    
    # hidden layer
    params = _three(input_num, hidden_num)
    for num in range(1, num_layers):
        params.extend(_three(hidden_num, hidden_num))
        
    # outputs layer
    params.append(_ones((hidden_num, input_num)))
    params.append(_zeros(input_num))
    return params

In [6]:
# 定义rnn隐藏层的状态
def init_rnn_hidden_state(batch_size, hidden_num, num_layers, device):
#     return reduce(lambda x, y: torch.cat((x, y)), 
#                   [torch.zeros(1, batch_size, hidden_num, device=device)]*num_layers)
    return [torch.zeros(batch_size, hidden_num, device=device) for _ in range(num_layers)]

# 定义rnn网络计算
def deep_rnn(inputs, params, h_state, num_layers, device):
    # 隐藏层的输出
    outputs = []
    
    
    # inputs.shape is (num_step, batch_size, vocab_size), h_state.shape is (num_layers, batch_size, hidden_num)
    for i in range(inputs.shape[0]):
        x = inputs[i]
        for num in range(num_layers):
            h_state[num] = torch.tanh(torch.mm(x, params[3*num]) + 
                                      torch.mm(h_state[num], params[3*num+1]) + params[3*num+2])
            x = h_state[num]
        y = torch.mm(x, params[-2] + params[-1])
        outputs.append(y.unsqueeze(0))
        
    return reduce(lambda x,y: torch.cat((x, y)), outputs), h_state

In [7]:
# 验证
set_seed(2020)
vocab_size, hidden_num, num_layers, device = 15, 20, 2, "cuda"
inputs = torch.arange(10).view(2, 5)
inputs = to_onehot(inputs, vocab_size, device)
params = get_params(vocab_size, hidden_num, num_layers, device)
h_state = init_rnn_hidden_state(inputs.shape[1], hidden_num, num_layers, device)
outputs, h_state = deep_rnn(inputs, params, h_state, num_layers, device)
# print(outputs.shape, h_state.shape)

#### 预测

In [8]:
def predict_rnn(prefix, pred_num, model, init_rnn_hidden_state, hidden_num, num_layers, 
                params, char_to_idx, vocab_set, vocab_size, device):
    # 将词转为index
    outputs = [char_to_idx[prefix[0]]]
    # 初始化hidden state
    h_state = init_rnn_hidden_state(1, hidden_num, num_layers, device)
    
    for i in range(len(prefix) + pred_num - 1):
        # to_onehot接收的shape为(batch_size, time_step), 单独预测时time_step为1
        inputs = to_onehot(torch.tensor(outputs[-1]).view(-1, 1), vocab_size, device)
        # 预测, y.shape-->(1, 1, vocab_size), h_state.shape-->(num_layers, batch_size, hidden_num)
        y, h_state = model(inputs, params, h_state, num_layers, device)
        # 添加到outpus
        if i + 1 < len(prefix):
            outputs.append(char_to_idx[prefix[i+1]])
        else:
            outputs.append(y.argmax(dim=2).item())
    return "".join(vocab_set[i] for i in outputs)

In [9]:
# load data
set_seed(2020)
hidden_num, pred_num, num_layers, device = 256, 10, 2, "cuda"
corpus_index, char_to_idx, vocab_set, vocab_size = load_data_jay_song()
params = get_params(vocab_size, hidden_num, num_layers, device)
# 预测
predict_rnn("分开", pred_num, deep_rnn, init_rnn_hidden_state, hidden_num, 
            num_layers, params, char_to_idx, vocab_set, vocab_size, device)

'分开诀舍键亲找忽此泛马骑'

#### 训练

In [10]:
from optim import sgd, grad_clipping

In [11]:
# training
def train_rnn(epoch_num, batch_num, model, loss, init_hidden_state, get_params, data_iter, corpus_index,
              num_step, hidden_num, lr, batch_size, char_to_idx, vocab_set, vocab_size, prefixs, num_layers,
              predict_rnn, pred_num, clipping_theta=1e-2, random_sample=True, device="cuda"):
    """
    function: training and predict in rnn
    params epoch_num: the number of epoch
    params batch_num: the number of batch in a epoch
    params rnn: the rnn model
    params loss: such as nn.CrossEntropyLoss()
    params init_hidden_state: define the state of hidden layer
    params get_params: get the weight and bias in rnn
    params data_iter: data_iter_random/data_iter_consecutive
    params corpus_index: the index of corpus
    params num_step: the number of time step in rnn
    params hidden_num: the number of unit in hidden layer in rnn
    params lr: the learning rate
    params batch_size: the size of a batch
    params char_to_idx: char index which convert Chinese to idx
    params vocab_set: the list of word in corpus
    params vocab_size: the length of vocab_set
    params prefixs: the list include input when you want to predict, such as ["分开", "不分开"]
    params pred_num: the number you want to predict
    params clipping_heta: the max value of the norm of grad
    params random_sample: if sample in random, use data_iter_random. otherwise, use data_iter_consecutive
    params device: "cpu"/"cuda"
    """
    # training bar
    process_bar = sqdm()
    # init
    l_sum, n_class = 0, 0
    # get params in rnn
    params = get_params(vocab_size, hidden_num, num_layers, device)

    for epoch in range(epoch_num):
        # sample in consecutive
        if not random_sample:
            h_state = init_rnn_hidden_state(batch_size, hidden_num, num_layers, device)
        print(f"Epoch [{epoch + 1}/{epoch_num}]")
        for x, y in data_iter(corpus_index, batch_size, num_step, device):
            # x shape: (num_step, batch_size, vocab_size)
            inputs = to_onehot(x, vocab_size, device)
            # if sample with random, init h_state in each batch
            if random_sample:
                h_state = init_rnn_hidden_state(inputs.shape[1], hidden_num, num_layers, device)
            else:
                if h_state is not None:
                    if isinstance(h_state, list):
                        h_state = [h_state[0].detach_(), h_state[1].detach_()]
                    else:
                        # split h_state from cal graph, when sample_consecusive
                        h_state.detach_()

            # rnn, the shape of outputs is (num_step, batch_size, vocab_size)
            outputs, h_state = model(inputs, params, h_state, num_layers, device)
#             print(outputs.shape, h_state.shape)
            # In order to calculate loss, change outputs shape and y shape
            outputs = outputs.view(-1, outputs.shape[-1])
            y = y.transpose(0, 1).contiguous().view(-1)
            # calculate loss, y --> long type
            l = loss(outputs, y.long())
            
            # update params
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()

            # backward
            l.backward()
            # grad clip
            grad_clipping(params, clipping_theta, device)
            # sgd
            sgd(params, lr)

            # loss_sum
            l_sum += l.item() * y.shape[0]
            n_class += y.shape[0]

            # perplexity
            try:
                perplexity = np.exp(l_sum / n_class)
            except OverflowError:
                perplexity = float("inf")

            # training bar
            process_bar.show_process(batch_num, 1, train_loss=perplexity)

        # predict
        print("\n")
        for prefix in prefixs:
            print(f"prefix-{prefix}: ", predict_rnn(prefix, pred_num, model, init_rnn_hidden_state, hidden_num, 
                                                    num_layers, params, char_to_idx, vocab_set, vocab_size, device))
        print("\n")

In [12]:
super_params = {
    "epoch_num": 1000,
    "model": deep_rnn,
    "loss": nn.CrossEntropyLoss(),
    "init_hidden_state": init_rnn_hidden_state,
    "hidden_num": 256,
    "get_params": get_params,
    "batch_size": 64,
    "num_step": 32,
    "corpus_index": corpus_index,
    "data_iter": data_iter_consecutive,
    "lr": 10,
    "char_to_idx": char_to_idx,
    "vocab_set": vocab_set,
    "vocab_size": vocab_size,
    "predict_rnn": predict_rnn,
    "pred_num": 50,
    "prefixs": ["分开", "不分开"],
    "num_layers": 2,
    "random_sample": False
}

super_params["batch_num"] = len(list(data_iter_consecutive(corpus_index, super_params["batch_size"],
                                                     super_params["num_step"], "cpu")))

train_rnn(**super_params)

Epoch [1/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 923.5364, train_score: -, test_loss: -, test_score: --

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                                  


Epoch [2/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 672.6997, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                                  


Epoch [3/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 587.1384, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                                  


Epoch [4/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 545.1814, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                    

### 简洁实现

In [13]:
from model_train import train_rnn_pytorch
from predict import predict_rnn_pytorch
from rnn_model import RNNModel

In [14]:
# load data
corpus_index, char_to_idx, vocab_set, vocab_size = load_data_jay_song()
# model
hidden_num, num_layers = 256, 2
rnn_layer = nn.RNN(vocab_size, hidden_num, num_layers)
model = RNNModel(rnn_layer, vocab_size)
model = model.cuda()
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

params = {
    "epoch_num": 1000,
    "model": model,
    "loss": loss,
    "optimizer": optimizer,
    "batch_size": 64,
    "num_step": 32,
    "corpus_index": corpus_index,
    "data_iter": data_iter_consecutive,
    "char_to_idx": char_to_idx,
    "vocab_set": vocab_set,
    "vocab_size": vocab_size,
    "predict_rnn_pytorch": predict_rnn_pytorch,
    "pred_num": 50,
    "prefixs": ["分开", "不分开"],
    "random_sample": False
}

params["batch_num"] = len(list(data_iter_consecutive(corpus_index, params["batch_size"],
                                                     params["num_step"], "cpu")))

train_rnn_pytorch(**params)

Epoch [1/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 585.5224, train_score: -, test_loss: -, test_score: --

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                                  


Epoch [2/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 492.3270, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                                  


Epoch [3/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 463.4048, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                                  


Epoch [4/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 449.2634, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                    

## 双向RNN

### 自定义实现

#### 结构定义

In [15]:
# 定义参数
def get_params(input_num, hidden_num, device):
    # weight
    def _ones(shape):
        weight = nn.Parameter(torch.normal(0, 0.01, size=shape, device=device), requires_grad=True)
        return weight
    
    def _zeros(shape):
        bias = nn.Parameter(torch.zeros(shape, device=device), requires_grad=True)
        return bias
    
    def _three(input_num, hidden_num):
        return [
            _ones((input_num, hidden_num)),
            _ones((hidden_num, hidden_num)),
            _zeros(hidden_num)
        ]
    
    # hidden layer
    params = _three(input_num, hidden_num)
    params.extend(_three(input_num, hidden_num))
        
    # outputs layer
    params.append(_ones((hidden_num*2, input_num)))
    params.append(_zeros(input_num))
    return params

In [16]:
# 定义rnn隐藏层的状态
def init_rnn_hidden_state(batch_size, hidden_num, device):
#     return reduce(lambda x, y: torch.cat((x, y)), 
#                   [torch.zeros(1, batch_size, hidden_num, device=device)]*num_layers)
    return [torch.zeros(batch_size, hidden_num, device=device), 
            torch.zeros(batch_size, hidden_num, device=device)]

# 定义rnn网络计算
def birnn(inputs, params, h_state, device):
    # 隐藏层的输出
    outputs = []
    # 反转inputs(根据时间步)
    inputs_convert = torch.index_select(inputs, 0, 
                                        torch.arange(inputs.shape[0]-1, -1, -1, device=inputs.device))
    
    # inputs.shape is (num_step, batch_size, vocab_size), h_state.shape is (num_layers, batch_size, hidden_num)
    for i in range(inputs.shape[0]):
        h_state[0] = torch.tanh(torch.mm(inputs[i], params[0]) + 
                                  torch.mm(h_state[0], params[1]) + params[2])
        h_state[1] = torch.tanh(torch.mm(inputs_convert[i], params[3]) + 
                                  torch.mm(h_state[1], params[4]) + params[5])
        y = torch.mm(torch.cat((h_state[0], h_state[1]), dim=1), params[-2]) + params[-1]
        outputs.append(y.unsqueeze(0))
        
    return reduce(lambda x,y: torch.cat((x, y)), outputs), h_state

In [17]:
# 验证
set_seed(2020)
vocab_size, hidden_num, device = 15, 20, "cuda"
inputs = torch.arange(10).view(2, 5)
inputs = to_onehot(inputs, vocab_size, device)
params = get_params(vocab_size, hidden_num, device)
h_state = init_rnn_hidden_state(inputs.shape[1], hidden_num, device)
outputs, h_state = birnn(inputs, params, h_state, device)
# print(outputs.shape, h_state.shape)

#### 预测

In [18]:
def predict_rnn(prefix, pred_num, model, init_rnn_hidden_state, hidden_num, 
                params, char_to_idx, vocab_set, vocab_size, device):
    # 将词转为index
    outputs = [char_to_idx[prefix[0]]]
    # 初始化hidden state
    h_state = init_rnn_hidden_state(1, hidden_num, device)
    
    for i in range(len(prefix) + pred_num - 1):
        # to_onehot接收的shape为(batch_size, time_step), 单独预测时time_step为1
        inputs = to_onehot(torch.tensor(outputs[-1]).view(-1, 1), vocab_size, device)
        # 预测, y.shape-->(1, 1, vocab_size), h_state.shape-->(num_layers, batch_size, hidden_num)
        y, h_state = model(inputs, params, h_state, device)
        # 添加到outpus
        if i + 1 < len(prefix):
            outputs.append(char_to_idx[prefix[i+1]])
        else:
            outputs.append(y.argmax(dim=2).item())
    return "".join(vocab_set[i] for i in outputs)

In [19]:
# load data
set_seed(2020)
hidden_num, pred_num, num_layers, device = 256, 10, 2, "cuda"
corpus_index, char_to_idx, vocab_set, vocab_size = load_data_jay_song()
params = get_params(vocab_size, hidden_num, device)
# 预测
predict_rnn("分开", pred_num, birnn, init_rnn_hidden_state, hidden_num, 
            params, char_to_idx, vocab_set, vocab_size, device)

'分开麦書浅位轨氧莉褪详殿'

#### 训练

In [20]:
from optim import sgd, grad_clipping

In [21]:
# training
def train_rnn(epoch_num, batch_num, model, loss, init_hidden_state, get_params, data_iter, corpus_index,
              num_step, hidden_num, lr, batch_size, char_to_idx, vocab_set, vocab_size, prefixs, 
              predict_rnn, pred_num, clipping_theta=1e-2, random_sample=True, device="cuda"):
    """
    function: training and predict in rnn
    params epoch_num: the number of epoch
    params batch_num: the number of batch in a epoch
    params rnn: the rnn model
    params loss: such as nn.CrossEntropyLoss()
    params init_hidden_state: define the state of hidden layer
    params get_params: get the weight and bias in rnn
    params data_iter: data_iter_random/data_iter_consecutive
    params corpus_index: the index of corpus
    params num_step: the number of time step in rnn
    params hidden_num: the number of unit in hidden layer in rnn
    params lr: the learning rate
    params batch_size: the size of a batch
    params char_to_idx: char index which convert Chinese to idx
    params vocab_set: the list of word in corpus
    params vocab_size: the length of vocab_set
    params prefixs: the list include input when you want to predict, such as ["分开", "不分开"]
    params pred_num: the number you want to predict
    params clipping_heta: the max value of the norm of grad
    params random_sample: if sample in random, use data_iter_random. otherwise, use data_iter_consecutive
    params device: "cpu"/"cuda"
    """
    # training bar
    process_bar = sqdm()
    # init
    l_sum, n_class = 0, 0
    # get params in rnn
    params = get_params(vocab_size, hidden_num, device)

    for epoch in range(epoch_num):
        # sample in consecutive
        if not random_sample:
            h_state = init_rnn_hidden_state(batch_size, hidden_num, device)
        print(f"Epoch [{epoch + 1}/{epoch_num}]")
        for x, y in data_iter(corpus_index, batch_size, num_step, device):
            # x shape: (num_step, batch_size, vocab_size)
            inputs = to_onehot(x, vocab_size, device)
            # if sample with random, init h_state in each batch
            if random_sample:
                h_state = init_rnn_hidden_state(inputs.shape[1], hidden_num, device)
            else:
                if h_state is not None:
                    if isinstance(h_state, list):
                        h_state = [h_state[0].detach_(), h_state[1].detach_()]
                    else:
                        # split h_state from cal graph, when sample_consecusive
                        h_state.detach_()

            # rnn, the shape of outputs is (num_step, batch_size, vocab_size)
            outputs, h_state = model(inputs, params, h_state, device)
#             print(outputs.shape, h_state.shape)
            # In order to calculate loss, change outputs shape and y shape
            outputs = outputs.view(-1, outputs.shape[-1])
            y = y.transpose(0, 1).contiguous().view(-1)
            # calculate loss, y --> long type
            l = loss(outputs, y.long())
            
            # update params
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()

            # backward
            l.backward()
            # grad clip
            grad_clipping(params, clipping_theta, device)
            # sgd
            sgd(params, lr)

            # loss_sum
            l_sum += l.item() * y.shape[0]
            n_class += y.shape[0]

            # perplexity
            try:
                perplexity = np.exp(l_sum / n_class)
            except OverflowError:
                perplexity = float("inf")

            # training bar
            process_bar.show_process(batch_num, 1, train_loss=perplexity)

        # predict
        print("\n")
        for prefix in prefixs:
            print(f"prefix-{prefix}: ", predict_rnn(prefix, pred_num, model, init_rnn_hidden_state, hidden_num, 
                                                    params, char_to_idx, vocab_set, vocab_size, device))
        print("\n")

In [22]:
super_params = {
    "epoch_num": 1000,
    "model": birnn,
    "loss": nn.CrossEntropyLoss(),
    "init_hidden_state": init_rnn_hidden_state,
    "hidden_num": 256,
    "get_params": get_params,
    "batch_size": 64,
    "num_step": 32,
    "corpus_index": corpus_index,
    "data_iter": data_iter_random,
    "lr": 15,
    "char_to_idx": char_to_idx,
    "vocab_set": vocab_set,
    "vocab_size": vocab_size,
    "predict_rnn": predict_rnn,
    "pred_num": 50,
    "prefixs": ["分开", "不分开"],
#     "random_sample": False
}

super_params["batch_num"] = len(list(data_iter_random(corpus_index, super_params["batch_size"],
                                                     super_params["num_step"], "cpu")))

train_rnn(**super_params)

Epoch [1/5]
31/31 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 936.3162, train_score: -, test_loss: -, test_score: --

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                                  


Epoch [2/5]
31/31 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 734.0991, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                                  


Epoch [3/5]
31/31 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 648.9943, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                                  


Epoch [4/5]
31/31 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 610.7383, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                    

### 简洁实现

In [23]:
from model_train import train_rnn_pytorch
from predict import predict_rnn_pytorch
from rnn_model import RNNModel

In [24]:
# load data
corpus_index, char_to_idx, vocab_set, vocab_size = load_data_jay_song()
# model
hidden_num, num_layers = 256, 2
rnn_layer = nn.RNN(vocab_size, hidden_num, bidirectional=True)
model = RNNModel(rnn_layer, vocab_size)
model = model.cuda()
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

params = {
    "epoch_num": 1000,
    "model": model,
    "loss": loss,
    "optimizer": optimizer,
    "batch_size": 64,
    "num_step": 32,
    "corpus_index": corpus_index,
    "data_iter": data_iter_consecutive,
    "char_to_idx": char_to_idx,
    "vocab_set": vocab_set,
    "vocab_size": vocab_size,
    "predict_rnn_pytorch": predict_rnn_pytorch,
    "pred_num": 50,
    "prefixs": ["分开", "不分开"],
    "random_sample": False
}

params["batch_num"] = len(list(data_iter_consecutive(corpus_index, params["batch_size"],
                                                     params["num_step"], "cpu")))

train_rnn_pytorch(**params)

Epoch [1/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 615.0332, train_score: -, test_loss: -, test_score: --

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                                  


Epoch [2/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 492.8114, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开                                                  
prefix-不分开:  不分开                                                  


Epoch [3/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 418.5828, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我
prefix-不分开:  不分开 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我


Epoch [4/5]
30/30 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 339.9148, train_score: -, test_loss: -, test_score: -

prefix-分开:  分开 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我
prefix-不分开:  不分开 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我 我