In [1]:
#pytorchLM 2 较成熟悉案例
#参考一：https://blog.csdn.net/shenfuli/article/details/105053645
#参考二：pytorch官网
#https://pytorch.org/tutorials/advanced/dynamic_quantization_tutorial.html?highlight=lstm

In [1]:
import random
from collections import defaultdict, Counter
from pathlib import Path
import os
import numpy as np
import torch
import torch.nn as nn

def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_random_seed(2020)


                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")


In [265]:
# #打印看是否用了GPU (cuda)还是CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device: ',device)
DATA_ROOT = 'data/'
print('PyTorch Version:', torch.__version__)
print('DATA_ROOT:',DATA_ROOT)


device:  cuda
PyTorch Version: 1.5.0
DATA_ROOT: data/


In [266]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        
        self.word2idx['<UNK>'] = 0
        self.word2idx['<PAD>'] = 1
        self.idx2word.append('<UNK>')
        self.idx2word.append('<PAD>')

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

class Corpus(object):
    def __init__(self, path):
        self.vocab = Dictionary()
        self.train_data = self.tokenize(os.path.join(path, 'train.txt'))#os.path.join(,)相当于path + 'train.txt'
        self.valid_data = self.tokenize(os.path.join(path, 'dev.txt'))
        self.test_data = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<PAD>']#结尾不用<eos>，用<PAD>，一起mask
                for word in words:
                    self.vocab.add_word(word)

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<PAD>']#结尾不用<eos>，用<PAD>，一起mask
                ids = []
                for word in words:
                    ids.append(self.vocab.word2idx[word])
                #idss.append(torch.tensor(ids).type(torch.int64))#拷贝数据转tensor格式
                idss.append(ids)
            #ids = torch.cat(idss)#拼接

        return idss#ids
    
#     def tokenize(self, text_path):
        
#         with open(text_path,'r',encoding='utf-8') as f:
#             index_data = []  # 索引数据，存储每个样本的单词索引列表
#             for s in f.readlines():
#                 index_data.append(
#                     self.sentence_to_index(s)
#                 )
#         if self.sort_by_len:  # 为了提升训练速度，可以考虑将样本按照长度排序，这样可以减少padding
#             index_data = sorted(index_data, key=lambda x: len(x), reverse=True)
#         return index_data


    def sentence_to_index(self, s):
        return [self.vocab.word2idx[w] for w in s.split()]
    
    def index_to_sentence(self, x):
        return ' '.join([self.vocab.idx2word[i] for i in x])


In [267]:
model_data_filepath = 'data/'

corpus = Corpus(model_data_filepath)

In [268]:
print(corpus.train_data)

[[2, 3, 4, 1], [2, 3, 5, 1], [6, 7, 8, 9, 10, 8, 11, 12, 1], [13, 5, 14, 15, 16, 17, 18, 1], [19, 20, 21, 22, 23, 1], [24, 25, 26, 27, 28, 29, 30, 9, 31, 32, 1], [33, 34, 35, 36, 37, 38, 39, 40, 41, 1], [33, 42, 43, 44, 45, 34, 46, 3, 47, 48, 1], [49, 50, 51, 52, 36, 53, 54, 55, 1], [56, 51, 57, 58, 54, 59, 60, 61, 62, 51, 52, 45, 63, 64, 65, 66, 18, 1], [67, 68, 69, 70, 71, 41, 39, 1], [49, 51, 37, 38, 39, 40, 41, 1], [72, 51, 57, 58, 73, 33, 74, 75, 76, 1], [77, 9, 56, 51, 57, 58, 78, 76, 1], [33, 42, 43, 44, 79, 47, 48, 1], [33, 42, 43, 44, 80, 34, 81, 82, 83, 84, 85, 86, 87, 1], [88, 89, 90, 1], [13, 34, 91, 92, 93, 94, 61, 47, 48, 1], [95, 96, 97, 9, 98, 99, 96, 97, 100, 101, 93, 102, 1], [103, 104, 1], [105, 106, 61, 107, 108, 109, 1], [19, 20, 21, 22, 23, 110, 111, 112, 29, 30, 9, 31, 32, 1], [113, 49, 114, 115, 102, 116, 74, 40, 41, 1], [33, 5, 60, 117, 117, 118, 119, 120, 1], [121, 122, 123, 92, 124, 125, 126, 127, 128, 129, 130, 63, 64, 1], [19, 20, 21, 22, 23, 110, 111, 112,

In [269]:
for i in range(4):
    print(list(corpus.train_data[i]))
    print(corpus.index_to_sentence(list(corpus.train_data[i])))

[2, 3, 4, 1]
多 囊 肝 <PAD>
[2, 3, 5, 1]
多 囊 肾 <PAD>
[6, 7, 8, 9, 10, 8, 11, 12, 1]
胆 总 管 、 胰 管 扩 张 <PAD>
[13, 5, 14, 15, 16, 17, 18, 1]
右 肾 门 区 致 密 影 <PAD>


In [270]:
#定义Dataset 输入输出切分
class MyDataSet(torch.utils.data.Dataset):
    
    def __init__(self, index_data):
        self.index_data = index_data
        
    def __getitem__(self, i):
        # 根据语言模型定义，这里我们要用前n-1个单词预测后n-1个单词
        feature = self.index_data[i][:-1]
        output = self.index_data[i][1:]
        return  feature,output
        
    def __len__(self):
        return len(self.index_data)


In [271]:
#测试上面dataset
train_set = MyDataSet(corpus.train_data)
print('训练集大小：', len(train_set))
print('训练集样本：')
print('\t输入：', train_set[1][0])
print('\t     ', corpus.index_to_sentence(train_set[1][0]))
print('\结果：', train_set[1][1])
print('\t     ', corpus.index_to_sentence(train_set[1][1]))


训练集大小： 166
训练集样本：
	输入： [2, 3, 5]
	      多 囊 肾
\结果： [3, 5, 1]
	      囊 肾 <PAD>


In [272]:
vocab = Dictionary()
print(vocab.word2idx['<PAD>'])

1


In [273]:
#定义DataLoader，对每个batch预处理，加mask等

PAD_IDX = vocab.word2idx['<PAD>'] #
def lm_collate_fn(batch):
    """
        DataLoader 中对每个batch 进行预处理的函数
        
        输入batch
    """
    # 这里输入的batch格式为[(input_1, target_1), ... ,(input_n, target_n)]
    # 我们要将其格式转换为[(input_1, ... , input_n), (target_1, ... , target_n)]
    batch = list(zip(*batch))
    # 生成长度列表
    lengths = torch.LongTensor( [ len(x) for x in batch[0] ]).to(device)
    # 对输入和目标进行padding
    inputs = [torch.LongTensor(x).to(device) for x in batch[0]]
    inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True,padding_value=PAD_IDX) # pad_sequence 可以按照最大长度句子进行填充统一长度标准（0填充）
    targets = [torch.LongTensor(x).to(device) for x in batch[1]]
    targets = nn.utils.rnn.pad_sequence(targets, batch_first=True,padding_value=PAD_IDX)
    
    # 因为输入数据没有 “1” 的索引，存在“1” 表示是padding_index 的结构 ，由此生成mask矩阵
    mask = (inputs != 1).float().to(device) # 1 表示该位置存储／ 0 表示该位置不存在 是填充的数据padding（注意：计算loss 的时候需要去掉）
    #print("==MASK==",mask)
    # 在之后的训练中因为还要进行pack_padded_sequence操作，所以在这里按照长度降序排列
    lengths, sorted_index = lengths.sort(descending=True)
    # 根据排序后的perm_index ，进行重新获取数据列表
    inputs = inputs[sorted_index]
    targets = targets[sorted_index]
    mask = mask[sorted_index]
    
    return inputs, targets, lengths, mask
    #返回输入、输出、长度、mask


In [274]:
#测试上面
test_loader = torch.utils.data.DataLoader(
    dataset=train_set,
    batch_size=64,
    shuffle=False,
    collate_fn=lm_collate_fn
)
inputs, targets, lengths, mask = next(iter(test_loader))
print('输入：')
print(inputs)
print('-' * 60)
print('目标：')
print(targets)
print('-' * 60)
print('Mask：')
print(mask)
print('-' * 60)
print('每个样本的实际长度：')
print(lengths)
print('-' * 60)


输入：
tensor([[164,  34,  34,  ..., 172, 102, 173],
        [ 56,  51,  57,  ...,   1,   1,   1],
        [ 67,  68,   4,  ...,   1,   1,   1],
        ...,
        [  4,  93, 102,  ...,   1,   1,   1],
        [193, 149,   1,  ...,   1,   1,   1],
        [103, 104,   1,  ...,   1,   1,   1]], device='cuda:0')
------------------------------------------------------------
目标：
tensor([[ 34,  34,  88,  ..., 102, 173,   1],
        [ 51,  57,  58,  ...,   1,   1,   1],
        [ 68,   4,  13,  ...,   1,   1,   1],
        ...,
        [ 93, 102,   1,  ...,   1,   1,   1],
        [149,   1,   1,  ...,   1,   1,   1],
        [104,   1,   1,  ...,   1,   1,   1]], device='cuda:0')
------------------------------------------------------------
Mask：
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.

In [275]:
#定义网络 BiLSTM

class BiLSTM(nn.Module):
    """语言模型网络架构
    
    Args:
        vocab_size: 词表中的单词数目
        embedding_size: 词向量维度
        hidden_size: LSTM隐含状态的维度
        dropout: Dropout概率
    """
    
    def __init__(self, vocab_size, embedding_size=200, hidden_size=200, dropout=0.5,num_layers=1):
        """
            input_size(embedding_size) – The number of expected features in the input x
            hidden_size – The number of features in the hidden state h
            num_layers – Number of recurrent layers.
            batch_first – If True, then the input and output tensors are provided as (batch, seq, feature). Default: False
        """
        super(BiLSTM, self).__init__()
        self.num_layers=num_layers
        self.drop = nn.Dropout(dropout)
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.encoder = nn.LSTM(
                            input_size = embedding_size, 
                            hidden_size = hidden_size, 
                            num_layers = 1,
                            batch_first=True,bidirectional=True) 
        #  bidirectional ＝ True 设置False 可以进行对比，效果非常的明显
            
        self.decoder = nn.Linear(2 * hidden_size, vocab_size)
        
 
    
    def forward(self, inputs, lengths):
        # inputs shape: (batch_size, max_length)
        # x_emb shape: (batch_size, max_length, embed_size)
        x_emb = self.drop(self.embed(inputs))
        
        packed_emb = nn.utils.rnn.pack_padded_sequence(
            x_emb, 
            lengths, 
            batch_first=True
        )
        # 这里LSTM的h_0,c_0使用全0的默认初始化，LSTM层经过后丢弃
        packed_out, _ = self.encoder(packed_emb)
        # x_out shape: (batch_size, max_length, hidden_size)
        x_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True
        )
        
        # outputs shape: (batch, max_length, vocab_size)
        return self.decoder(self.drop(x_out))
        
model = BiLSTM(len(corpus.vocab), 200, 200)
model.to(device)#将模型加载到指定的设备，device上面有定义 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')，就是cuda



BiLSTM(
  (drop): Dropout(p=0.5, inplace=False)
  (embed): Embedding(269, 200)
  (encoder): LSTM(200, 200, batch_first=True, bidirectional=True)
  (decoder): Linear(in_features=400, out_features=269, bias=True)
)

In [276]:
#打印测试上面
inputs, targets, lengths, mask = next(iter(test_loader))
outputs = model(inputs, lengths)
print('模型输入Shape：', inputs.shape)
print('模型输出Shape：', outputs.shape)


模型输入Shape： torch.Size([64, 25])
模型输出Shape： torch.Size([64, 25, 269])


In [277]:
#定义损失函数
class MaskCrossEntropyLoss(nn.Module):
    def __init__(self):
        super(MaskCrossEntropyLoss, self).__init__()
        self.celoss = nn.CrossEntropyLoss(reduction='none')
    
    def forward(self, outputs, targets, mask):
        # outputs shape: (batch_size * max_len, vocab_size)
        
        outputs = outputs.view(-1, outputs.size(2)) # outputs.size(2) 获取第2 个维度的大小
        #print('outputs: ',outputs.shape)
        # targets shape: (batch_size * max_len)
        targets = targets.view(-1)
        #print('targets: ',targets.shape)
        # mask shape: (batch_size * max_len)
        mask = mask.view(-1)
        #print('mask: ',mask.shape)
        #print('loss: ',self.celoss(outputs, targets).shape)
        
        #print(mask)
        #print(self.celoss(outputs, targets))
        loss = self.celoss(outputs, targets) * mask # 把pading的loss设置为0
        return torch.sum(loss) / torch.sum(mask) # 非0的loss之和 ➗ 所有非0的个数  平均loss


In [278]:
#测试上面
inputs, targets, lengths, mask = next(iter(test_loader))
outputs = model(inputs, lengths)
criterion = MaskCrossEntropyLoss().to(device)
loss = criterion(outputs, targets, mask)
print('损失值：', loss)


损失值： tensor(5.6059, device='cuda:0', grad_fn=<DivBackward0>)


In [279]:
#模型训练预测
#定义学习器
class LanguageModelLearner:
    def __init__(self, corpus, embedding_size=200, hidden_size=200, dropout=0.5, 
                 batch_size=128, early_stopping_round=5):
        self.corpus = corpus
        self.batch_size = batch_size # 每次加载记录数
        self.early_stopping_round = early_stopping_round
        self.model = BiLSTM(len(corpus.vocab), embedding_size, hidden_size, dropout).to(device) # 初始化model
        self.criterion = MaskCrossEntropyLoss().to(device) # 自定义CrossEntroyLoss(删除了padding 的数据－效果会更好)
        self.optimizer = torch.optim.Adam(self.model.parameters()) # 采用Adam 剃度下降算法 更新权重 
        self.history = defaultdict(list) # list ，存储训练结果
        
    def fit(self, num_epochs):
        # 定义训练集dataloader
        train_set = MyDataSet(self.corpus.train_data)
        train_loader = torch.utils.data.DataLoader(
            dataset=train_set,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=lm_collate_fn # 使用自定义 lm_collate_fn 对训练集长度统一化操作
        )
        
        # 定义验证集dataloader
        valid_set = MyDataSet(self.corpus.valid_data)
        valid_loader = torch.utils.data.DataLoader(
            dataset=valid_set,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=lm_collate_fn
        )
        
        # 记录验证集没有提高的轮数，用于EarlyStopping
        no_improve_round = 0
        
        for epoch in range(num_epochs):            
            train_loss, train_acc, train_words = self._make_train_step(train_loader)
            #if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}:')
            print('Train Step --> Loss: {:.3f}, Acc: {:.3f}, Words: {}'.format(
                train_loss, train_acc, train_words))
            # 记录训练信息
            self.history['train_loss'].append(train_loss)
            self.history['train_acc'].append(train_acc)
        
            valid_loss, valid_acc, valid_words = self._make_valid_step(valid_loader)
            #if (epoch + 1) % 10 == 0:
            print('Valid Step --> Loss: {:.3f}, Acc: {:.3f}, Words: {}'.format(
                valid_loss, valid_acc, valid_words))
            self.history['valid_loss'].append(valid_loss)
            self.history['valid_acc'].append(valid_acc)
            
            # 根据验证集的准确率进行EarlyStopping
            if self.history['valid_acc'][-1] < max(self.history['valid_acc']):
                no_improve_round += 1
            else:
                no_improve_round = 0
            if no_improve_round == self.early_stopping_round:
                print(f'Early Stopping at Epoch {epoch+1}')
                break
            
        
    def predict(self):
        test_set = MyDataSet(self.corpus.test_data)
        # 这里注意，为了方便之后分析不要shuffle，batch_size设置为1
        test_loader = torch.utils.data.DataLoader(
            dataset=test_set,
            batch_size=1,
            shuffle=False,
            collate_fn=lm_collate_fn
        )
        
        # 验证模式
        self.model.eval()
        
        # 总损失
        total_loss = 0.0
        # 正确预测的数目，单词总数
        total_correct, total_words = 0, 0
        # 预测结果字典，包含preds和targets
        test_result = defaultdict(list) 
        
        with torch.no_grad():
            for inputs, targets, lengths, mask in test_loader:
                # 计算模型输出
                outputs = self.model(inputs, lengths)
                
                # 统计当前预测正确的数目
                total_correct += (outputs.argmax(-1) == targets).sum().item()
                # 统计当前总预测单词数
                total_words += torch.sum(lengths).item()
                
                # 记录结果
                test_result['preds'].append(outputs.argmax(-1).data.cpu().numpy()[0])
                test_result['targets'].append(targets.data.cpu().numpy()[0])
                
                # 计算模型Mask交叉熵损失
                loss = self.criterion(outputs, targets, mask)
                # 统计总损失
                total_loss += loss.item() * torch.sum(mask).item()
        return total_loss / total_words, total_correct / total_words, total_words, test_result
        
    def _make_train_step(self, train_loader):
        # 训练模式
        self.model.train()
        
        # 总损失
        total_loss = 0.0
        # 正确预测的数目，单词总数
        total_correct, total_words = 0, 0
        
        for inputs, targets, lengths, mask in train_loader:
            # 计算模型输出
            outputs = self.model(inputs, lengths)
            
            # 统计当前预测正确的数目
            total_correct += (outputs.argmax(-1) == targets).sum().item()
            # 统计当前总预测单词数
            total_words += torch.sum(lengths).item()
            
            # 计算模型Mask交叉熵损失
            loss = self.criterion(outputs, targets, mask)
            # 统计总损失
            total_loss += loss.item() * torch.sum(mask).item() # torch.sum(mask) 表示真正有效的单词
                        
            # 反向传播
            self.optimizer.zero_grad()
            loss.backward() # w = d_loss/dw 
            self.optimizer.step() #w -= w-learning_rate*w
        return total_loss / total_words, total_correct / total_words, total_words
    
    def _make_valid_step(self, valid_loader):
        # 验证模式
        self.model.eval()
        
        # 总损失
        total_loss = 0.0
        # 正确预测的数目，单词总数
        total_correct, total_words = 0, 0
        
        with torch.no_grad():
            for inputs, targets, lengths, mask in valid_loader:
                # 计算模型输出
                outputs = self.model(inputs, lengths)
                
                # 统计当前预测正确的数目
                total_correct += (outputs.argmax(-1) == targets).sum().item()
                # 统计当前总预测单词数
                total_words += torch.sum(lengths).item()
                
                # 计算模型Mask交叉熵损失
                loss = self.criterion(outputs, targets, mask)
                # 统计总损失
                total_loss += loss.item() * torch.sum(mask).item()
        return total_loss / total_words, total_correct / total_words, total_words


In [280]:
#设定参数，开始训练
torch.cuda.empty_cache()
learner = LanguageModelLearner(corpus, embedding_size=200, hidden_size=200, dropout=0.5, batch_size=128)
learner.fit(10)


Epoch 1:
Train Step --> Loss: 5.594, Acc: 0.008, Words: 1367
Valid Step --> Loss: 5.489, Acc: 0.153, Words: 1367
Epoch 2:
Train Step --> Loss: 5.506, Acc: 0.038, Words: 1367
Valid Step --> Loss: 5.388, Acc: 0.351, Words: 1367
Epoch 3:
Train Step --> Loss: 5.421, Acc: 0.138, Words: 1367
Valid Step --> Loss: 5.286, Acc: 0.454, Words: 1367
Epoch 4:
Train Step --> Loss: 5.338, Acc: 0.231, Words: 1367
Valid Step --> Loss: 5.181, Acc: 0.509, Words: 1367
Epoch 5:
Train Step --> Loss: 5.258, Acc: 0.639, Words: 1367
Valid Step --> Loss: 5.070, Acc: 2.357, Words: 1367
Epoch 6:
Train Step --> Loss: 5.161, Acc: 2.182, Words: 1367
Valid Step --> Loss: 4.950, Acc: 1.861, Words: 1367
Epoch 7:
Train Step --> Loss: 5.062, Acc: 1.890, Words: 1367
Valid Step --> Loss: 4.818, Acc: 2.364, Words: 1367
Epoch 8:
Train Step --> Loss: 4.947, Acc: 1.735, Words: 1367
Valid Step --> Loss: 4.670, Acc: 2.355, Words: 1367
Epoch 9:
Train Step --> Loss: 4.825, Acc: 1.750, Words: 1367
Valid Step --> Loss: 4.499, Acc: 2.

In [281]:
#模型预测
test_loss, test_acc, test_words, test_result = learner.predict()
print('测试集上的结果 --> Loss: {:.3f}, Acc: {:.3f}, Words: {}'.format(
    test_loss, test_acc, test_words))


测试集上的结果 --> Loss: 4.301, Acc: 0.530, Words: 1367


In [291]:
print('预测句子数量：', len(test_result['preds']))
print('-' * 60)


sample_index = 100#好好处理下数据。
print('结果样例：')
print('预测值\t', test_result['preds'][sample_index])
print('实际值\t', test_result['targets'][sample_index])
print('预测句子\t', corpus.index_to_sentence(test_result['preds'][sample_index]))
print('实际句子\t', corpus.index_to_sentence(test_result['targets'][sample_index]))


预测句子数量： 166
------------------------------------------------------------
结果样例：
预测值	 [51 51 37 38 39 40  1]
实际值	 [131  51  37  38  39  41   1]
预测句子	 椎 椎 退 行 性 改 <PAD>
实际句子	 腰 椎 退 行 性 变 <PAD>
