# $$任务四：基于LSTM+CRF的序列标注$$

# 1.数据集预处理

#### 加载数据，创建字典

In [3]:
import torch
import os
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import collections
import os
import random
import time
import torch.nn.functional as F
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
from tqdm import tqdm

from nltk import word_tokenize
import numpy as np
import sys
sys.path.append('..')
device=torch.device('cuda'if torch.cuda.is_available() else 'cpu')

In [4]:
data_path=r'E:\NLP_jupyternotebook\Fudan_NLP_beginner\datasets\CoNLL-2003'
class CoNLL2003():
    def read(self,data_path):
        docs=['train','dev','test']
        extension='.txt'
        dataset={}
        for doc in tqdm(docs):
            doc_path=os.path.join(data_path,doc+extension)
            dataset[doc]=self.read_file(str(doc_path))
        return dataset
    def read_file(self,doc_path):
        samples=[]                              #目标形式[(['he','is','a','dog'],['o','o','o','o'])]
        tokens=[]
        tags=[]
        with open(doc_path,'r',encoding='utf-8') as fb:
            for line in fb:                     # example:"briefing NN I-NP O"
                line=line.strip('\n')
                if line=='-DOCSTART- -X- -X- O': #去除每个txt文件的数据头
                    pass
                elif line=='':                  #观察文本中可以看到每段话是以一行空白分割开的
                    if len(tokens)!=0:         #除了最开始的情况外，就是已经读取完了一句话，然后加入到samples里面
                        samples.append((tokens,tags))
                        tokens=[]
                        tags=[]
                else:
                    contents=line.split(' ')
                    tokens.append(contents[0])
                    tags.append(contents[-1])
        return samples   

In [5]:
conll2003=CoNLL2003()
raw_data=conll2003.read(data_path)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  9.78it/s]


In [6]:
print('-'*40,'测试','-'*40)
for sample in raw_data['train'][:3]:
    print(sample)
len(raw_data['train'])


---------------------------------------- 测试 ----------------------------------------
(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'])
(['Peter', 'Blackburn'], ['B-PER', 'I-PER'])
(['BRUSSELS', '1996-08-22'], ['B-LOC', 'O'])


14041

In [75]:
def get_vocab(data):
    counter=collections.Counter([tk for st,_ in data for tk in st])
    return Vocab.Vocab(counter)
def get_tag_vocab(data):
    counter=collections.Counter([tk for _,st in data for tk in st])
    return Vocab.Vocab(counter),counter
vocab=get_vocab(raw_data['train']+raw_data['dev']+raw_data['test'])
tag_vocab,tag_nums=get_tag_vocab(raw_data['train'])

In [8]:
label2id= {'<START>': 0, 'I-LOC': 1,'O': 2, 'B-LOC': 3, 'B-PER': 4, 'B-ORG': 5,'I-PER': 6,'I-ORG': 7,'B-MISC': 8,'I-MISC': 9,'<END>': 10}
id2label=[item[0] for item in label2id.items()]

In [9]:
print('-'*40,'测试','-'*40)
label2id

---------------------------------------- 测试 ----------------------------------------


{'<START>': 0,
 'I-LOC': 1,
 'O': 2,
 'B-LOC': 3,
 'B-PER': 4,
 'B-ORG': 5,
 'I-PER': 6,
 'I-ORG': 7,
 'B-MISC': 8,
 'I-MISC': 9,
 '<END>': 10}

In [10]:
id2label

['<START>',
 'I-LOC',
 'O',
 'B-LOC',
 'B-PER',
 'B-ORG',
 'I-PER',
 'I-ORG',
 'B-MISC',
 'I-MISC',
 '<END>']

In [11]:
print('-'*40,'测试','-'*40)
vocab.stoi

---------------------------------------- 测试 ----------------------------------------


defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x000001BED0F24D08>>,
            {'<unk>': 0,
             '<pad>': 1,
             ',': 2,
             '.': 3,
             'the': 4,
             'of': 5,
             'in': 6,
             'to': 7,
             'a': 8,
             '(': 9,
             ')': 10,
             'and': 11,
             '"': 12,
             'on': 13,
             'said': 14,
             "'s": 15,
             'for': 16,
             '-': 17,
             '1': 18,
             'The': 19,
             'was': 20,
             'at': 21,
             '2': 22,
             '3': 23,
             '0': 24,
             'with': 25,
             'that': 26,
             'from': 27,
             'by': 28,
             ':': 29,
             'is': 30,
             '4': 31,
             'as': 32,
             'he': 33,
             'had': 34,
             'his': 35,
             'were': 36,
             'it': 37,
             'not

In [12]:
len(vocab.itos),len(vocab.stoi)

(30291, 30291)

#### 建立数据迭代器：每次迭代的数据包含训练集[batch，seq_len]，标签[batch，seq_len]，真实单词数[batch]

首先创建自定义的数据类，然后定制collate_fn设置动态padding

In [13]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
class MyDataSet(Dataset):
    def __init__(self,raw_data,vocab,label2id):
        self.raw_data=raw_data
        self.vocab=vocab.stoi
        self.label2id=label2id
        self.x_list=[[vocab.stoi[w] if w in vocab.stoi else vocab.stoi['<unk>'] for w in item[0]]for item in raw_data]
        self.y_list=[[label2id[lable]for lable in item[1]]for item in raw_data]
    def __getitem__(self,item):
        return torch.Tensor(self.x_list[item]),torch.Tensor(self.y_list[item])
    def __len__(self):
        return len(self.raw_data)
def collate_fn(batch):
    x_list=[x[0]for x in batch]
    y_list=[x[1]for x in batch]
    lengths=[len(item[0])for item in batch]
    x_list=pad_sequence(x_list,padding_value=vocab.stoi['<pad>'])
    y_list=pad_sequence(y_list,padding_value=-1)
    return x_list.long().transpose(0,1),y_list.long().transpose(0,1),lengths


In [14]:
train_set=MyDataSet(raw_data['train'],vocab,label2id)
test_set=MyDataSet(raw_data['test'],vocab,label2id)

查看数据类和数据迭代器

In [15]:
def id2sentence(sequence,vocab):
    return [vocab.itos[int(w)]for w in sequence]
print('-'*40,'测试','-'*40)
print('原始数据：\n',raw_data['train'][0])
print('自定义数据：\n',train_set[0],id2sentence(train_set[0][0],vocab))

---------------------------------------- 测试 ----------------------------------------
原始数据：
 (['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'])
自定义数据：
 (tensor([1.0490e+03, 1.4971e+04, 2.3900e+02, 8.3900e+02, 7.0000e+00, 4.0960e+03,
        2.3000e+02, 1.0093e+04, 3.0000e+00]), tensor([5., 2., 8., 2., 2., 2., 8., 2., 2.])) ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']


In [16]:
train_iter=DataLoader(dataset=train_set,batch_size=2,collate_fn=collate_fn)
test_iter=DataLoader(dataset=test_set,batch_size=2,collate_fn=collate_fn)
print('-'*40,'测试','-'*40)
for idx,x in enumerate(train_iter):
    print(idx,'th\t训练集batch:',x[0],x[0].shape,"\n\t标签 batch:",x[1].shape,"\n\t真实单词数：",x[2])
    #if idx==4:
    break

---------------------------------------- 测试 ----------------------------------------
0 th	训练集batch: tensor([[ 1049, 14971,   239,   839,     7,  4096,   230, 10093,     3],
        [  689,  2077,     1,     1,     1,     1,     1,     1,     1]]) torch.Size([2, 9]) 
	标签 batch: torch.Size([2, 9]) 
	真实单词数： [9, 2]


In [17]:
def get_mask(lengths):
    """
    Parameters:
        lengths--由数据迭代器返回的第三个数据，包含了一个batch中的每个样本的真实单词数，也就是上面的真实单词数 list of int
    Return:
        mask-- 由Lengths生成的掩码矩阵，在真实单词上为1，填充的地方为0
    """
    max_len=max(lengths)
    mask=torch.Tensor()
    for length in lengths:
        mask=torch.cat((mask,torch.Tensor([[1]*length+[0]*(max_len-length)])),dim=0)
    return mask

In [18]:
print('-'*40,'测试','-'*40)
a=[2,3,4]
b=get_mask(a)
b

---------------------------------------- 测试 ----------------------------------------


tensor([[1., 1., 0., 0.],
        [1., 1., 1., 0.],
        [1., 1., 1., 1.]])

In [19]:
len(vocab.stoi)

30291

# 2.建立模型

模型分为两个大部分，第一个是LSTM模型，目的是通过输出数据得到发射矩阵；第二个是CRF模型，目的是通过已知的观测序列进行参数学习得到最优的参数

In [20]:
from torch import nn
from torch.nn import init
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence

class LstmCrf(nn.Module):
    def __init__(self,input_dim,hidden_dim,label2id,vocab,device):
        super(LstmCrf,self).__init__()
        #参数
        self.input_dim=input_dim
        self.hidden_dim=hidden_dim
        self.label2id=label2id
        self.n_class=len(label2id)
        self.vocab=vocab
        self.device=device
        #嵌入层 input:[batch,seq_len]  output:[batch,seq_len,input_dim]
        self.embedding=nn.Embedding(len(vocab),input_dim)
        
        #LSTM层  input:[batch,seq_len,input_dim] output[batch,seq_len,2*hidden_num]
        self.lstm=nn.LSTM(input_size=self.input_dim,hidden_size=self.hidden_dim,num_layers=2,batch_first=True,bidirectional=True)
        
        #输出映射tag output[batch,n_class]
        self.hidden2tag=nn.Linear(2*hidden_dim,self.n_class)
        
        #转移矩阵，因为是需要学习的，所以用paramet添加,transition[i,j]means transition probability from i to j
        self.transition_matrix=nn.Parameter(torch.rand(self.n_class,self.n_class))
        self.reset_parameters()
    def reset_parameters(self):
        init.normal_(self.transition_matrix)
        #转移矩阵一行一列都零，目的是为了使从结束转移到其他的概率为0，和从其他转移到开始概率为0，log之后就是非常小，取-10000
        #同时通过detach()从计算图中进行分离，使这一部分参数不会改变
        #transition_matrix[i,j]表示当前状态为i，转移到状态j的得分，由于只会在句子开头添加<start>，结尾添加<end>所以不存在
        #从其他状态转移到<START>，或者<END>转移到其他状态.
        self.transition_matrix.detach()[self.label2id['<END>'],:]=-10000
        self.transition_matrix.detach()[:,label2id['<START>']]=-10000
    def forward(self,input_data,real_length):
        """
        Params:
            input_data [batch,seq_len] 从数据迭代器迭代出的一批次样本
        Return:
            output [batch,seq_len,n_class] LSTM的输出经过线性层生成的发射矩阵
        """
        embed=self.embedding(input_data) #[batch,seq_len,input_dim]
        #print("embed dim:",embed.shape)
        packed_embed=pack_padded_sequence(input=embed,lengths=real_length,batch_first=True,enforce_sorted = False)#未排序的序列
        packed_output,_=self.lstm(packed_embed)
        output,_=pad_packed_sequence(packed_output,batch_first=True) #[batch,seq_len,hidden_dim]
        #print('\noutput dim:',output.shape)
        output=self.hidden2tag(output)             #[batch,seq,n_class]
        return output
    def forward_alpha(self,emission,mask):
        """
        主要目的：
                计算所有可能路径的分数指数和的对数具体来说就是softmax分母的log值，由于每个时间步的每个输出都有m种可能，
                所以必须采用动态规划的方法来计算所有路径的概率 ，具体来说是维护一个n_class维度的向量alpha_t,
                使alpha_t[i]代表t时刻，y_t=i的所有路径指数和的对数logZ(y_t=i)，
                即alpha_t=[log(Z(y_t=0)),log(Z(y_t=1)),......,log(Z(y_t)=n_class)]
                利用关系log(Z(y_t+1=j))=sum(alpha_t+transition[:j]+emission[t+1,j]) 来更新alpha
                
                具体推导过程参考文章：https://zhuanlan.zhihu.com/p/97829287
                代码部分参考pytorch官方tutorial :Advanced: Making Dynamic Decisions and the Bi-LSTM CRF
        注意事项：
                在每个样本开头添加START最后结束的时候添加END，然后计算log_sum_exp得到最终所有路径的分数的指数和的对数
        
        Parameter:
                emission torch.tensor ---[batch,seq_len,n_class]       LSTM+linear后的output
                mask torch.tensor ---[batch,seq_len]                   1=real_word 0=padding_word
        Return:
                total_score torch.tensor [batch] 
        
        """
        #mask=torch.tensor(get_mask(lengths),device=device)
        emission=emission.to(device)
        #print("emission.shape:",emission.shape)
        batch_size,seq_len=mask.size()
        #time_step=0的时候，状态为START的概率为1，其他为0，对应的log分别为0和-10000，表示起始只有START一条路劲
        log_alpha_init=torch.full((batch_size,self.n_class),fill_value=-10000,device=device)
        log_alpha_init[:,label2id['<START>']]=0
        log_alpha=log_alpha_init #[batch,n_class]
        for time_step in range(0,seq_len):
            mask_t=mask[:,time_step].unsqueeze(-1) #[batch,1]
            emission_t=emission[:,time_step,:].unsqueeze(1).expand(-1,self.n_class,-1) #[batch,n_class,n_class] 进行了复制扩行的操作
            """
            除去batch维度(dim=0)以一个样本进行说明：
                这一步的操作是为了进行行的复制，假设一个time_step的发射矩阵为[E_t(0),E_t(1),....,,E_t(n)] [n_class]
                是一个torch.tensor([n])的tensor
                进行unsqueeze(1)后实际变为 [[E_t(0),E_t(1),....,,E_t(n)]] 注意方括号数 [1,n_class]
                进行expand(-self.n_class,-1)的操作后，实际上进行了行的复制
                即：
                [[E_t(0),E_t(1),....,,E_t(n)],
                 [E_t(0),E_t(1),....,,E_t(n)],
                 [E_t(0),E_t(1),....,,E_t(n)],
                 [E_t(0),E_t(1),....,,E_t(n)],
                 [E_t(0),E_t(1),....,,E_t(n)]] 直到行数也为n此时这个矩阵的形状为[n_class,n_class]
            
            """
            log_alpha_matrix=log_alpha.unsqueeze(2).expand(-1,-1,self.n_class)         #[batch,n_class,n_class] 进行了复制扩列的操作
            """
            除去batch维度(dim=0)以一个样本进行说明：
                log_alpha保存的是当前时刻，每个label对应的所有路径的得分的指数和的对数
                这一步的操作是为了进行列的复制，假设一个time_step的log_alpha矩阵为[alpha_t(0),alpha_t(1),....,alpha_t(n)]
                是一个torch.tensor([n])的tensor
                unsqueeze(2)操作后，实际变为[[alpha_t(0)],[alpha_t(1)],....,[alpha_t(n)]] 注意方括号
                expand(-1,-1,self.n_class) 的操作后，实际上进行了列的复制
                即
                [[alpha_t(0),alpha_t(0),alpha_t(0),alpha_t(0),alpha_t(0),alpha_t(0),alpha_t(0)],
                 [alpha_t(1),alpha_t(1),alpha_t(1),alpha_t(1),alpha_t(1),alpha_t(1),alpha_t(1)]
                 ,....,
                 [alpha_t(n),alpha_t(n),alpha_t(n),alpha_t(n),alpha_t(n),alpha_t(n),alpha_t(n)]]
            
            """
            """
            transition_matrix:
                [[T[0,0],T[0,1],....T[0,n]],
                 [T[1,0],T[1,1],....T[1,n]],
                 [T[2,0],T[2,1],....T[2,n]],
                 ...
                 [T[n,0],T[n,1],....T[n,n]]
            """
            #print("log_alpha_matrix:",log_alpha_matrix.device,"\n\nlog_M_matrix:",log_M_matrix.device,"\n\nmask:",mask_t.device)
            add_matrix=log_alpha_matrix+emission_t+self.transition_matrix  #[batch,n_class,n_class]
            log_alpha=torch.logsumexp(add_matrix,dim=1)*mask_t+log_alpha*(1-mask_t) #[batch,n_class]
            """
            对上述三个矩阵进行列方向的相加，这里因为加入了batch维度，所以dim=1
            满足alpha_t+1(j)=∑(alpha_t(i)+T[i:j]+E[t+1,j] for i in  range(n_class))
            
            按批次更新log_alpha时，利用到了mask矩阵。当某个数据对应的mask值为1时，表明该个数据还没有穷尽实际长度，
            可以更新，于是更新的数值乘以1保留，原来的log_alpha值乘以1-mask=0，消除。如果当前mask值为0，则表示其实
            这个数据已经结束，此时虽然新的log_alpha_matrix在对应的位置为0，但是依然存在转移矩阵的得分，所以
            要乘以0取消更新，最后将两者相加，最终更新该数据的log_alpha。
            """
        alpha=log_alpha+self.transition_matrix[:,label2id['<END>']].unsqueeze(0)#最后，加上到<END>标签的转移分数，得到最终结果。
        total_score=torch.logsumexp(alpha,dim=1) #[batch]
        return total_score
    def sentence_score(self,emission,labels,mask):
        """
        主要目的：
            计算一个批次的标签序列得分
        Parameters:
            emission 发射矩阵torch.tensor [batch,seq_len,n_class]
            labels   标签矩阵 torch.tensor [batch,seq_len]
            mask     掩码矩阵 torch.tensor [batch,seq_len]
        Return:
            socores  得分矩阵 torch.tensor [batch_size,1]
        """
        #mask=torch.tensor(get_mask(lengths),device=device)
        batch_size,seq_len,n_class=emission.size()
        #[batch,seq_len+1]
        labels=torch.cat([labels.new_full((batch_size,1),fill_value=label2id['<START>']),labels],1) #用new_full保持和labels同样的dtype和device
        scores=emission.new_zeros(batch_size) 
        for t in range(seq_len):
            mask_t=mask[:,t] #[batch,1]
            emission_t=emission[:,t,:] #[batch,n_class]
            #一个读取一个样本的发射分数，和对应的标签，计算在当前时间步的发射得分
            #最后所有Batch并在dim=0的维度连接起来最后[batch,1]；t+1是因为加了一个<START>
            emit_score=torch.cat([each_score[next_label].unsqueeze(-1) for each_score,next_label in zip(emission_t,labels[:,t+1])],dim=0)
            transition_score=torch.stack([self.transition_matrix[labels[b,t],labels[b,t+1]]for b in range(batch_size)])#[batch,1]
            #transtion_score是计算前一个时间步到当前时间步的标签转移得分所以从t开始
            scores+=(emit_score+transition_score)*mask_t #如果当前时间步的单词是填充的话就不计算分数
        """
        添加最后一步到<END>的转移分数，由于每个步骤的实际长度不同，用mask[b:].sum()-1+1来求最后一个单词数的位置，实际上是
        """
        transition_to_end=torch.stack([self.transition_matrix[label[mask[b,:].sum().long()],label2id['<END>']]for b,label in enumerate(labels)])
        scores+=transition_to_end
        return scores 
        
    def neg_log_likelihood(self,emission,labels,mask):
        mask=mask.cuda()
        alpha_score=self.forward_alpha(emission,mask)
        score_sentence=self.sentence_score(emission,labels,mask)
        loss=(alpha_score-score_sentence).sum()
        return loss

In [69]:
def viterbi_decode( feats, mask,transition,tag2idx):
    """
    :param feats: (seq_len, batch_size, tag_size)
    :param mask: (seq_len, batch_size)
    :return best_path: (seq_len, batch_size)
    """
    feats=feats.transpose(1,0).contiguous()
    mask=mask.transpose(1,0).contiguous()
    seq_len, batch_size, tag_size = feats.size()
    # initialize scores in log space
    scores = feats.new_full((batch_size, tag_size), fill_value=-10000)
    scores[:, tag2idx['<START>']] = 0
    pointers = []
    # forward
    for t, feat in enumerate(feats):
        # broadcast dimension: (batch_size, next_tag, current_tag)
        scores_t = scores.unsqueeze(1) + transition.unsqueeze(0)  # (batch_size, tag_size, tag_size)
        # max along current_tag to obtain: next_tag score, current_tag pointer
        scores_t, pointer = torch.max(scores_t, -1)  # (batch_size, tag_size), (batch_size, tag_size)
        scores_t += feat
        pointers.append(pointer)
        mask_t = mask[t].unsqueeze(-1)  # (batch_size, 1)
        scores = scores_t * mask_t + scores * (1 - mask_t)
    pointers = torch.stack(pointers, 0) # (seq_len, batch_size, tag_size)
    scores +=transition[:,tag2idx['<END>']].unsqueeze(0)
    best_score, best_tag = torch.max(scores, -1)  # (batch_size, ), (batch_size, )
    # backtracking
    best_path = best_tag.unsqueeze(-1).tolist() # list shape (batch_size, 1)
    for i in range(batch_size):
        best_tag_i = best_tag[i]
        seq_len_i = int(mask[:, i].sum())
        for ptr_t in reversed(pointers[:seq_len_i, i]):
        # ptr_t shape (tag_size, )
            best_tag_i = ptr_t[best_tag_i].item()
            best_path[i].append(best_tag_i)
        # pop first tag
        best_path[i].pop()
        # reverse order
        best_path[i].reverse()
        
    return np.array(best_path).T


维特比预测最佳路劲这一步其实跟模型没有太大关系，因为本身就要禁止梯度的传递。并且要不断修改这个函数，如果放在类里面的话每次都要重新加载模型，加词向量比较麻烦，所以就单独拿出来在外面好修改

# 3.训练及评价模型

#### 加载glove词向量

In [21]:
device=torch.device('cuda' if torch.cuda.is_available else  'cpu')
model=LstmCrf(300,100,label2id,vocab.stoi,device)
glove_vocab=Vocab.GloVe(name='840B',dim=300,cache=r'E:\NLP_jupyternotebook\Fudan_NLP_beginner\.vector_cache')
"""
将词向量作为每个词的特征向量，先创建一个矩阵，在通过copy_()函数赋值给模型中的self.embedding
"""
def load_pretrain_embedding(words,pretrained_vocab): #两个都是vocab类
    embed=torch.zeros(len(words),pretrained_vocab.vectors[0].shape[0])
    oov_count=0
    for i,word in enumerate(words):#词典中的词可能没有对应的词向量，所以用try except来避免异常导致的程序中断
        try:
            idx=pretrained_vocab.stoi[word]
            embed[i,:]=pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count+=1
    if oov_count>0:
        print("有%d个 OOV单词."%oov_count)
    return embed
model.embedding.weight.data.copy_(load_pretrain_embedding(vocab.itos,glove_vocab))
#model.embedding.weight.requires_grad=False #已经是预训练好的词向量，这里就不再需要进行更新

有3442个 OOV单词.


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0828,  0.6720, -0.1499,  ..., -0.1918, -0.3785, -0.0659],
        ...,
        [ 0.3803,  0.0872, -0.1009,  ..., -0.3404, -0.2451, -0.0514],
        [-0.9065,  0.0039,  0.0859,  ..., -0.9103,  0.2001, -0.1049],
        [ 0.4403, -0.4592,  0.1195,  ...,  0.5576,  0.1360, -0.1152]])

试过了选择6B 100d的词向量，发现OOV单词过多了，所以就改成了840B,300d的词向量

In [22]:
model.transition_matrix[:,label2id['<END>']]

tensor([ 6.5786e-02,  1.0397e-01, -2.5876e-01, -1.1300e+00, -1.7075e-01,
         6.8380e-02,  1.5320e+00,  3.6261e-01, -5.9450e-01, -1.7887e-01,
        -1.0000e+04], grad_fn=<SelectBackward>)

In [23]:
train_iter=DataLoader(dataset=train_set,batch_size=32,collate_fn=collate_fn)
test_iter=DataLoader(dataset=test_set,batch_size=16,collate_fn=collate_fn)
lr,epoch=0.01,5
optimizer=torch.optim.Adam(model.parameters(),lr=lr)

In [68]:
import numpy as np 
def Assessment(test_iter,net,device,label2id):
    """
    主要目的：
            构建混淆矩阵confmatrix并计算F1,RECALL,PRECISION
            confmatrix(i,j)表示实际类i被预测成j的样本数，当且仅当i=j，即矩阵主对角线上的元素是被正确预测的
            总体准确率:all_prediction=sum(diag(confmatrix))/sum(confmatrix)
            某个类别的精度 label_i_precision=confmatrix(i,i)/sum(confmatrix(j,i)for j in range(n_class))
            某个类别召回率 label_i_recall=confmatrix(i,i)/sum(confmatrix(i,j)for j in range(n_class))
            宏召回率 all_recall=avg(label_i_prediciton for i in range(n_class))
            宏精度 all_precision=avg(label_i_precision for i in range(n_class))
            某个类别的F1 F1=2*precision*recall/(precision+recall)
    
    """
    n_class=len(label2id)
    #创建混淆矩阵
    confmatrix=np.zeros((n_class,n_class),dtype=np.int32)
    net.eval()
    ci=0
    with torch.no_grad():
        for X,y,lengths in test_iter:
            ci+=1
            X=X.long()
            y=y.long()
            X=X.to(device)
            y=y.to(device)
            mask=get_mask(lengths)
            mask=mask.cuda()
            emission=net.forward(X,lengths)
            #print(emission.device)
            y_hat=viterbi_decode(emission,mask,model.transition_matrix,label2id)
            #print('mask',mask)
            #print('y',y[0])
            #if(ci%10==0):
                #print('实际的标签序列：',[id2label[x] for x in y[0]],"实际标签长度",len(y[0]))
                #print('预测的最好标签序列：',[id2label[x] for x in y_hat[0]],"预测标签长度",len(y_hat[0]))
            for b in range(len(y_hat)):
                for i in range(int(sum(mask[b]))):
                    true_label_idx=y[b][i]
                    predict_label_idx=y_hat[b][i]
                    #predict_label_idx=y[b][i]
                    confmatrix[true_label_idx][predict_label_idx]+=1
    net.train()
    #求整体准确率
    total_sum=confmatrix.sum()
    correct_sum=(np.diag(confmatrix)).sum()
    total_accuracy=100*float(correct_sum)/float(total_sum)
    def calculate_precision(confMatrix,labelidx):
        """求某一个类别的precision"""
        label_total_sum = confMatrix.sum(axis=0)[labelidx]
        label_correct_sum = confMatrix[labelidx][labelidx]
        precision=0
        if label_total_sum != 0:
            precision = 100*float(label_correct_sum)/float(label_total_sum)
        return precision
    
    def calculate_recall(confMatrix,labelidx):
        label_total_sum = confMatrix.sum(axis=1)[labelidx]
        label_correct_sum = confMatrix[labelidx][labelidx]
        recall = 0
        if label_total_sum != 0:
            recall = 100*float(label_correct_sum)/float(label_total_sum)
        return recall
    
    def calculate_f1(prediction,recall):
        if (prediction+recall==0):
            return 0
        return round(2*prediction*recall/(prediction+recall),2)
    precisions=dict([labelname,calculate_precision(confmatrix,labelidx)] for labelname,labelidx in label2id.items())
    recalls=dict([labelname,calculate_recall(confmatrix,labelidx)] for labelname,labelidx in label2id.items())
    F1s=dict([labelname,calculate_f1(precisions[labelname],recalls[labelname])] for labelname,_ in label2id.items())
    return confmatrix,total_accuracy,precisions,recalls,F1s

In [71]:
from tqdm import tqdm
def train(train_iter,test_iter,net,optimizer,device,num_epochs):
    net=net.to(device)
    print("training on ",device)
    batch_count=0
    for epoch in range(num_epochs):
        net.train()
        train_l_sum,train_acc_sum,n,start=0.0,0.0,0,time.time()
        for X,y,lengths in tqdm(train_iter):
            X=X.long()
            y=y.long()
            X=X.to(device)
            y=y.to(device)
            n+=y.shape[0]
            mask=get_mask(lengths)
            emission=net.forward(X,lengths)
            #print(emission.shape)
            #return 0
            l=net.neg_log_likelihood(emission,y,mask)
            optimizer.zero_grad()
            l.backward()
            #torch.nn.utils.clip_grad_norm_(parameters=net.parameters(), max_norm=10)
            optimizer.step()
            train_l_sum+=l
            #print([mask.sum(dim=1)]==[len(x) for x in y_hat])
            #confmatrix,total_accuracy,precisions,recalls,F1s=Assessment(test_iter,net,device,label2id)
            #print("epoch %d\tloss %.3f\ttotal_accuracy %.2f\n各标签查准率:"%(epoch,train_l_sum,total_accuracy),precisions,"\n各标签召回率:",recalls,"\n各标签的F1值:",F1s,'\n',confmatrix)
        train_l_avg=train_l_sum/n
        confmatrix,total_accuracy,precisions,recalls,F1s=Assessment(test_iter,net,device,label2id)
        print("epoch %d\tloss %.3f\ttotal_accuracy %.2f\n各标签查准率:"%(epoch,train_l_avg,total_accuracy),precisions,"\n各标签召回率:",recalls,"\n各标签的F1值:",F1s,'\n',confmatrix)

In [72]:
train(train_iter,test_iter,model,optimizer,device,epoch)

  0%|                                                                                          | 0/439 [00:00<?, ?it/s]

training on  cuda


100%|████████████████████████████████████████████████████████████████████████████████| 439/439 [09:02<00:00,  1.24s/it]
  0%|                                                                                          | 0/439 [00:00<?, ?it/s]

epoch 0	loss 1.506	total_accuracy 93.77
各标签查准率: {'<START>': 0, 'I-LOC': 31.65680473372781, 'O': 98.44238382996457, 'B-LOC': 83.94833948339483, 'B-PER': 84.29423459244533, 'B-ORG': 73.76861397479954, 'I-PER': 83.61204013377926, 'I-ORG': 50.85995085995086, 'B-MISC': 64.72148541114058, 'I-MISC': 29.315068493150687, '<END>': 0} 
各标签召回率: {'<START>': 0, 'I-LOC': 83.26848249027238, 'O': 98.71235667458929, 'B-LOC': 82.27848101265823, 'B-PER': 79.1044776119403, 'B-ORG': 77.54364840457556, 'I-PER': 65.38796861377507, 'I-ORG': 24.790419161676645, 'B-MISC': 69.51566951566952, 'I-MISC': 49.53703703703704, '<END>': 0} 
各标签的F1值: {'<START>': 0, 'I-LOC': 45.87, 'O': 98.58, 'B-LOC': 83.11, 'B-PER': 81.62, 'B-ORG': 75.61, 'I-PER': 73.39, 'I-ORG': 33.33, 'B-MISC': 67.03, 'I-MISC': 36.83, '<END>': 0} 
 [[    0     0     0     0     0     0     0     0     0     0     0]
 [    0   214    14    20     1     0     0     6     0     2     0]
 [    0    29 37794    30    36    62    12   129   110    85     0]


100%|████████████████████████████████████████████████████████████████████████████████| 439/439 [09:07<00:00,  1.25s/it]
  0%|                                                                                          | 0/439 [00:00<?, ?it/s]

epoch 1	loss 0.390	total_accuracy 94.31
各标签查准率: {'<START>': 0, 'I-LOC': 42.295081967213115, 'O': 97.9980367844596, 'B-LOC': 78.67607162235485, 'B-PER': 85.59670781893004, 'B-ORG': 77.00879765395895, 'I-PER': 85.65697091273822, 'I-ORG': 68.4375, 'B-MISC': 67.34693877551021, 'I-MISC': 30.303030303030305, '<END>': 0} 
各标签召回率: {'<START>': 0, 'I-LOC': 50.19455252918288, 'O': 99.08585159453601, 'B-LOC': 87.40204942736588, 'B-PER': 77.61194029850746, 'B-ORG': 79.04876580373269, 'I-PER': 74.45510026155188, 'I-ORG': 26.227544910179642, 'B-MISC': 70.51282051282051, 'I-MISC': 41.666666666666664, '<END>': 0} 
各标签的F1值: {'<START>': 0, 'I-LOC': 45.91, 'O': 98.54, 'B-LOC': 82.81, 'B-PER': 81.41, 'B-ORG': 78.02, 'I-PER': 79.66, 'I-ORG': 37.92, 'B-MISC': 68.89, 'I-MISC': 35.09, '<END>': 0} 
 [[    0     0     0     0     0     0     0     0     0     0     0]
 [    0   129    31    90     0     1     1     2     2     1     0]
 [    0     8 37937    42    37    53    13    40    82    75     0]
 [    0 

100%|████████████████████████████████████████████████████████████████████████████████| 439/439 [09:13<00:00,  1.26s/it]
  0%|                                                                                          | 0/439 [00:00<?, ?it/s]

epoch 2	loss 0.154	total_accuracy 95.18
各标签查准率: {'<START>': 0.0, 'I-LOC': 60.182370820668694, 'O': 98.38277735888302, 'B-LOC': 88.29787234042553, 'B-PER': 91.19031607262946, 'B-ORG': 81.38181818181818, 'I-PER': 92.01793721973094, 'I-ORG': 51.94805194805195, 'B-MISC': 74.54545454545455, 'I-MISC': 51.492537313432834, '<END>': 0} 
各标签召回率: {'<START>': 0, 'I-LOC': 77.04280155642023, 'O': 98.82989004100608, 'B-LOC': 85.05123568414707, 'B-PER': 84.32835820895522, 'B-ORG': 67.36905478627332, 'I-PER': 89.45074106364429, 'I-ORG': 67.06586826347305, 'B-MISC': 70.08547008547009, 'I-MISC': 63.888888888888886, '<END>': 0} 
各标签的F1值: {'<START>': 0, 'I-LOC': 67.58, 'O': 98.61, 'B-LOC': 86.64, 'B-PER': 87.63, 'B-ORG': 73.72, 'I-PER': 90.72, 'I-ORG': 58.55, 'B-MISC': 72.25, 'I-MISC': 57.02, '<END>': 0} 
 [[    0     0     0     0     0     0     0     0     0     0     0]
 [    0   198    20    19     0     0     2    14     0     4     0]
 [    1    20 37839    30    43    48    10   178    66    52    

100%|████████████████████████████████████████████████████████████████████████████████| 439/439 [09:14<00:00,  1.26s/it]
  0%|                                                                                          | 0/439 [00:00<?, ?it/s]

epoch 3	loss 0.073	total_accuracy 95.55
各标签查准率: {'<START>': 0, 'I-LOC': 69.6969696969697, 'O': 98.59448747033821, 'B-LOC': 84.6685878962536, 'B-PER': 89.63058976020739, 'B-ORG': 83.68314150304671, 'I-PER': 94.98580889309366, 'I-ORG': 61.469933184855236, 'B-MISC': 73.11827956989248, 'I-MISC': 44.082840236686394, '<END>': 0} 
各标签召回率: {'<START>': 0, 'I-LOC': 62.64591439688716, 'O': 98.75414631598193, 'B-LOC': 88.5473176612417, 'B-PER': 86.00746268656717, 'B-ORG': 74.41300421432872, 'I-PER': 87.53269398430689, 'I-ORG': 66.10778443113773, 'B-MISC': 77.4928774928775, 'I-MISC': 68.98148148148148, '<END>': 0} 
各标签的F1值: {'<START>': 0, 'I-LOC': 65.98, 'O': 98.67, 'B-LOC': 86.56, 'B-PER': 87.78, 'B-ORG': 78.78, 'I-PER': 91.11, 'I-ORG': 63.7, 'B-MISC': 75.24, 'I-MISC': 53.79, '<END>': 0} 
 [[    0     0     0     0     0     0     0     0     0     0     0]
 [    0   161    28    32     1     1     1    13     0    20     0]
 [    0    10 37810    35    53    47    18   135    75   104     0]
 [  

100%|████████████████████████████████████████████████████████████████████████████████| 439/439 [09:03<00:00,  1.24s/it]


epoch 4	loss 0.043	total_accuracy 95.82
各标签查准率: {'<START>': 0, 'I-LOC': 67.76556776556777, 'O': 98.50563958625708, 'B-LOC': 86.5351131746953, 'B-PER': 90.59438275636839, 'B-ORG': 84.49511400651465, 'I-PER': 92.26569608735214, 'I-ORG': 70.83333333333333, 'B-MISC': 74.56395348837209, 'I-MISC': 41.23076923076923, '<END>': 0} 
各标签召回率: {'<START>': 0, 'I-LOC': 71.98443579766537, 'O': 98.99704860657664, 'B-LOC': 89.87341772151899, 'B-PER': 86.25621890547264, 'B-ORG': 78.08549066827213, 'I-PER': 88.40453356582388, 'I-ORG': 61.07784431137725, 'B-MISC': 73.07692307692308, 'I-MISC': 62.03703703703704, '<END>': 0} 
各标签的F1值: {'<START>': 0, 'I-LOC': 69.81, 'O': 98.75, 'B-LOC': 88.17, 'B-PER': 88.37, 'B-ORG': 81.16, 'I-PER': 90.29, 'I-ORG': 65.59, 'B-MISC': 73.81, 'I-MISC': 49.54, '<END>': 0} 
 [[    0     0     0     0     0     0     0     0     0     0     0]
 [    0   185    18    35     0     1     3     6     1     8     0]
 [    0    10 37903    31    47    57    16    79    63    81     0]
 [

In [77]:
tag_nums

Counter({'B-ORG': 6321,
         'O': 169578,
         'B-MISC': 3438,
         'B-PER': 6600,
         'I-PER': 4528,
         'B-LOC': 7140,
         'I-ORG': 3704,
         'I-MISC': 1155,
         'I-LOC': 1157})

训练集的loss在不断降低，同时测试集的准确率也在不断的提高，各个标签的F1值出现了波动，但总体是上升趋势。观察发现各个标签的F1值与标签的数量成正相关