In [1]:
import pandas as pd
import numpy as np
import math

import os
import logging
import time

from sklearn.metrics import f1_score

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from TorchCRF import CRF
from torch.utils.data import Dataset,DataLoader, RandomSampler

from IPython.core.interactiveshell import InteractiveShell
with open('./dataset/train.txt') as file:
     content = file.readlines()

message = []
tokens = []
tag = []

for line in content:
    if line == '\n':
        message.append((tokens,tag))
        tokens = []
        tag = []
    else:
        line = line.rstrip('\n')
        contents = line.split(' ')
        tokens.append(contents[0])
        tag.append(contents[-1])

In [2]:
words = []
labels = []
for i in range(len(message)):
    words += message[i][0]
    labels += message[i][1]

In [3]:
words_dict = list(set(words))
labels_dict = list(set(labels))

In [4]:
words_dict.append("<pad>")
words_dict

['continued',
 '339-4',
 'terrorism',
 'Portugal',
 '1910',
 'Hotdogs',
 'Kazakhstan',
 'MONTPELIER',
 'Yone',
 'Wisc',
 'Man-of-the-Match',
 '+12',
 'situated',
 'Amica',
 'PVS',
 'sectoral',
 '3-mth',
 'Suwon',
 'WARSAW',
 'Haneda',
 'debt',
 'burn',
 '1918',
 'service',
 '251/08',
 'ANCHORAGE',
 '9-242',
 'invaluable',
 'Korei',
 'sense',
 'slim',
 '502',
 'sidestepped',
 '951',
 'Canonica',
 'adrift',
 'tossing',
 '21,000',
 'dispatching',
 'conquered',
 'P.W.',
 '5-12',
 'excellent',
 'Sacchi',
 'Wyborcza',
 '251/04',
 'HOME',
 'Coast',
 'Paulo',
 'Group',
 'al-Akhbar',
 'adequate',
 'spectacle',
 'flood-hit',
 'Svoboda',
 'Worku',
 '869.3',
 'celebrate',
 'Matusevitch',
 'Para',
 'Barroso',
 'Uniceramic',
 'Artur',
 'geology',
 'Bogota',
 'Petah',
 '93.12',
 '4746',
 '+387-71-663-864',
 'destitute',
 'Consequently',
 'TRIO',
 'infringement',
 '3.83',
 'FR',
 'penalty',
 'Posts',
 'Zizkov',
 'Velez',
 'Ankara',
 'Home',
 'Reef',
 'Wash.-based',
 'Amsterdam-Rotterdam',
 'ex-preside

In [5]:
message_index = []
labels_index = []
for msg in message:
    word_index = []
    label_index = []
    if len(msg[0]) < 37:
        pad_len = 37 - len(msg[0])
        for m in msg[0]:
            word_index.append(words_dict.index(m))
        word_index = word_index + [len(words_dict) - 1] * pad_len
        for l in msg[1]:
            label_index.append(labels_dict.index(l))
        label_index = label_index + [labels_dict.index('O')] * pad_len
    elif len(msg[0]) > 37:
        for m in msg[0]:
            word_index.append(words_dict.index(m))
        word_index = word_index[:36]
        for l in msg[1]:
            label_index.append(labels_dict.index(l))
        label_index = label_index[:36]
    message_index.append(torch.Tensor(word_index))
    labels_index.append(torch.Tensor(label_index))

In [6]:
message_index[0]

tensor([19338., 19287., 20657., 13001., 11787., 14450., 16487.,  1124., 13402.,
        23623., 23623., 23623., 23623., 23623., 23623., 23623., 23623., 23623.,
        23623., 23623., 23623., 23623., 23623., 23623., 23623., 23623., 23623.,
        23623., 23623., 23623., 23623., 23623., 23623., 23623., 23623., 23623.,
        23623.])

In [32]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, pad_index,batch_size):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.pad_idx = pad_index
        self.batch_size = batch_size
        
        self.word_embeds = nn.Embedding(vocab_size,embedding_dim,padding_idx=self.pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim//2, num_layers = 1, bidirectional = True)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size)
    
   
    def forward(self, sentence, tags=None):     #sentence=(batch,seq_len)   tags=(batch,seq_len)
        embeds = self.word_embeds(sentence.long())
        # self.hidden = (torch.randn(2,self.batch_size,self.hidden_dim//2),torch.randn(2,self.batch_size,self.hidden_dim//2))
        lstm_out = self.lstm(embeds) 
        lstm_feats = self.hidden2tag(lstm_out)   
        if tags is not None:
            loss = -1.*self.crf(emissions=lstm_feats,tags=tags.permute(1,0),mask=mask.permute(1,0),reduction='mean')   #outputs=(batch_size,)   输出log形式的likelihood
            return loss
        else:
            prediction = self.crf.decode(emissions=lstm_feats,mask=mask.permute(1,0))   #mask=attention_masks.byte()
            return prediction

In [33]:
MAX_LEN = 37    #句子的标准长度
BATCH_SIZE = 8  #minibatch的大小
EMBEDDING_DIM = 9
HIDDEN_DIM = 9
def get_label_dict():
    return dict({(labels_dict[i],i) for i in range(len(labels_dict))})
labels_tensor = get_label_dict()
print(labels_tensor)
model = BiLSTM_CRF(len(words_dict), labels_tensor, EMBEDDING_DIM, HIDDEN_DIM,len(words_dict)-1,BATCH_SIZE)

{'I-LOC': 3, 'B-PER': 0, 'B-MISC': 1, 'B-ORG': 6, 'I-MISC': 5, 'O': 4, 'B-LOC': 7, 'I-PER': 2, 'I-ORG': 8}


In [34]:
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
#显示模型基本参数
model

BiLSTM_CRF(
  (word_embeds): Embedding(23624, 9, padding_idx=23623)
  (lstm): LSTM(9, 4, bidirectional=True)
  (hidden2tag): Linear(in_features=9, out_features=9, bias=True)
  (crf): CRF()
)