In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification, BertModel

import torch.nn as nn
from torchcrf import CRF

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cpu


In [2]:
# 超参数
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 0.001

model_path = "../../../models/bert-base-chinese/"
tokenizer = BertTokenizer.from_pretrained(model_path)
bert_model = BertModel.from_pretrained(model_path)

In [3]:
def tokenize_and_preserve_labels(text, labels):
    tokenized_text = []
    token_labels = []
    for word, label in zip(text.split(), labels.split()):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
 
        tokenized_text.extend(tokenized_word)
        token_labels.extend([label] * n_subwords)
 
    return tokenized_text, token_labels
 
def pad_sequences(sequences, max_len, padding_value=0):
    padded_sequences = torch.zeros((len(sequences), max_len)).long()
    for i, seq in enumerate(sequences):
        seq_len = len(seq)
        if seq_len <= max_len:
            padded_sequences[i, :seq_len] = torch.tensor(seq)
        else:
            padded_sequences[i, :] = torch.tensor(seq[:max_len])
    return padded_sequences
 
def train(model, optimizer, train_dataloader):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        print(input_ids.shape)
        print(attention_mask.shape)
        print(labels.shape)
        
        loss = model(input_ids, attention_mask, labels)
        total_loss += loss.item()
 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
 
    avg_train_loss = total_loss / len(train_dataloader)
    return avg_train_loss
 
def evaluate(model, eval_dataloader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for step, batch in enumerate(eval_dataloader):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
 
            loss = model(input_ids, attention_mask, labels)
            total_loss += loss.item()
 
    avg_eval_loss = total_loss / len(eval_dataloader)
    return avg_eval_loss
 
def predict(model, text):
    model.eval()
    tokenized_text = tokenizer.tokenize(text)
    tokenized_text_with_labels = [(token, 'O') for token in tokenized_text]
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokenized_text)])
    attention_mask = torch.ones_like(input_ids)
 
    with torch.no_grad():
        tags = model(input_ids.to(device), attention_mask.to(device))
 
    tag_labels = [id2label[tag] for tag in tags[0]]
    return list(zip(tokenized_text, tag_labels))

In [4]:
# 加载数据集
def load_data():
    file_train = '../dataset/msra_ner/train/part.txt'
    df = pd.read_csv(file_train, sep='\t', nrows=1000, header=None)
    df.columns = ['text', 'labels']
    df['text'] = df['text'].apply(lambda x: x.replace('', ' '))
    df['labels'] = df['labels'].apply(lambda x: x.replace('', ' '))

    return df

df = load_data()

In [5]:
def make_label_id_dict(df):
    labels = [x.split() for x in df.labels.values.tolist()]
    # 标签去重
    unique_labels = set()
    for lb in labels:
        [unique_labels.add(i) for i in lb]
        
    print(unique_labels)
        
    label2id = {v: k for k, v in enumerate(unique_labels)}
    id2label = {k: v for k, v in enumerate(unique_labels)}
    
    return label2id, id2label, unique_labels, labels

label2id, id2label, unique_labels, all_labels = make_label_id_dict(df)

print('-' * 20)
print(label2id)
print('-' * 20)
print(id2label)

{'B-LOC', 'B-ORG', 'I-PER', 'I-ORG', 'B-PER', 'O', 'I-LOC'}
--------------------
{'B-LOC': 0, 'B-ORG': 1, 'I-PER': 2, 'I-ORG': 3, 'B-PER': 4, 'O': 5, 'I-LOC': 6}
--------------------
{0: 'B-LOC', 1: 'B-ORG', 2: 'I-PER', 3: 'I-ORG', 4: 'B-PER', 5: 'O', 6: 'I-LOC'}


In [6]:

# 切分训练集、验证集、测试集
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

train_data = df_train.values.tolist()
test_data = df_test.values.tolist()
val_data = df_val.values.tolist()

 
# 同样地，我们还需要加载验证集和测试集，并将它们转换为模型所需的格式

(800, 2)
(100, 2)
(100, 2)


In [7]:
def makeDataLoader(data):
    # 将数据集转换为模型所需的格式
    train_input_ids = []
    train_attention_masks = []
    train_labels = []
    
    # 对每一个word做label补齐
    for words, label in data:
        try:
            tokenized_text, token_labels = tokenize_and_preserve_labels(words, label)
            input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
            attention_mask = [1] * len(input_ids)
         
            train_input_ids.append(input_ids)
            train_attention_masks.append(attention_mask)
            train_labels.append([label2id[label] for label in token_labels])
        except:
            print(words)
            print(label)
            print(tokenized_text)
            print(token_labels)
            aaa
     
    train_input_ids = pad_sequences(train_input_ids, MAX_LEN)
    train_attention_masks = pad_sequences(train_attention_masks, MAX_LEN)
    train_labels = pad_sequences(train_labels, MAX_LEN, padding_value=-1)
     
    train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, train_labels)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    return train_dataloader


train_dataloader = makeDataLoader(train_data)
test_dataloader = makeDataLoader(test_data)
val_dataloader = makeDataLoader(val_data)

In [8]:
class EntityModel(nn.Module):
    def __init__(self, bert_model, hidden_size, num_tags):
        super(EntityModel, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.bilstm = nn.LSTM(bidirectional=True, input_size=hidden_size, hidden_size=hidden_size // 2, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_tags)
        self.crf = CRF(num_tags, batch_first=True)
 
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        lstm_output, _ = self.bilstm(sequence_output)
        logits = self.fc(lstm_output)
        if labels is not None:
            loss = -self.crf(logits, labels, mask=attention_mask.byte())
            return loss
        else:
            tags = self.crf.decode(logits, mask=attention_mask.byte())
            return tags

In [9]:
# 训练模型
import torch.optim as optim

# 训练模型
model = EntityModel(bert_model, hidden_size=768, num_tags=len(label2id))
model.to(device)
 
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
 
for epoch in range(EPOCHS):
    avg_train_loss = train(model, optimizer, train_dataloader)
    avg_eval_loss = evaluate(model, test_dataloader)
    print(f'Epoch {epoch + 1}: train_loss={avg_train_loss:.4f}, eval_loss={avg_eval_loss:.4f}')
 
# 测试模型
test_sentences = ['今天是个好日子', '我喜欢中国菜', '巴黎是一座美丽的城市']
for sentence in test_sentences:
    tags = predict(model, sentence)
    print(tags)

torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])


  score = torch.where(mask[i].unsqueeze(1), next_score, score)


torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size

KeyboardInterrupt: 