In [1]:
import pandas as pd
from transformers import BertTokenizerFast,Pipeline
from torch.utils.data import DataLoader,Dataset
import tqdm
from torch.optim import SGD

In [19]:
df = pd.read_csv('train.txt',sep=' ',header=None).dropna()
df.head(20)

Unnamed: 0,0,1,2,3
0,-DOCSTART-,-X-,-X-,O
1,EU,NNP,B-NP,B-ORG
2,rejects,VBZ,B-VP,O
3,German,JJ,B-NP,B-MISC
4,call,NN,I-NP,O
5,to,TO,B-VP,O
6,boycott,VB,I-VP,O
7,British,JJ,B-NP,B-MISC
8,lamb,NN,I-NP,O
9,.,.,O,O


In [20]:
# 每行的第一项是单词，第二项是词性 (POS) 标记，第三项是句法块标记，第四项是命名实体标记
df.shape

(202383, 4)

In [21]:
# 获取单词
sentence = df.iloc[:,0]
sentence

0         -DOCSTART-
1                 EU
2            rejects
3             German
4               call
             ...    
204562         three
204563       Swansea
204564             1
204565       Lincoln
204566             2
Name: 0, Length: 202383, dtype: object

In [22]:
sentence.shape

(202383,)

In [23]:
# 获取命名实体标记
labels = df.iloc[:,-1]
labels

0              O
1          B-ORG
2              O
3         B-MISC
4              O
           ...  
204562         O
204563     B-ORG
204564         O
204565     B-ORG
204566         O
Name: 3, Length: 202383, dtype: object

In [24]:
labels.shape

(202383,)

In [25]:
unique_labels = set(labels)
unique_labels

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

In [26]:
# 将每个标签映射到它的id表示，反之亦然
labels_to_ids = {k: v for v, k in enumerate(sorted(list(unique_labels)))}
ids_to_labels = {v: k for v, k in enumerate(sorted(list(unique_labels)))}
print(ids_to_labels)

{0: 'B-LOC', 1: 'B-MISC', 2: 'B-ORG', 3: 'B-PER', 4: 'I-LOC', 5: 'I-MISC', 6: 'I-ORG', 7: 'I-PER', 8: 'O'}


In [135]:
# 合并成句子
sent = []
sent_temp = ['[CLS]']
labs = []
labs_temp = ['[CLS]']
for sett,label in zip(sentence,labels):
    if sett !=".":
        sent_temp.append(sett)
        labs_temp.append(label)
    else:
        sent_temp.append(sett)
        sent_temp.append('[SEP]')
        sent.append(sent_temp)

        labs_temp.append(label)
        labs_temp.append('[SEP]')
        labs.append(labs_temp)
        sent_temp = ['[CLS]']
        labs_temp = ['[CLS]']


In [136]:
str(sent[1])

"['[CLS]', 'Peter', 'Blackburn', 'BRUSSELS', '1996-08-22', 'The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.', '[SEP]']"

In [137]:
labs[0]

['[CLS]',
 'O',
 'B-ORG',
 'O',
 'B-MISC',
 'O',
 'O',
 'O',
 'B-MISC',
 'O',
 'O',
 '[SEP]']

In [138]:
# sentence = pd.DataFrame(sent)
# labels = pd.DataFrame(labs)

In [27]:
# 初始化分词器
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [140]:
# tokens = sentence.to_numpy(sentence)
# tokens = tokens.tolist()


In [28]:
text_tokenized = tokenizer(sent, padding='max_length',
                           max_length=128, truncation=True,
                           return_tensors="pt" ,is_split_into_words=True)

NameError: name 'sent' is not defined

In [142]:
label_tokenized = tokenizer(labs, padding='max_length',max_length=512, truncation=True,return_tensors="pt",is_split_into_words=True)

In [143]:
text_tokenized["input_ids"]

tensor([[  101,   101,   118,  ...,     0,     0,     0],
        [  101,   101,  1943,  ...,     0,     0,     0],
        [  101,   101,  1860,  ...,     0,     0,     0],
        ...,
        [  101,   101,   146,  ...,     0,     0,     0],
        [  101,   101, 24819,  ...,     0,     0,     0],
        [  101,   101,   118,  ...,     0,     0,     0]])

In [144]:
label_tokenized["input_ids"]

tensor([[101, 101, 152,  ...,   0,   0,   0],
        [101, 101, 139,  ...,   0,   0,   0],
        [101, 101, 139,  ...,   0,   0,   0],
        ...,
        [101, 101, 139,  ...,   0,   0,   0],
        [101, 101, 139,  ..., 118, 153, 102],
        [101, 101, 152,  ...,   0,   0,   0]])

In [145]:
## 因为序列长度不再匹配原始标签的长度，因此这是在Tokenization之后需要做的一个步骤--调整标签长度
print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][1]))

['[CLS]', '[CLS]', 'Peter', 'Blackburn', 'BR', '##US', '##SE', '##LS', '1996', '-', '08', '-', '22', 'The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 's', '##hun', 'British', 'la', '##mb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.', '[SEP]', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]

In [146]:
# 获取token与原始词的对其信息
word_ids = text_tokenized.word_ids()
print(word_ids)

[None, 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None

In [147]:
print(text_tokenized["input_ids"].shape)

torch.Size([7374, 512])


In [148]:
# 标签对齐


In [29]:
import torch
def align_label(texts, labels):
    # 首先tokenizer输入文本
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=128, truncation=True)
  # 获取word_ids
    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []
    # 采用上述的第一中方法来调整标签，使得标签与输入数据对其。
    for word_idx in word_ids:
        # 如果token不在word_ids内，则用 “-100” 填充
        if word_idx is None:
            label_ids.append(-100)
        # 如果token在word_ids内，且word_idx不为None，则从labels_to_ids获取label id
        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        # 如果token在word_ids内，且word_idx为None
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids
# 构建自己的数据集类
class DataSequence(torch.utils.data.Dataset):
    def __init__(self, df):
        # 根据空格拆分labels
        lb = [i.split() for i in labels.values.tolist()]
        # tokenizer 向量化文本
        txt = sentence.values.tolist()
        self.texts = [tokenizer(str(i),
                               padding='max_length', max_length = 128,
                                truncation=True, return_tensors="pt") for i in txt]
        # 对齐标签
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

    def __len__(self):
        return len(self.labels)

    def get_batch_data(self, idx):
        return self.texts[idx]

    def get_batch_labels(self, idx):
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)
        return batch_data, batch_labels

In [30]:
import numpy as np
df = df[0:1000]
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

  return bound(*args, **kwds)


In [31]:
from transformers import BertForTokenClassification
class BertModel(torch.nn.Module):
    def __init__(self):
        super(BertModel, self).__init__()
        self.bert = BertForTokenClassification.from_pretrained(
                       'bert-base-cased',
                                     num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):
        output = self.bert(input_ids=input_id, attention_mask=mask,
                           labels=label, return_dict=False)
        return output

In [32]:
def train_loop(model, df_train, df_val):
    # 定义训练和验证集数据
    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)
    # 批量获取训练和验证集数据
    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=1, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=1)
    # 判断是否使用GPU，如果有，尽量使用，可以加快训练速度
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # 定义优化器
    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)
    if use_cuda:
        model = model.cuda()
    # 开始训练循环
    best_acc = 0
    best_loss = 1000
    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0
        # 训练模型
        model.train()
        # 按批量循环训练模型
        for train_data, train_label in tqdm(train_dataloader):
      # 从train_data中获取mask和input_id
            train_label = train_label[0].to(device)
            mask = train_data['attention_mask'][0].to(device)
            input_id = train_data['input_ids'][0].to(device)
            # 梯度清零！！
            optimizer.zero_grad()
            # 输入模型训练结果：损失及分类概率
            loss, logits = model(input_id, mask, train_label)
            # 过滤掉特殊token及padding的token
            logits_clean = logits[0][train_label != -100]
            label_clean = train_label[train_label != -100]
            # 获取最大概率值
            predictions = logits_clean.argmax(dim=1)
      # 计算准确率
            acc = (predictions == label_clean).float().mean()
            total_acc_train += acc
            total_loss_train += loss.item()
      # 反向传递
            loss.backward()
            # 参数更新
            optimizer.step()
        # 模型评估
        model.eval()

        total_acc_val = 0
        total_loss_val = 0
        for val_data, val_label in val_dataloader:
      # 批量获取验证数据
            val_label = val_label[0].to(device)
            mask = val_data['attention_mask'][0].to(device)
            input_id = val_data['input_ids'][0].to(device)
      # 输出模型预测结果
            loss, logits = model(input_id, mask, val_label)
      # 清楚无效token对应的结果
            logits_clean = logits[0][val_label != -100]
            label_clean = val_label[val_label != -100]
            # 获取概率值最大的预测
            predictions = logits_clean.argmax(dim=1)
            # 计算精度
            acc = (predictions == label_clean).float().mean()
            total_acc_val += acc
            total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'''Epochs: {epoch_num + 1} |
                Loss: {total_loss_train / len(df_train): .3f} |
                Accuracy: {total_acc_train / len(df_train): .3f} |
                Val_Loss: {total_loss_val / len(df_val): .3f} |
                Accuracy: {total_acc_val / len(df_val): .3f}''')

LEARNING_RATE = 1e-2
EPOCHS = 5
model = BertModel()
train_loop(model, df_train, df_val)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`attention_mask` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [33]:
# 评估模型
def evaluate(model, df_test):
    # 定义测试数据
    test_dataset = DataSequence(df_test)
    # 批量获取测试数据
    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)
   # 使用GPU
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    total_acc_test = 0.0
    for test_data, test_label in test_dataloader:
        test_label = test_label[0].to(device)
        mask = test_data['attention_mask'][0].to(device)
        input_id = test_data['input_ids'][0].to(device)

        loss, logits = model(input_id, mask, test_label.long())
        logits_clean = logits[0][test_label != -100]
        label_clean = test_label[test_label != -100]
        predictions = logits_clean.argmax(dim=1)
        acc = (predictions == label_clean).float().mean()
        total_acc_test += acc
    val_accuracy = total_acc_test / len(df_test)
    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')

evaluate(model, df_test)

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`attention_mask` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)


ner_results = nlp(tokens)
print(ner_results)

In [None]:
ner_results