## 微博命名实体识别

In [1]:
# Load the Weibo NER corpus through fastNLP's pipe and preview two training instances.
from fastNLP.io import WeiboNERPipe

# Called without a path, as in the original cell.
data_bundle = WeiboNERPipe().process_from_file()

# Slice the first two rows of the train split and print them as a table.
train_preview = data_bundle.get_dataset('train')[:2]
print(train_preview)

100%|██████████| 188k/188k [00:00<00:00, 4.83MB/s]


http://212.129.155.247/dataset/weibo_NER.zip not found in cache, downloading to /tmp/tmpmstc2nws
Finish download from http://212.129.155.247/dataset/weibo_NER.zip
Copy file to /root/.fastNLP/dataset/weibo_NER
+---------------------+--------------------+--------------------+---------+
| raw_chars           | target             | chars              | seq_len |
+---------------------+--------------------+--------------------+---------+
| ['科', '技', '全... | [0, 0, 0, 0, 0,... | [792, 1015, 156... | 26      |
| ['对', '，', '输... | [0, 0, 0, 0, 0,... | [123, 2, 1205, ... | 15      |
+---------------------+--------------------+--------------------+---------+


### 模型构建
- embedding 选择 word2vec（预训练的静态字向量 `cn-char-fastnlp-100d`）
- 模型选择 BiLSTMCRF

In [3]:
# Embedding type: word2vec-style static vectors are used here,
# loaded by name as 'cn-char-fastnlp-100d'.
from fastNLP.embeddings import StaticEmbedding

# The embedding is indexed by the character vocabulary built by the pipe.
char_vocab = data_bundle.get_vocab('chars')
embed = StaticEmbedding(
    vocab=char_vocab,
    model_dir_or_name='cn-char-fastnlp-100d',
)

 22%|██▏       | 819k/3.70M [00:00<00:00, 8.13MB/s]

http://212.129.155.247/embedding/cn_char_fastnlp_100d.zip not found in cache, downloading to /tmp/tmpmtdi75t_


100%|██████████| 3.70M/3.70M [00:00<00:00, 10.4MB/s]


Finish download from http://212.129.155.247/embedding/cn_char_fastnlp_100d.zip
Copy file to /root/.fastNLP/embedding/cn_char_fastnlp_100d
Found 3321 out of 3471 words in the pre-training embedding.




In [4]:
# Use a BiLSTM-CRF model for named entity recognition.
from fastNLP.models import BiLSTMCRF

# BiLSTMCRF's forward accepts `words`, not `chars`, so the column has to be renamed.
data_bundle.rename_field('chars', 'words')

# Fetch the tag vocabulary once; it supplies both the class count and the vocab argument.
target_vocab = data_bundle.get_vocab('target')
model = BiLSTMCRF(
    embed=embed,
    num_classes=len(target_vocab),
    num_layers=1,
    hidden_size=200,
    dropout=0.5,
    target_vocab=target_vocab,
)

### 进行训练
- 选择评估模型的metric
- 选择优化器

In [6]:
# Training ingredients: loss, optimizer and the evaluation metric.
from torch.optim import Adam
from fastNLP import LossInForward, SpanFPreRecMetric

# NOTE(review): LossInForward presumably picks up the loss computed inside the
# model's forward pass - confirm against the fastNLP docs.
loss = LossInForward()

# Adam over all model parameters with a learning rate of 1e-2.
optimizer = Adam(model.parameters(), lr=1e-2)

# Span-level F1 / precision / recall, decoded with the target tag vocabulary.
metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))

In [8]:
# Train with fastNLP's Trainer, evaluating on the dev split.
import torch
from fastNLP import Trainer

# Use GPU index 0 when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

train_set = data_bundle.get_dataset('train')
dev_set = data_bundle.get_dataset('dev')

trainer = Trainer(
    train_set,
    model,
    loss=loss,
    optimizer=optimizer,
    dev_data=dev_set,
    metrics=metric,
    device=device,
)
# Left as the last expression so the notebook displays the returned summary.
trainer.train()

input fields after batch(if batch size is 2):
	target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) 
	seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 
	words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) 
target fields after batch(if batch size is 2):
	target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) 
	seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 

training epochs started 2020-09-07-01-51-40-217911


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=430.0), HTML(value='')), layout=Layout(di…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.48 seconds!
Evaluation on dev at Epoch 1/10. Step:43/430: 
SpanFPreRecMetric: f=0.262877, pre=0.425287, rec=0.190231



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.48 seconds!
Evaluation on dev at Epoch 2/10. Step:86/430: 
SpanFPreRecMetric: f=0.393162, pre=0.586735, rec=0.29563



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.49 seconds!
Evaluation on dev at Epoch 3/10. Step:129/430: 
SpanFPreRecMetric: f=0.438849, pre=0.730539, rec=0.313625



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.47 seconds!
Evaluation on dev at Epoch 4/10. Step:172/430: 
SpanFPreRecMetric: f=0.507331, pre=0.590444, rec=0.44473



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.46 seconds!
Evaluation on dev at Epoch 5/10. Step:215/430: 
SpanFPreRecMetric: f=0.520059, pre=0.616197, rec=0.449871



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.5 seconds!
Evaluation on dev at Epoch 6/10. Step:258/430: 
SpanFPreRecMetric: f=0.47138, pre=0.682927, rec=0.359897



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.46 seconds!
Evaluation on dev at Epoch 7/10. Step:301/430: 
SpanFPreRecMetric: f=0.460751, pre=0.685279, rec=0.347044



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.47 seconds!
Evaluation on dev at Epoch 8/10. Step:344/430: 
SpanFPreRecMetric: f=0.491551, pre=0.610687, rec=0.411311



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.42 seconds!
Evaluation on dev at Epoch 9/10. Step:387/430: 
SpanFPreRecMetric: f=0.477093, pre=0.618852, rec=0.388175



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.41 seconds!
Evaluation on dev at Epoch 10/10. Step:430/430: 
SpanFPreRecMetric: f=0.486656, pre=0.625, rec=0.398458

Reloaded the best model.

In Epoch:5/Step:215, got best dev performance:
SpanFPreRecMetric: f=0.520059, pre=0.616197, rec=0.449871


{'best_eval': {'SpanFPreRecMetric': {'f': 0.520059,
   'pre': 0.616197,
   'rec': 0.449871}},
 'best_epoch': 5,
 'best_step': 215,
 'seconds': 67.04}

In [9]:
# Score the trained model on the held-out test split with the same span metric.
from fastNLP import Tester

test_set = data_bundle.get_dataset('test')
tester = Tester(test_set, model, metrics=metric)
# Left as the last expression so the notebook displays the metric dict.
tester.test()

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=17.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.7 seconds!
[tester] 
SpanFPreRecMetric: f=0.452599, pre=0.627119, rec=0.354067


{'SpanFPreRecMetric': {'f': 0.452599, 'pre': 0.627119, 'rec': 0.354067}}

## 使用BERT 做embedding

In [None]:
# End-to-end run with a BERT embedding in place of the static character vectors:
# reload the data, build the model, train it, then score the test split.
import torch
from torch.optim import Adam

from fastNLP import LossInForward, SpanFPreRecMetric, Tester, Trainer
from fastNLP.embeddings import BertEmbedding
from fastNLP.io import WeiboNERPipe
from fastNLP.models import BiLSTMCRF

# Reload the corpus so this cell does not depend on the state left by earlier cells.
data_bundle = WeiboNERPipe().process_from_file()
# Rename the character column to 'words'; the vocabulary is looked up under that name below.
data_bundle.rename_field('chars', 'words')

embed = BertEmbedding(vocab=data_bundle.get_vocab('words'), model_dir_or_name='cn')
model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1,
                  target_vocab=data_bundle.get_vocab('target'))

metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
# Much smaller learning rate than the 1e-2 used with the static embedding above.
optimizer = Adam(model.parameters(), lr=2e-5)
loss = LossInForward()

# Use GPU index 0 when CUDA is available, otherwise stay on the CPU.
device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss,
                  optimizer=optimizer, batch_size=6,
                  dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device)
trainer.train()

# Pass the `metric` object created above. No name `metrics` is defined anywhere in
# this notebook, so the previous `metrics=metrics` raised NameError after training.
tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
tester.test()

loading vocabulary file /root/.fastNLP/embedding/bert-chinese-wwm/vocab.txt
Load pre-trained BERT parameters from file /root/.fastNLP/embedding/bert-chinese-wwm/chinese_wwm_pytorch.bin.
Start to generate word pieces for word.
87 words are unsegmented. Among them, 51 added to the BPE vocab.
input fields after batch(if batch size is 2):
	target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) 
	seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 
	words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) 
target fields after batch(if batch size is 2):
	target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) 
	seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 

training epochs started 2020-09-07-02-02-14-850967


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=2250.0), HTML(value='')), layout=Layout(d…