### Explore some Chinese transformer models

##### LTP chinese tokenizer 

In [None]:
#!pip install ltp  ## for chinese tokenization

In [109]:
from ltp import LTP
LTP_RESOURCE='../data/pre_trained_model/ltp/base/'  
#download from here https://pypi.org/project/ltp/
#https://github.com/HIT-SCIR/ltp

In [110]:
ltp = LTP(LTP_RESOURCE) # 默认加载 Small 模型
seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
pos = ltp.pos(hidden)
ner = ltp.ner(hidden)
srl = ltp.srl(hidden)
dep = ltp.dep(hidden)
sdp = ltp.sdp(hidden)

  else:


In [111]:
print('word segments: {}'.format(seg))
print('pos tags: {}'.format(pos))
print('NER: {}'.format(ner))
print('Dependency: {}'.format(dep))

word segments: [['他', '叫', '汤姆', '去', '拿', '外衣', '。']]
pos tags: [['r', 'v', 'nh', 'v', 'v', 'n', 'wp']]
NER: [[('Nh', 2, 2)]]
Dependency: [[(1, 2, 'SBV'), (2, 0, 'HED'), (3, 2, 'DBL'), (4, 2, 'VOB'), (5, 4, 'COO'), (6, 5, 'VOB'), (7, 2, 'WP')]]


#### Some community pretraiend chinese models

In [112]:
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer,BertModel

torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f8851206710>

In [210]:
# Let's load Chinese BERT and other models trained by community 
# https://huggingface.co/hfl
# https://github.com/ymcui/Chinese-BERT-wwm
cn_bert = AutoModel.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
cn_tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")

In [229]:
cn_input = cn_tokenizer(["Hugging Face是一个不错的package",
                        "这是第二个测试句子~~!``fasdfqprulz;n"],
                        add_special_tokens=True, truncation=True, max_length=512, padding=True ) 
## for some reason return_tensor = "pt" does not work
def convert_to_pt(input_dict):
    out = {k:torch.tensor(v) for k,v in input_dict.items()}
    return out 
cn_input = convert_to_pt(cn_input)
print("Single segment token      : {}".format(
    [cn_tokenizer.convert_ids_to_tokens(i) for i in cn_input['input_ids']]
    ))
      
print("Single segment token      : {}".format(cn_input['input_ids']))
print("Single segment type       : {}".format(cn_input['token_type_ids']))
print("Single segment type       : {}".format(cn_input['attention_mask']))
print()

Single segment token      : [['[CLS]', 'hu', '##gg', '##ing', 'face', '是', '一', '个', '不', '错', '的', 'pack', '##age', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['[CLS]', '这', '是', '第', '二', '个', '测试', '句子', '~', '~', '!', '[UNK]', '[UNK]', 'fa', '##sd', '##f', '##q', '##pr', '##ul', '##z', ';', 'n', '[SEP]']]
Single segment token      : tensor([[  101, 12199,  9949,  8221, 10656,  3221,   671,   702,   679,  7231,
          4638, 12736,  9103,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  6821,  3221,  5018,   753,   702, 21128, 21129,   172,   172,
           106,   100,   100, 12289, 10117,  8189,  8326, 11426, 10086,  8253,
           132,   156,   102]])
Single segment type       : tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Single segment type       : tensor([[1, 1, 1, 1

In [212]:
output, pooled = cn_bert(**cn_input)

print("Token wise output: {}, Pooled output: {}".format(output.shape, pooled.shape))

Token wise output: torch.Size([2, 25, 1024]), Pooled output: torch.Size([2, 1024])


## How to add tokens 

- look at documentation here 
https://huggingface.co/transformers/internal/tokenization_utils.html?highlight=add_token#transformers.tokenization_utils_base.SpecialTokensMixin.add_tokens

In [226]:
## load tokenizer and add new vocabulary
cn_tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
print('Original length of tokenizer: {}'.format(len(cn_tokenizer)))
num_added_toks = cn_tokenizer.add_tokens(new_tokens=['测试','句子','随便一个','什么东西'],special_tokens=False)
print('We have added', num_added_toks, 'tokens')
print('after adding, length of tokenizer: {}'.format(len(cn_tokenizer)))

Original length of tokenizer: 21128
We have added 4 tokens
after adding, length of tokenizer: 21132


In [227]:
# afterward we need to resize our model to reflect that 
# Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
cn_bert.resize_token_embeddings(len(tokenizer))

Embedding(30522, 1024)

- Run the same example with update tokenizer and model 

In [230]:
cn_input = cn_tokenizer(["Hugging Face是一个不错的package",
                        "这是第二个测试句子~~!``fasdfqprulz;n"],
                        add_special_tokens=True, truncation=True, max_length=512, padding=True ) 
## for some reason return_tensor = "pt" does not work
def convert_to_pt(input_dict):
    out = {k:torch.tensor(v) for k,v in input_dict.items()}
    return out 
cn_input = convert_to_pt(cn_input)
print("Single segment token      : {}".format(
    [cn_tokenizer.convert_ids_to_tokens(i) for i in cn_input['input_ids']]
    ))
      
print("Single segment token      : {}".format(cn_input['input_ids']))
print("Single segment type       : {}".format(cn_input['token_type_ids']))
print("Single segment type       : {}".format(cn_input['attention_mask']))
print()

Single segment token      : [['[CLS]', 'hu', '##gg', '##ing', 'face', '是', '一', '个', '不', '错', '的', 'pack', '##age', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['[CLS]', '这', '是', '第', '二', '个', '测试', '句子', '~', '~', '!', '[UNK]', '[UNK]', 'fa', '##sd', '##f', '##q', '##pr', '##ul', '##z', ';', 'n', '[SEP]']]
Single segment token      : tensor([[  101, 12199,  9949,  8221, 10656,  3221,   671,   702,   679,  7231,
          4638, 12736,  9103,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  6821,  3221,  5018,   753,   702, 21128, 21129,   172,   172,
           106,   100,   100, 12289, 10117,  8189,  8326, 11426, 10086,  8253,
           132,   156,   102]])
Single segment type       : tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Single segment type       : tensor([[1, 1, 1, 1

In [231]:
#cn_tokenizer.save_pretrained('.')
#cn_tokenizer.from_pretrained('.')