## Tokenization

In [11]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, AutoTokenizer
import torch

In [12]:
PRE_TRAINED_MODEL_NAME = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=624.0), HTML(value='')))




In [13]:
example = "合作委员会"
tokens = tokenizer.tokenize(example)

In [14]:
tokens

['合', '作', '委', '员', '会']

# CRF

In [15]:
def char2features(word, i):
    '''
    word: whole word
    i: index of character
    
    return features
        where features is a dictionary containing:
            char: target character
    '''
    features = {
        'bias': 1.0,
        'char': word[i],
        'i': i,
        'word length': len(word)
    }
    
    # previous char info
    if i > 0:
        features['Prev'] = word[i-1]
        if i > 1:
            features['Prev2'] = word[i-2]
        else:
            features['Prev2'] = "None"
    else:
        features.update({
            'Prev': "None",
            'Prev2': "None"
        })
    
    # post char info
    if i < len(word)-1:
        features['Post'] = word[-1]
        if i < len(word)-2:
            features['Post2'] = word[-2]
        else:
            features['Post2'] = "None"
    else:
        features.update({
            'Post': "None",
            'Post2': "None"
        })
        
    return features

In [16]:
char2features(example, 2)

{'bias': 1.0,
 'char': '委',
 'i': 2,
 'word length': 5,
 'Prev': '作',
 'Prev2': '合',
 'Post': '会',
 'Post2': '员'}

In [17]:
def word2features(word):
    return [char2features(word, i) for i in range(len(word))]

## Creating dataset

In [37]:
import pickle
cleaned_AbbOri = pickle.load(open('Cleaned_AbbOri_tr.p','rb'))
Tagged_Abb = pickle.load(open('Tagged_tr.p','rb'))

In [38]:
cleaned_AbbOri[0:5]

[['史地', '历史和地理'],
 ['正选', '正式选举'],
 ['营运', '营业运行'],
 ['n', '尼亚加拉瀑布'],
 ['粮播', '粮食播种']]

In [39]:
Tagged_Abb[0:5]

[[('历', 'N'), ('史', 'A'), ('和', 'N'), ('地', 'A'), ('理', 'N')],
 [('正', 'A'), ('式', 'N'), ('选', 'A'), ('举', 'N')],
 [('营', 'A'), ('业', 'N'), ('运', 'A'), ('行', 'N')],
 [('尼', 'N'), ('亚', 'N'), ('加', 'N'), ('拉', 'N'), ('瀑', 'N'), ('布', 'N')],
 [('粮', 'A'), ('食', 'N'), ('播', 'A'), ('种', 'N')]]

In [40]:
x_train = [ word2features(word) for _, word in cleaned_AbbOri]

In [48]:
x_train[0]

[{'bias': 1.0,
  'char': '历',
  'i': 0,
  'word length': 5,
  'Prev': 'None',
  'Prev2': 'None',
  'Post': '理',
  'Post2': '地'},
 {'bias': 1.0,
  'char': '史',
  'i': 1,
  'word length': 5,
  'Prev': '历',
  'Prev2': 'None',
  'Post': '理',
  'Post2': '地'},
 {'bias': 1.0,
  'char': '和',
  'i': 2,
  'word length': 5,
  'Prev': '史',
  'Prev2': '历',
  'Post': '理',
  'Post2': '地'},
 {'bias': 1.0,
  'char': '地',
  'i': 3,
  'word length': 5,
  'Prev': '和',
  'Prev2': '史',
  'Post': '理',
  'Post2': 'None'},
 {'bias': 1.0,
  'char': '理',
  'i': 4,
  'word length': 5,
  'Prev': '地',
  'Prev2': '和',
  'Post': 'None',
  'Post2': 'None'}]

In [42]:
def word2label(word):
    return [l for _, l in word]

In [49]:
y_train = [word2label(word) for word in Tagged_Abb]
y_train[0]

['N', 'A', 'N', 'A', 'N']

In [44]:
cleaned_AbbOri_te = pickle.load(open('Cleaned_AbbOri_te.p','rb'))
Tagged_Abb_te = pickle.load(open('Tagged_te.p','rb'))
x_test = [word2features(word) for _, word in cleaned_AbbOri_te]
y_test = [word2label(word) for word in Tagged_Abb_te]

In [45]:
# x_test[0]

In [46]:
# y_test[0]

In [47]:
# Tagged_Abb_te[0]

In [59]:
pickle.dump(x_train, open("x_train_crf.p", "wb"))
pickle.dump(y_train, open("y_train_crf.p", "wb"))
pickle.dump(x_test, open("x_test_crf.p", "wb"))
pickle.dump(y_test, open("y_test_crf.p", "wb"))

# Training and Evaluate

In [52]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(x_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [54]:
labels = list(crf.classes_)
labels

['N', 'A']

In [55]:
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [60]:
y_pred = crf.predict(x_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.8235866843157867

In [57]:
y_pred

[['A', 'A', 'A', 'N'],
 ['A', 'N', 'N', 'N', 'N', 'N'],
 ['A', 'N', 'N', 'N', 'A'],
 ['A', 'N', 'A', 'N'],
 ['A', 'N', 'A', 'N', 'A', 'N', 'A', 'N', 'N'],
 ['A', 'N', 'A', 'N', 'A', 'N', 'N'],
 ['A', 'N', 'A', 'N', 'A'],
 ['N', 'N', 'N', 'N', 'N'],
 ['A', 'N', 'N', 'A', 'N', 'N', 'A'],
 ['N', 'A', 'A', 'N'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'],
 ['N', 'N', 'N', 'N', 'N'],
 ['A', 'N', 'A', 'N'],
 ['A', 'N', 'N', 'A', 'N', 'A', 'N'],
 ['N', 'N', 'N', 'N', 'N', 'N'],
 ['N', 'N', 'N', 'N', 'A'],
 ['N', 'N', 'N', 'N', 'N'],
 ['A', 'N', 'A', 'N', 'N'],
 ['A', 'N', 'A', 'N'],
 ['A', 'N', 'A', 'N', 'A', 'N'],
 ['A', 'N', 'A', 'N', 'A', 'N', 'A', 'N'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N'],
 ['A', 'A', 'N', 'N'],
 ['A', 'N', 'A', 'N', 'A', 'N'],
 ['A', 'N', 'N', 'A'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N'],
 ['A', 'N', 'A', 'N', 'A', 'N'],
 ['A', 'N', 'N', 'A', 'N'],
 ['A', 'N', 'A', 'N', 'A', 'N', 'N', 'N', 'A', 'N', 'N'],
 ['A', 'N', 'A', 'N', 'A'

In [61]:
y_test

[['N', 'A', 'A', 'N'],
 ['N', 'N', 'N', 'N', 'N', 'N'],
 ['N', 'N', 'N', 'N', 'N'],
 ['A', 'N', 'A', 'N'],
 ['A', 'N', 'A', 'N', 'A', 'N', 'A', 'N', 'A'],
 ['A', 'A', 'N', 'N', 'A', 'N', 'N'],
 ['A', 'N', 'A', 'N', 'A'],
 ['N', 'N', 'N', 'N', 'N'],
 ['A', 'N', 'N', 'A', 'N', 'N', 'A'],
 ['N', 'A', 'A', 'N'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'],
 ['N', 'N', 'N', 'N', 'N'],
 ['A', 'N', 'A', 'N'],
 ['A', 'N', 'N', 'N', 'N', 'A', 'N'],
 ['N', 'N', 'N', 'N', 'N', 'N'],
 ['A', 'N', 'A', 'N', 'A'],
 ['N', 'N', 'N', 'N', 'N'],
 ['A', 'A', 'A', 'N', 'N'],
 ['N', 'A', 'A', 'N'],
 ['A', 'N', 'A', 'N', 'A', 'N'],
 ['A', 'N', 'A', 'N', 'A', 'N', 'A', 'N'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N'],
 ['A', 'N', 'N', 'A'],
 ['A', 'N', 'A', 'N', 'A', 'N'],
 ['A', 'N', 'A', 'N'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N'],
 ['A', 'A', 'A', 'N', 'A', 'N'],
 ['A', 'N', 'N', 'A', 'N'],
 ['N', 'N', 'A', 'A', 'N', 'N', 'N', 'N', 'A', 'N', 'N'],
 ['A', 'N', 'A', 'N', 'A'

In [62]:
y_pred = crf.predict(x_train)
metrics.flat_f1_score(y_train, y_pred,
                      average='weighted', labels=labels)

0.8878108797952919