## Tokenization

In [1]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, AutoTokenizer
import torch

In [2]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
PRE_TRAINED_MODEL_NAME = "bert-base-chinese"
# AutoTokenizer picks the tokenizer class that matches the named checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [3]:
# Sample phrase used to compare tokenizers.
example = "合作委员会"
# The BERT tokenizer splits this phrase into single characters (see the output below).
tokens = tokenizer.tokenize(example)

In [4]:
tokens

['合', '作', '委', '员', '会']

In [5]:
import pynlpir as pp
# Initialise the NLPIR backend before any segmentation call.
pp.open()
example = "合作委员会"

# Unlike the character-level BERT tokens above, pp.segment returns word-level
# (segment, part-of-speech) pairs.
pp.segment(example)

[('合作', 'verb'), ('委员会', 'noun')]

In [6]:
# Sample phrase used to prototype the segment-offset bookkeeping below.
word = '历史和地理'

In [7]:
# Segment the sample phrase into (segment, part-of-speech) pairs and display them.
segmented = pp.segment(word)
segmented

[('历史', 'noun'), ('和', 'conjunction'), ('地理', 'noun')]

In [8]:
# Prototype of the span bookkeeping: for every segment, record where it starts
# ('ini') and where it ends ('end', exclusive) inside the unsegmented string.
seg_ind = {}
offset = 0
for i, seg in enumerate(segmented):
    print(i)
    next_offset = offset + len(seg[0])
    seg_ind[i] = {'ini': offset, 'end': next_offset}
    offset = next_offset
seg_ind

0
1
2


{0: {'ini': 0, 'end': 2}, 1: {'ini': 2, 'end': 3}, 2: {'ini': 3, 'end': 5}}

# CRF

In [9]:
def seg2dict(segmented):
    '''
    Map each segment's position to its character span in the original string.

    segmented: list of entries whose first element is the segment text
        (e.g. the (segment, POS) pairs returned by pp.segment)

    return seg_ind
        where seg_ind is a dictionary {index: {'ini': start, 'end': stop}};
        spans are contiguous, the first one starts at 0 and 'end' is exclusive
    '''
    spans = {}
    cursor = 0
    for idx, seg in enumerate(segmented):
        stop = cursor + len(seg[0])
        spans[idx] = {'ini': cursor, 'end': stop}
        cursor = stop
    return spans

In [10]:
def char2features(word, i, segmented=None):
    '''
    Build the CRF feature dictionary for one character of a word.

    word: whole word
    i: index of character
    segmented: optional pre-computed segmentation of `word`, as a list of
        (segment, POS) pairs. When omitted, pp.segment(word) is called.
        Passing it lets a caller segment once per word instead of once
        per character.

    return features
        where features is a dictionary containing:
            bias: constant 1.0
            char: target character
            i: index of the character in the word
            word length: number of characters in the word
            segment: text of the segment that contains the character
            position in seg: offset of the character inside that segment
            POS of seg: part-of-speech tag of that segment
            Prev / Prev2: character one / two positions before, or "None"
            Post / Post2: character one / two positions after, or "None"
    '''
    if segmented is None:
        segmented = pp.segment(word)

    # Walk the segments with a running start offset until one covers index i.
    seg_start = 0
    for seg in segmented:
        seg_end = seg_start + len(seg[0])
        if i < seg_end:
            seg_word = seg[0]
            posInSeg = i - seg_start
            POS_seg = seg[1]
            break
        seg_start = seg_end
    else:
        # No segment covers index i: either the segmentation is shorter than
        # the word or it is empty. Treat the uncovered tail of the word as one
        # extra segment tagged "noun". Starting seg_start at 0 keeps this
        # branch well-defined for an empty segmentation, which previously
        # raised an UnboundLocalError.
        seg_word = word[seg_start:]
        posInSeg = i - seg_start
        POS_seg = "noun"

    n_chars = len(word)
    features = {
        'bias': 1.0,
        'char': word[i],
        'i': i,
        'word length': n_chars,
        'segment': seg_word,
        'position in seg': posInSeg,
        'POS of seg': POS_seg
    }

    # previous char info ("None" marks a position before the start of the word)
    features['Prev'] = word[i-1] if i > 0 else "None"
    features['Prev2'] = word[i-2] if i > 1 else "None"

    # post char info ("None" marks a position past the end of the word)
    features['Post'] = word[i+1] if i < n_chars-1 else "None"
    features['Post2'] = word[i+2] if i < n_chars-2 else "None"

    return features

In [11]:
# char2features(example, 2)

In [12]:
def word2features(word):
    '''
    Build the feature sequence of a word.

    word: whole word

    return a list with one char2features dictionary per character,
        in character order
    '''
    per_char = []
    for idx in range(len(word)):
        per_char.append(char2features(word, idx))
    return per_char

## Creating dataset

In [13]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
# Context managers make sure each file handle is closed after loading.
with open('Cleaned_AbbOri_tr.p', 'rb') as f:
    # Training pairs; the preview below shows [abbreviation, original] entries.
    cleaned_AbbOri = pickle.load(f)
with open('Tagged_tr.p', 'rb') as f:
    # Training tag sequences, one per word; presumably aligned with cleaned_AbbOri — TODO confirm.
    Tagged_Abb = pickle.load(f)

In [14]:
cleaned_AbbOri[0:5]

[['史地', '历史和地理'],
 ['正选', '正式选举'],
 ['营运', '营业运行'],
 ['n', '尼亚加拉瀑布'],
 ['粮播', '粮食播种']]

In [15]:
# Tagged_Abb[0:5]

In [16]:
# Each entry is an [abbreviation, original] pair; features are built from the
# original (full) form only, one feature dictionary per character.
x_train = [ word2features(word) for _, word in cleaned_AbbOri]

In [17]:
# x_train[0]

In [18]:
def word2label(word):
    '''
    Extract the label sequence of a tagged word.

    word: sequence of two-element entries whose second element is the label

    return the list of labels, in order
    '''
    labels_out = []
    for pair in word:
        _, tag = pair
        labels_out.append(tag)
    return labels_out

In [19]:
# One label sequence per tagged training word; show the first one as a sanity check.
y_train = [word2label(word) for word in Tagged_Abb]
y_train[0]

['N', 'A', 'N', 'A', 'N']

In [20]:
# Load the held-out split and build its features/labels the same way as for training.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
# Context managers make sure each file handle is closed after loading.
with open('Cleaned_AbbOri_te.p', 'rb') as f:
    cleaned_AbbOri_te = pickle.load(f)
with open('Tagged_te.p', 'rb') as f:
    Tagged_Abb_te = pickle.load(f)
x_test = [word2features(word) for _, word in cleaned_AbbOri_te]
y_test = [word2label(word) for word in Tagged_Abb_te]

In [21]:
# x_test[0]

In [22]:
# y_test[0]

In [23]:
# Tagged_Abb_te[0]

In [24]:
# Persist the CRF inputs and labels. The context managers close each file
# deterministically, so the pickled data is flushed to disk before the cell ends.
with open("x_train_crf.p", "wb") as f:
    pickle.dump(x_train, f)
with open("y_train_crf.p", "wb") as f:
    pickle.dump(y_train, f)
with open("x_test_crf.p", "wb") as f:
    pickle.dump(x_test, f)
with open("y_test_crf.p", "wb") as f:
    pickle.dump(y_test, f)

# Training and Evaluation

In [25]:
import sklearn_crfsuite

# Linear-chain CRF trained with L-BFGS.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient
    c2=0.1,  # L2 regularization coefficient
    max_iterations=100,  # cap on optimizer iterations
    all_possible_transitions=True  # also generate label transitions not seen in the training data
)
# Fit on the per-character feature dictionaries and their label sequences.
crf.fit(x_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [26]:
# Labels seen by the fitted CRF; passed to the metric calls below.
labels = list(crf.classes_)
labels

['N', 'A']

In [27]:
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [28]:
# Held-out performance: weighted F1 over all characters of all test words
# (the flat_* metrics flatten the per-word label sequences before scoring).
y_pred = crf.predict(x_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.8295970856006005

In [29]:
# y_pred

In [30]:
# y_test

In [31]:
# Training-set performance, for comparison with the test score above.
# NOTE(review): this rebinds y_pred — after this cell it holds the training
# predictions, no longer the test predictions.
y_pred = crf.predict(x_train)
metrics.flat_f1_score(y_train, y_pred,
                      average='weighted', labels=labels)

0.9246451700059085