# Chinese Word Segmentation

The training/test data is publicly avaialbe here: http://sighan.cs.uchicago.edu/bakeoff2005/

## Prepare Training Data

In [1]:
raw_train = []
raw_test = []
with open("data/as_training.utf8") as fin:
    for line in fin:
        raw_train.append(line.strip().split("　"))   # It is a full white space

with open("data/as_testing_gold.utf8") as fin:
    for line in fin:
        raw_test.append(line.strip().split("　"))   # It is a full white space

print("Number of sentences in the training data: %d" % len(raw_train))
print("Number of sentences in the test data: %d" % len(raw_test))


Number of sentences in the training data: 708953
Number of sentences in the test data: 14432


## Use jieba

pip3 install jieba

In [3]:
import jieba

print(list(jieba.cut("".join(raw_test[0]))))

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/q2/3j47nw291jl5bwz3l9y9cb3r0000gn/T/jieba.cache
Loading model cost 0.783 seconds.
Prefix dict has been built succesfully.


['許多', '社區長', '青學苑', '多', '開設', '有書法', '、', '插花', '、', '土風', '舞班', '，']


pip3 install hanziconv

In [6]:
from hanziconv.hanziconv import HanziConv

print(list(jieba.cut(HanziConv.toSimplified("".join(raw_test[0])))))

['许多', '社区', '长青', '学苑', '多', '开设', '有', '书法', '、', '插花', '、', '土风舞', '班', '，']


In [14]:
def restore(text, toks):
    results = []
    offset = 0
    for tok in toks:
        results.append(text[offset:offset + len(tok)])
        offset += len(tok)
    return results

text = "".join(raw_test[0])
print(restore(text, list(jieba.cut(HanziConv.toSimplified(text)))))

['許多', '社區', '長青', '學苑', '多', '開設', '有', '書法', '、', '插花', '、', '土風舞', '班', '，']


## Build Our Own Model

### Convert a list of words to a sequence of tags


In [7]:
def words_to_tags(words):
    tags = []
    for word in words:
        if len(word) == 1:
            tags.append('S')
        else:
            for i in range(len(word)):
                if i == 0:
                    tags.append('L')
                elif i == len(word) - 1:
                    tags.append('R')
                else:
                    tags.append('M')
    return tags
    
train_X = []
train_Y = []

test_X = []
test_Y = []

for sent in raw_train:
    train_X.append(list("".join(sent)))  # Make the unsegmented sentence as a sequence of characters
    train_Y.append(words_to_tags(sent))
    
for sent in raw_test:
    test_X.append(list("".join(sent)))  # Make the unsegmented sentence
    test_Y.append(words_to_tags(sent))
    
print(test_X[0])
print(test_Y[0])


['許', '多', '社', '區', '長', '青', '學', '苑', '多', '開', '設', '有', '書', '法', '、', '插', '花', '、', '土', '風', '舞', '班', '，']
['L', 'R', 'L', 'R', 'L', 'R', 'L', 'R', 'S', 'L', 'R', 'S', 'L', 'R', 'S', 'L', 'R', 'S', 'L', 'M', 'M', 'R', 'S']


## Create a CRF model for word segmentation 

In [8]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics

def extract_sent_features(x):
    sent_features = []
    for i in range(len(x)):
        sent_features.append(extract_char_features(x, i))
    return sent_features
    
def extract_char_features(sent, position):
    char_features = {}
    for i in range(-3, 4):
        if len(sent) > position + i >= 0:
            char_features['char_at_%d' % i] = sent[position + i]
    return char_features

crf_tagger = sklearn_crfsuite.CRF(algorithm='lbfgs', min_freq=20, max_iterations=300, verbose=True)

feature_X = []
for x in train_X:
    feature_X.append(extract_sent_features(x))
crf_tagger.fit(feature_X, train_Y)

loading training data to CRFsuite: 100%|██████████| 708953/708953 [00:54<00:00, 13081.84it/s]



Feature generation
type: CRF1d
feature.minfreq: 20.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 70338
Seconds required: 12.666

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 300
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=4.48  loss=9523529.41 active=70226 feature_norm=1.00
Iter 2   time=2.19  loss=5960274.10 active=70338 feature_norm=5.08
Iter 3   time=4.35  loss=5632639.26 active=70338 feature_norm=5.60
Iter 4   time=2.17  loss=5317692.30 active=70338 feature_norm=6.39
Iter 5   time=2.16  loss=3919265.17 active=70338 feature_norm=12.31
Iter 6   time=2.14  loss=3640476.35 active=70338 feature_norm=14.54
Iter 7   time=2.15  loss=3411889.18 active=70338 feature_norm=14.45
Iter 8   time=2.15  loss=3244375.69 active=70338 feature_norm=16.20
Iter 9   time=2.17  loss=3037585.23 active=70338 feature_

Iter 122 time=2.15  loss=1287876.14 active=70338 feature_norm=105.85
Iter 123 time=2.18  loss=1285259.39 active=70338 feature_norm=106.74
Iter 124 time=2.18  loss=1283987.74 active=70338 feature_norm=107.11
Iter 125 time=2.17  loss=1283152.19 active=70338 feature_norm=107.31
Iter 126 time=4.34  loss=1282309.57 active=70338 feature_norm=107.70
Iter 127 time=2.22  loss=1281158.09 active=70338 feature_norm=108.08
Iter 128 time=2.29  loss=1280696.61 active=70338 feature_norm=108.27
Iter 129 time=2.34  loss=1280210.39 active=70338 feature_norm=108.91
Iter 130 time=2.23  loss=1279518.39 active=70338 feature_norm=108.85
Iter 131 time=2.13  loss=1279258.60 active=70338 feature_norm=108.85
Iter 132 time=2.27  loss=1278730.65 active=70338 feature_norm=109.00
Iter 133 time=2.20  loss=1278041.12 active=70338 feature_norm=109.40
Iter 134 time=2.18  loss=1277931.12 active=70338 feature_norm=109.74
Iter 135 time=2.14  loss=1276438.89 active=70338 feature_norm=110.42
Iter 136 time=2.15  loss=1275877.7

Iter 241 time=2.13  loss=1238402.99 active=70338 feature_norm=131.31
Iter 242 time=2.15  loss=1238034.21 active=70338 feature_norm=131.50
Iter 243 time=2.17  loss=1237771.79 active=70338 feature_norm=131.70
Iter 244 time=2.13  loss=1237598.45 active=70338 feature_norm=131.86
Iter 245 time=2.15  loss=1237430.00 active=70338 feature_norm=131.92
Iter 246 time=2.15  loss=1237184.25 active=70338 feature_norm=132.01
Iter 247 time=2.16  loss=1237140.86 active=70338 feature_norm=132.24
Iter 248 time=2.14  loss=1236922.59 active=70338 feature_norm=132.22
Iter 249 time=2.14  loss=1236846.51 active=70338 feature_norm=132.25
Iter 250 time=2.13  loss=1236786.50 active=70338 feature_norm=132.32
Iter 251 time=2.14  loss=1236682.09 active=70338 feature_norm=132.38
Iter 252 time=2.12  loss=1236417.66 active=70338 feature_norm=132.61
Iter 253 time=4.27  loss=1236304.73 active=70338 feature_norm=132.63
Iter 254 time=2.12  loss=1236138.68 active=70338 feature_norm=132.70
Iter 255 time=2.13  loss=1235968.6

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=None, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=300,
  max_linesearch=None, min_freq=20, model_filename=None, num_memories=None,
  pa_type=None, period=None, trainer_cls=None, variance=None, verbose=True)

In [15]:
def segment(sent):
    tags = crf_tagger.predict_single(extract_sent_features(list(sent)))
    tokens = []
    tok = ""
    for ch, tag in zip(list(sent), tags):
        if tag in ['S', 'L'] and tok != "":
            tokens.append(tok)
            tok = ""
        tok += ch
    if tok:
        tokens.append(tok)
    return tokens
            
print(segment("法國總統馬克宏已到現場勘災，初步傳出火警可能與目前聖母院的維修工程有關。"))
    

['法國', '總統', '馬克宏', '已', '到', '現場', '勘災', '，', '初步', '傳出', '火警', '可能', '與', '目前', '聖母院', '的', '維修', '工程', '有關', '。']


## Evaluation

### Scorer for CWS

In [16]:
def compare(actual_toks, pred_toks):
    i = 0
    j = 0
    p = 0
    q = 0
    tp = 0
    fp = 0
    while i < len(actual_toks) and j < len(pred_toks):
        if p == q:
            if actual_toks[i] == pred_toks[j]:
                tp += 1
            else:
                fp += 1
            p += len(actual_toks[i])
            q += len(pred_toks[j])
            i += 1
            j += 1
        elif p < q:
            p += len(actual_toks[i])
            i += 1
        else:
            fp += 1
            q += len(pred_toks[j])
            j += 1
    return tp, fp, len(actual_toks)
    
def score(actual_sents, pred_sents):
    tp = 0
    fp = 0
    total = 0
    for actual_toks, pred_toks in zip(actual_sents, pred_sents):
        tp_, fp_, total_ = compare(actual_toks, pred_toks)
        tp += tp_
        fp += fp_
        total += total_
    recall = float(tp) / total
    precision = float(tp) / (tp + fp)
    f1 = 2.0 * recall * precision / (recall + precision)
    return recall, precision, f1        

### Testing

In [17]:
pred = []
actual = []
for sent in raw_test:
    pred.append(segment("".join(sent)))
    actual.append(sent)
print(actual[0])
print(pred[0])

print(score(actual, pred))

['許多', '社區', '長青', '學苑', '多', '開設', '有', '書法', '、', '插花', '、', '土風舞班', '，']
['許多', '社區長', '青學', '苑多', '開設', '有', '書法', '、', '插花', '、', '土風舞班', '，']
(0.8590559143994259, 0.8518791094145525, 0.8554524597486447)


### Compared with jieba

In [18]:
pred = []
actual = []
fout = open("jieba.out", "w")
for sent in raw_test:
    text = "".join(sent)
    r = list(jieba.cut(HanziConv.toSimplified(text)))
    r = restore(text, r)
    fout.write(" ".join(r) + "\n")
    pred.append(r)
    actual.append(sent)
print(actual[0])
print(pred[0])

print(score(actual, pred))

['許多', '社區', '長青', '學苑', '多', '開設', '有', '書法', '、', '插花', '、', '土風舞班', '，']
['許多', '社區', '長青', '學苑', '多', '開設', '有', '書法', '、', '插花', '、', '土風舞', '班', '，']
(0.8148284073856593, 0.8291644535918204, 0.8219339234591463)


### Stanford CoreNLP

pip3 install stanfordnlp

In [19]:
import stanfordnlp

stanfordnlp.download('zh')

Using the default treebank "zh_gsd" for language "zh".
Would you like to download the models for: zh_gsd now? (Y/n)
Y

Default download directory: /Users/hhhuang/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: zh_gsd
Download location: /Users/hhhuang/stanfordnlp_resources/zh_gsd_models.zip


100%|██████████| 234M/234M [06:54<00:00, 568kB/s] 



Download complete.  Models saved to: /Users/hhhuang/stanfordnlp_resources/zh_gsd_models.zip
Extracting models file for: zh_gsd
Cleaning up...Done.
Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/hhhuang/stanfordnlp_resources/zh_gsd_models/zh_gsd_tokenizer.pt', 'lang': 'zh', 'shorthand': 'zh_gsd', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/hhhuang/stanfordnlp_resources/zh_gsd_models/zh_gsd_tagger.pt', 'pretrain_path': '/Users/hhhuang/stanfordnlp_resources/zh_gsd_models/zh_gsd.pretrain.pt', 'lang': 'zh', 'shorthand': 'zh_gsd', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/hhhuang/stanfordnlp_resources/zh_gsd_models/zh_gsd_lemmatizer.pt', 'lang': 'zh', 'shorthand': 'zh_gsd', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings:



In [28]:
nlp = stanfordnlp.Pipeline(lang='zh')

text = "".join(raw_test[0])
results = nlp(HanziConv.toSimplified(text))
results.sentences[0].tokens

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/hhhuang/stanfordnlp_resources/zh_gsd_models/zh_gsd_tokenizer.pt', 'lang': 'zh', 'shorthand': 'zh_gsd', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/hhhuang/stanfordnlp_resources/zh_gsd_models/zh_gsd_tagger.pt', 'pretrain_path': '/Users/hhhuang/stanfordnlp_resources/zh_gsd_models/zh_gsd.pretrain.pt', 'lang': 'zh', 'shorthand': 'zh_gsd', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/hhhuang/stanfordnlp_resources/zh_gsd_models/zh_gsd_lemmatizer.pt', 'lang': 'zh', 'shorthand': 'zh_gsd', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/hhhuang/stanfordnlp_resources/zh_gsd_models/zh_gsd_parser.pt', 'pretrain_path': '/Users/hhhuang/stanfordnlp_resources/zh_gs



[<Token index=1;words=[<Word index=1;text=许多;lemma=许多;upos=PROPN;xpos=NNP;feats=_;governor=4;dependency_relation=nmod>]>,
 <Token index=2;words=[<Word index=2;text=社区;lemma=社区;upos=NOUN;xpos=NN;feats=_;governor=4;dependency_relation=nmod>]>,
 <Token index=3;words=[<Word index=3;text=长青;lemma=长青;upos=ADJ;xpos=JJ;feats=_;governor=4;dependency_relation=amod>]>,
 <Token index=4;words=[<Word index=4;text=学苑;lemma=学苑;upos=NOUN;xpos=NN;feats=_;governor=6;dependency_relation=nsubj>]>,
 <Token index=5;words=[<Word index=5;text=多;lemma=多;upos=NUM;xpos=CD;feats=NumType=Card;governor=6;dependency_relation=advmod>]>,
 <Token index=6;words=[<Word index=6;text=开;lemma=开;upos=VERB;xpos=VV;feats=_;governor=0;dependency_relation=root>]>,
 <Token index=7;words=[<Word index=7;text=设;lemma=设;upos=VERB;xpos=VV;feats=_;governor=6;dependency_relation=xcomp>]>,
 <Token index=8;words=[<Word index=8;text=有;lemma=有;upos=VERB;xpos=VV;feats=_;governor=7;dependency_relation=xcomp>]>,
 <Token index=9;words=[<Word ind

In [29]:
toks = []
for t in results.sentences[0].tokens:
    toks.append(t.text)
    
print(toks)

['许多', '社区', '长青', '学苑', '多', '开', '设', '有', '书法', '、', '插花', '、', '土风', '舞班', '，']


In [31]:
print(restore(text, toks))

['許多', '社區', '長青', '學苑', '多', '開', '設', '有', '書法', '、', '插花', '、', '土風', '舞班', '，']
