In [1]:
# Sources: https://github.com/jiaeyan/Jiayan
#          https://github.com/kpu/kenlm

In [2]:
import pandas as pd
from jiayan import load_lm
from jiayan import CharHMMTokenizer
from jiayan import CRFPOSTagger
from jiayan import CRFSentencizer

In [3]:
# Read the Tang-dynasty poem file into a two-column DataFrame.
# Every line is split on ":"; the file's first line is skipped and the column
# names come from `columns` below instead of from the file itself.
TANG_POEMS_PATH = "/Users/jojoli/Documents/夏校申请:项目制作/英才计划/正式培养/chinese/datasets/Tang_Dynasty.txt"
columns = ["type", "content"]

data = pd.read_csv(
    TANG_POEMS_PATH,
    sep=":",          # `sep` and `delimiter` are aliases in pandas
    header=None,      # no header row is parsed; names are given explicitly
    names=columns,
    skiprows=1,
)
data.head()

Unnamed: 0,type,content
0,初晴落景,晚霞聊自怡，初晴弥可喜。日晃百花色，风动千林翠。池鱼跃不同，园鸟声还异。寄言博通者，知予物外志。
1,初夏,一朝春夏改，隔夜鸟花迁。阴阳深浅叶，晓夕重轻烟。哢莺犹响殿，横丝正网天。珮高兰影接，绶细草纹...
2,度秋,夏律昨留灰，秋箭今移晷。峨嵋岫初出，洞庭波渐起。桂白发幽岩，菊黄开灞涘。运流方可叹，含毫属微理。
3,仪鸾殿早秋,寒惊蓟门叶，秋发小山枝。松阴背日转，竹影避风移。提壶菊花岸，高兴芙蓉池。欲知凉气早，巢空燕不窥。
4,秋日即目,爽气浮丹阙，秋光澹紫宫。衣碎荷疏影，花明菊点丛。袍轻低草露，盖侧舞松风。散岫飘云叶，迷路飞烟...


## An Example of Jiayan
### Jiayan POS Tags

Tag | Description | Example
--- | --- | ---
a | adjective | 幽明
b | other noun-modifier | 男，女
c | conjunction | 与，而
d | adverb | 皆
e | exclamation | 呜呼
g | morpheme | 甥
h | prefix | 非
i | idiom | 发愤忘食
j | abbreviation | 五帝
k | suffix | 者
m | number | 一，百
n | general noun | 鬼神，山川
nd | direction noun | 东，西，南，北
nh | person name | 轩辕
ni | organization name | 辽队
nl | location noun | 城北
ns | geographical name | 襄平县
nt | temporal noun | 春，夏，秋，冬
nz | other proper noun | 山海经
o | onomatopoeia | 呜呜
p | preposition | 以，为
q | quantity | 年，岁
r | pronoun | 其，斯
u | auxiliary | 之，所
v | verb | 赐
wp | punctuation | ，。！
ws | foreign words | CPU
x | non-lexeme | 萄，翱
z | descriptive words | 默然，区区

In [6]:
# Run the three Jiayan components on one poem, in order:
# tokenization, POS tagging, and sentence segmentation.

# Directory that holds the language model and the two CRF models.
MODEL_DIR = '/Users/jojoli/jiayan_models'

# The same poem twice: text1 keeps its punctuation, text2 is the identical
# character sequence with all punctuation removed (input for the sentencizer).
text1 = '别来春半，触目柔肠断。砌下落梅如雪乱，拂了一身还满。雁来音信无凭，路遥归梦难成。离恨恰如春草，更行更远还生。'
text2 = '别来春半触目柔肠断砌下落梅如雪乱拂了一身还满雁来音信无凭路遥归梦难成离恨恰如春草更行更远还生'

# One language model instance is passed to both the tokenizer and the sentencizer.
lm = load_lm(MODEL_DIR + '/jiayan.klm')

# 1) Tokenize the punctuated text; the result is materialized with list().
tokenizer = CharHMMTokenizer(lm)
words = list(tokenizer.tokenize(text1))
print(words)

# 2) Tag each token produced above (see the POS tag table for tag meanings).
postagger = CRFPOSTagger()
postagger.load(MODEL_DIR + '/pos_model')
pos_tags = postagger.postag(words)
print('\n', pos_tags)

# 3) Segment the unpunctuated text into sentences.
sentencizer = CRFSentencizer(lm)
sentencizer.load(MODEL_DIR + '/cut_model')
sentences = sentencizer.sentencize(text2)
print('\n', sentences)

['别来', '春', '半', '，', '触目', '柔', '肠断', '。', '砌下', '落梅', '如雪', '乱', '，', '拂', '了一', '身', '还', '满', '。', '雁来', '音信', '无', '凭', '，', '路遥', '归', '梦难成', '。', '离恨', '恰如', '春草', '，', '更', '行', '更远', '还', '生', '。']

 ['nh', 'nt', 'm', 'wp', 'v', 'a', 'n', 'wp', 'v', 'v', 'n', 'a', 'wp', 'v', 'v', 'n', 'd', 'a', 'wp', 'v', 'n', 'v', 'n', 'wp', 'n', 'v', 'n', 'wp', 'v', 'v', 'n', 'wp', 'd', 'v', 'n', 'd', 'v', 'wp']

 ['别来春半', '触目柔肠断砌下落', '梅如雪乱', '拂了一身还满', '雁来音信', '无凭路遥', '归梦难成离恨', '恰如春草', '更行更远还生']
