In [4]:
import nltk
import pycrfsuite
from nltk.corpus import treebank
from nltk.probability import LaplaceProbDist, WittenBellProbDist
from nltk.tag.crf import CRFTagger
from nltk.tag.hmm import HiddenMarkovModelTrainer

In [5]:
# Tagged sentences of the treebank corpus exposed by nltk.corpus.
tagged_sents = treebank.tagged_sents()

# Sentences before TRAIN_SIZE form the training split and the remainder is
# held out for testing. A single named constant keeps both slices in sync.
TRAIN_SIZE = 3000
train_data = tagged_sents[:TRAIN_SIZE]
test_data = tagged_sents[TRAIN_SIZE:]

In [3]:
# One trainer instance, reused for every HMM tagger trained in this notebook.
trainer = HiddenMarkovModelTrainer()

In [6]:
# Supervised HMM training on the training split, using LaplaceProbDist
# as the probability estimator.
tagger = trainer.train_supervised(
    train_data,
    estimator=LaplaceProbDist,
)

In [7]:
# Score the LaplaceProbDist HMM tagger on the same sentences it was trained on.
tagger.evaluate(train_data)

0.8843906026241437

In [8]:
# Score the LaplaceProbDist HMM tagger on the held-out test split.
tagger.evaluate(test_data)

0.8444636304770128

In [9]:
# Supervised HMM training on the training split, using WittenBellProbDist
# as the probability estimator.
tagger_witten = trainer.train_supervised(
    train_data,
    estimator=WittenBellProbDist,
)

In [11]:
# Score the WittenBellProbDist HMM tagger on the same sentences it was trained on.
tagger_witten.evaluate(train_data)

0.9677594147927391

In [12]:
# Score the WittenBellProbDist HMM tagger on the held-out test split.
tagger_witten.evaluate(test_data)

0.9192747679689186

In [13]:
def get_text(sent):
    """Return the words of a tagged sentence as one space-separated string.

    Args:
        sent: iterable of indexable items; element 0 of each item is used
            as the word.

    Returns:
        The words joined by single spaces ("" for an empty sentence).
    """
    return " ".join(pair[0] for pair in sent)

def get_pos(sent):
    """Return the tags of a tagged sentence as one space-separated string.

    Args:
        sent: iterable of indexable items; element 1 of each item is used
            as the tag.

    Returns:
        The tags joined by single spaces ("" for an empty sentence).
    """
    return " ".join(pair[1] for pair in sent)

In [15]:
# Take the tokens of the first held-out sentence straight from its
# (word, tag) pairs. Joining them into a string and splitting it again would
# re-tokenize any token that happens to contain whitespace.
s = [word for word, _tag in test_data[0]]
tagger_witten.tag(s)

[('At', 'IN'),
 ('Tokyo', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('Nikkei', 'JJ'),
 ('index', 'NN'),
 ('of', 'IN'),
 ('225', 'CD'),
 ('selected', 'CD'),
 ('issues', 'NNS'),
 (',', ','),
 ('which', 'WDT'),
 ('*T*-1', '-NONE-'),
 ('gained', 'VBD'),
 ('132', 'NNP'),
 ('points', 'NNP'),
 ('Tuesday', 'NNP'),
 (',', ','),
 ('added', 'VBD'),
 ('14.99', 'JJ'),
 ('points', 'NNS'),
 ('to', 'TO'),
 ('35564.43', 'VB'),
 ('.', '.')]

In [19]:
# Train a CRF tagger on the same training split used for the HMM taggers.
# 'tmp.mdl' is the model file name handed to CRFTagger.train.
crf_tagger = CRFTagger()
crf_tagger.train(train_data, 'tmp.mdl')

In [20]:
# Score the CRF tagger on the same sentences it was trained on.
crf_tagger.evaluate(train_data)

0.959244494329837

In [21]:
# Score the CRF tagger on the held-out test split.
crf_tagger.evaluate(test_data)

0.9474638463198791

In [22]:
# Tag the same token list `s` that was passed to tagger_witten.tag, so the two
# outputs can be compared token by token.
crf_tagger.tag(s)

[('At', 'IN'),
 ('Tokyo', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('Nikkei', 'NNP'),
 ('index', 'NN'),
 ('of', 'IN'),
 ('225', 'CD'),
 ('selected', 'JJ'),
 ('issues', 'NNS'),
 (',', ','),
 ('which', 'WDT'),
 ('*T*-1', '-NONE-'),
 ('gained', 'VBN'),
 ('132', '-NONE-'),
 ('points', 'VBZ'),
 ('Tuesday', 'NNP'),
 (',', ','),
 ('added', 'VBD'),
 ('14.99', 'CD'),
 ('points', 'NNS'),
 ('to', 'TO'),
 ('35564.43', 'CD'),
 ('.', '.')]

In [None]:
# Three-sentence toy corpus in which the word 'tính' occurs with two tags.
# NOTE(review): 'nhanh' is labelled 'N' and the last 'tính' is labelled 'V';
# confirm these labels are intended rather than typos.
train_data_small = [[('tính', 'N'), ('tốt', 'A')],
                    [('tính', 'V'), ('nhanh', 'N')],
                    [('rèn', 'V'), ('tính', 'V')]]

# Use a separate tagger object and a separate model file for the toy corpus.
# Retraining `crf_tagger` into 'tmp.mdl' would replace the treebank-trained
# model, so re-running any earlier CRF cell would silently use the toy model.
crf_tagger_small = CRFTagger()
crf_tagger_small.train(train_data_small, 'tmp_small.mdl')