# 4-1. **POS Tagging**
POS tagging is the process of marking up each word in a corpus with its corresponding part-of-speech tag, based on its definition and its context. This task is not straightforward, as a particular word may have a different part of speech depending on the context in which it is used.

## Regular Expression Tagger

The regular expression tagger assigns tags to tokens on the basis of matching patterns. For instance, we might guess that any word ending in `ed` is the past tense or past participle of a verb, and any word ending in `'s` is a possessive noun. We can express these guesses as a list of regular expressions:



In [None]:
import nltk

# Downloading required corpus
# 'punkt' holds the tokenizer models used by word_tokenize;
# 'brown' is the tagged Brown corpus used throughout this notebook.
nltk.download('punkt')
nltk.download('brown')

from nltk import word_tokenize
from nltk.corpus import brown

# Restrict both views to the 'news' category so that sentence i of
# brown_sents (plain tokens) corresponds to sentence i of
# brown_tagged_sents ((word, tag) pairs).
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [None]:
 # Define regular expression patterns
# (regex, tag) pairs for nltk.RegexpTagger. The tagger tries the patterns in
# order and uses the first one that matches, so specific suffixes must come
# before more general ones and the catch-all default must stay last.
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    # The decimal point is escaped: an unescaped '.' would accept any
    # character between the digit groups (e.g. '3x14') as a number.
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN'),                     # nouns (default)
]

In [None]:
# Build regular expression tagger using the defined patterns
# (nothing is trained here: tags come only from the regex list)
regexp_tagger = nltk.RegexpTagger(patterns)

# Print one of the (untagged) sentences
print(brown_sents[3])
# Print the same sentence as tagged by the regular expression tagger
print(regexp_tagger.tag(brown_sents[3]))

['``', 'Only', 'a', 'relative', 'handful', 'of', 'such', 'reports', 'was', 'received', "''", ',', 'the', 'jury', 'said', ',', '``', 'considering', 'the', 'widespread', 'interest', 'in', 'the', 'election', ',', 'the', 'number', 'of', 'voters', 'and', 'the', 'size', 'of', 'this', 'city', "''", '.']
[('``', 'NN'), ('Only', 'NN'), ('a', 'NN'), ('relative', 'NN'), ('handful', 'NN'), ('of', 'NN'), ('such', 'NN'), ('reports', 'NNS'), ('was', 'NNS'), ('received', 'VBD'), ("''", 'NN'), (',', 'NN'), ('the', 'NN'), ('jury', 'NN'), ('said', 'NN'), (',', 'NN'), ('``', 'NN'), ('considering', 'VBG'), ('the', 'NN'), ('widespread', 'NN'), ('interest', 'NN'), ('in', 'NN'), ('the', 'NN'), ('election', 'NN'), (',', 'NN'), ('the', 'NN'), ('number', 'NN'), ('of', 'NN'), ('voters', 'NNS'), ('and', 'NN'), ('the', 'NN'), ('size', 'NN'), ('of', 'NN'), ('this', 'NNS'), ('city', 'NN'), ("''", 'NN'), ('.', 'NN')]


In [None]:
# Print the corpus's own tags for sentence 3, to compare with the tagger output
print(brown_tagged_sents[3])

[('``', '``'), ('Only', 'RB'), ('a', 'AT'), ('relative', 'JJ'), ('handful', 'NN'), ('of', 'IN'), ('such', 'JJ'), ('reports', 'NNS'), ('was', 'BEDZ'), ('received', 'VBN'), ("''", "''"), (',', ','), ('the', 'AT'), ('jury', 'NN'), ('said', 'VBD'), (',', ','), ('``', '``'), ('considering', 'IN'), ('the', 'AT'), ('widespread', 'JJ'), ('interest', 'NN'), ('in', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('the', 'AT'), ('number', 'NN'), ('of', 'IN'), ('voters', 'NNS'), ('and', 'CC'), ('the', 'AT'), ('size', 'NN'), ('of', 'IN'), ('this', 'DT'), ('city', 'NN'), ("''", "''"), ('.', '.')]


In [None]:
# Evaluate the tagger: score its tags against the gold-tagged sentences.
# accuracy() is the replacement for the deprecated evaluate() and takes the
# same gold data argument.
regexp_tagger.accuracy(brown_tagged_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  regexp_tagger.evaluate(brown_tagged_sents)


0.20326391789486245

In [None]:
# English sample sentence; the word 'race' appears twice in different roles
text = 'This race is awesome, I want to race too'

In [None]:
# Split the English sample sentence into tokens
tokens = word_tokenize(text)

# Tag the tokens with the regular expression tagger
print(regexp_tagger.tag(tokens))

[('This', 'NNS'), ('race', 'NN'), ('is', 'NNS'), ('awesome', 'NN'), (',', 'NN'), ('I', 'NN'), ('want', 'NN'), ('to', 'NN'), ('race', 'NN'), ('too', 'NN')]


## Hidden Markov Models

A hidden Markov model (HMM) allows us to talk about both observed events (like words that we see in the input) and hidden events (like part-of-speech tags) that we think of as causal factors in our probabilistic model.

In [None]:
# Hidden Markov Models in Python
# Katrin Erk, March 2013 updated March 2016
#
# This HMM addresses the problem of part-of-speech tagging. It estimates
# the probability of a tag sequence for a given word sequence as follows:
#
# Say words = w1....wN
# and tags = t1..tN
#
# then
# P(tags | words) is_proportional_to product P(ti | t{i-1}) P(wi | ti)
#
# To find the best tag sequence for a given sequence of words,
# we want to find the tag sequence that has the maximum P(tags | words)
import nltk
import sys
# Make sure the Brown corpus is available locally before importing it
nltk.download('brown')

from nltk.corpus import brown
from nltk.corpus import treebank

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [None]:
# Estimating P(wi | ti) from corpus data using Maximum Likelihood Estimation (MLE):
# P(wi | ti) = count(wi, ti) / count(ti)
#
# We add an artificial 'start' tag at the beginning of each sentence, and
# We add an artificial 'end' tag at the end of each sentence.
# So we start out with the brown tagged sentences,
# add the two artificial tags,
# and then make one long list of all the tag/word pairs.

# Tagged sentences of the whole Brown corpus (no category filter).
brown_tagged_sents = brown.tagged_sents()

# One flat list of (tag, word) pairs for the entire corpus. Every sentence is
# wrapped in an artificial START/START pair and an END/END pair.
brown_tags_words = []
for sent in brown_tagged_sents:
    brown_tags_words += [
        ('START', 'START'),
        # the corpus yields (word, tag); flip to (tag, word) and keep only
        # the first two characters of the tag
        *((tag[:2], word) for (word, tag) in sent),
        ('END', 'END'),
    ]

# count(wi, ti): for each tag, how often each word occurs with it
cfd_tagwords = nltk.ConditionalFreqDist(brown_tags_words)
# P(wi | ti): maximum likelihood estimate derived from those counts
cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

print(
    'The probability of an adjective (JJ) being "new" is',
    cpd_tagwords['JJ'].prob('new'),
)
print(
    'The probability of a verb (VB) being "duck" is',
    cpd_tagwords['VB'].prob('duck'),
)

In [None]:
# Estimating P(ti | t{i-1}) from corpus data using Maximum Likelihood Estimation (MLE):
# P(ti | t{i-1}) = count(t{i-1}, ti) / count(t{i-1})
#
# Tag sequence of the whole corpus, taken from the (tag, word) pairs built
# earlier, so it includes the artificial START and END tags.
brown_tags = [tag for (tag, word) in brown_tags_words]
# make conditional frequency distribution:
# count(t{i-1}, ti)
cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))
# make conditional probability distribution, using
# maximum likelihood estimate:
# P(ti | t{i-1})
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

print('If we have just seen "DT", the probability of "NN" is', cpd_tags['DT'].prob('NN'))
# The queried tag must match the tag named in the message: this line reports
# P(JJ | VB), so it looks up 'JJ'.
print('If we have just seen "VB", the probability of "JJ" is', cpd_tags['VB'].prob('JJ'))
print('If we have just seen "VB", the probability of "NN" is', cpd_tags['VB'].prob('NN'))

The probability of an adjective (JJ) being "new" is 0.01472344917632025
The probability of a verb (VB) being "duck" is 6.042713350943527e-05
If we have just seen "DT", the probability of "NN" is 0.5057722522030194
If we have just seen "VB", the probability of "JJ" is 0.016885067592065053
If we have just seen "VB", the probability of "NN" is 0.10970977711020183


##  Train HMM Tagger with NLTK HMM Trainer

In [None]:
# Pretagged training data: tagged sentences from the whole Brown corpus
# (no category filter), each a list of (word, tag) pairs
brown_tagged_sents = brown.tagged_sents()

# Show the training data
print(brown_tagged_sents)

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

In [None]:
# Import HMM module
from nltk.tag import hmm

# Set up a trainer with default (None) values
# and train it on the tagged sentences
trainer = hmm.HiddenMarkovModelTrainer()
trained_tagger = trainer.train_supervised(brown_tagged_sents)

# Print the basic data about the tagger
print (trained_tagger)

# Tag the English sample sentence with the trained HMM tagger
tokens = word_tokenize(text)
print(trained_tagger.tag(tokens))

<HiddenMarkovModelTagger 472 states and 56057 output symbols>
[('This', 'DT'), ('race', 'NN'), ('is', 'BEZ'), ('awesome', 'JJ'), (',', ','), ('I', 'PPSS'), ('want', 'VB'), ('to', 'TO'), ('race', 'VB'), ('too', 'QL')]


In [None]:
# Korean sample sentence (roughly: "This race is really awesome, I want to race too")
korean_text = '이번 경주는 정말 멋진데, 나도 경주하고 싶다'

In [None]:
# Run the same tokenizer and the Brown-trained HMM tagger on the Korean
# sentence to see how a tagger trained on English data handles it
tokens = word_tokenize(korean_text)
print(trained_tagger.tag(tokens))

[('이번', 'AT'), ('경주는', 'AT'), ('정말', 'AT'), ('멋진데', 'AT'), (',', 'AT'), ('나도', 'AT'), ('경주하고', 'AT'), ('싶다', 'AT')]


## POS Tagging for Korean with [Kkma](http://kkma.snu.ac.kr/documents/?doc=postag)

In [None]:
# Install konlpy
!pip install -q konlpy

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import Kkma module
from konlpy.tag import Kkma
# Create one Kkma analyzer instance; the cells below reuse it
kkma = Kkma()

In [None]:
# tokenization of the English sample sentence with the Korean analyzer
tokens = kkma.morphs(text)
print(tokens)

# POS tagging of the same English sentence
tags = kkma.pos(text)
print(tags)

['This', 'race', 'is', 'awesome', ',', 'I', 'want', 'to', 'race', 'too']
[('This', 'OL'), ('race', 'OL'), ('is', 'OL'), ('awesome', 'OL'), (',', 'SP'), ('I', 'OL'), ('want', 'OL'), ('to', 'OL'), ('race', 'OL'), ('too', 'OL')]


In [None]:
# tokenization of the Korean sample sentence
tokens = kkma.morphs(korean_text)
print(tokens)

# POS tagging of the Korean sample sentence
tags = kkma.pos(korean_text)
print(tags)

['이번', '경주', '는', '정말', '멋지', 'ㄴ데', ',', '나도', '경주', '하', '고', '싶', '다']
[('이번', 'NNG'), ('경주', 'NNG'), ('는', 'JX'), ('정말', 'MAG'), ('멋지', 'VA'), ('ㄴ데', 'ECE'), (',', 'SP'), ('나도', 'NNG'), ('경주', 'NNG'), ('하', 'XSV'), ('고', 'ECE'), ('싶', 'VXA'), ('다', 'EFN')]
