# Hidden Markov Models for "Parts of speach tagging"

In [2]:
# Paweł Lorek

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import random
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk.tag import hmm
import time



## Parts of speach tagging:

\begin{array}{ccccc}
\textrm{The} & \textrm{dog} & \textrm{ate} & \textrm{the} & \textrm{cat} \\
\downarrow & \downarrow & \downarrow & \downarrow & \downarrow\\
\textrm{DT} & \textrm{NN} & \textrm{VBD} & \textrm{DT} & \textrm{NN}
\end{array}

So-called **Penn's tags** (See https://cs.nyu.edu/~grishman/jet/guide/PennPOS.html or https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)

\begin{array}{ll|ll}
1. & \textrm{CC}   & \textrm{Coordinating conjunction}   & 19. & \textrm{PRP\$} & \textrm{Possessive pronoun} \\
2. & \textrm{CD}   & \textrm{Cardinal number}            & 20. & \textrm{RB}    & \textrm{Adverb} \\
3. & \textrm{DT}   & \textrm{Determiner}                 & 21. & \textrm{RBR}   & \textrm{Adverb, comparative} \\
4. & \textrm{EX}   & \textrm{Existential }  there         & 22. & \textrm{RBS}   & \textrm{Adverb, superlative} \\
5. & \textrm{FW}   & \textrm{Foreign word}               & 23. & \textrm{RP}    & \textrm{Particle} \\
6. & \textrm{IN}   & \textrm{Preposition or subordinating conjunction} & 24. & \textrm{SYM}   & \textrm{Symbol} \\
7. & \textrm{JJ}   & \textrm{Adjective}                  & 25. & \textrm{TO}    & {to} \\
8. & \textrm{JJR}  & \textrm{Adjective, comparative}     & 26. & \textrm{UH}    & \textrm{Interjection} \\
9. & \textrm{JJS}  & \textrm{Adjective, superlative}     & 27. & \textrm{VB}    & \textrm{Verb, base form} \\
10. & \textrm{LS}  & \textrm{List item marker}           & 28. & \textrm{VBD}   & \textrm{Verb, past tense} \\
11. & \textrm{MD}  & \textrm{Modal}                      & 29. & \textrm{VBG}   & \textrm{Verb, gerund or present participle} \\
12. & \textrm{NN}  & \textrm{Noun, singular or mass}     & 30. & \textrm{VBN}   & \textrm{Verb, past participle} \\
13. & \textrm{NNS} & \textrm{Noun, plural}               & 31. & \textrm{VBP}   & \textrm{Verb, non-3rd person singular present} \\
14. & \textrm{NNP} & \textrm{Proper noun, singular}      & 32. & \textrm{VBZ}   & \textrm{Verb, 3rd person singular present} \\
15. & \textrm{NNPS}& \textrm{Proper noun, plural}        & 33. & \textrm{WDT}   & \textrm{Wh-determiner} \\
16. & \textrm{PDT} & \textrm{Predeterminer}              & 34. & \textrm{WP}    & \textrm{Wh-pronoun} \\
17. & \textrm{POS} & \textrm{Possessive ending}          & 35. & \textrm{WP\$}  & \textrm{Possessive wh-pronoun} \\
18. & \textrm{PRP} & \textrm{Personal pronoun}           & 36. & \textrm{WRB}   & \textrm{Wh-adverb} \\
\end{array}


Read data, split into training and test sets

In [5]:
#'Brown' corpus
nltk.download('brown')

#'Tree bank' corpus
nltk.download('treebank')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [6]:
# take only 40k

all_data = list(brown.tagged_sents()[:40000])
print("len(all_data) = ", len(all_data))
#all_data = list(treebank.tagged_sents()[:5000])

train_size = int(0.9*len(all_data))

len(all_data) =  40000


In [4]:
all_data[:2] # two sentences and corresponding speach tags

NameError: name 'all_data' is not defined

## Porter Stemmer
**The Porter Stemming algorithm** (or **Porter Stemmer**) is used to remove the suffixes from an English word and obtain its stem which becomes very useful in the field of Information Retrieval (IR).

In [7]:
words = ["program", "programs", "programmer", "programming", "programmers", "likes", "liked","likely","liking"]

porter = PorterStemmer()

for w in words:
    print(w, " : ", porter.stem(w))

program  :  program
programs  :  program
programmer  :  programm
programming  :  program
programmers  :  programm
likes  :  like
liked  :  like
likely  :  like
liking  :  like


Shuffling data and stemming:

In [8]:
random.shuffle(all_data)


start_time = time.time()



porter = PorterStemmer()
train_data = [ [(porter.stem(word.lower()), tag) for word, tag in sent] for sent in all_data[:train_size]]
test_data = [ [(porter.stem(word.lower()), tag) for word, tag in sent] for sent in all_data[train_size:]]
print("\t\t took %s seconds " % round((time.time() - start_time),5))

		 took 17.19462 seconds 


In [9]:
train_data[0]

[('fruit', 'NN'), ('doe', 'DOZ'), ('nice', 'RB'), ('.', '.')]

## $n$-gram taggers:
"predict" tag for word $w$ using last $n$ words. E.g., for $n=2$, if we are to tag **ate** in sentence `The dog ate the cat`. Then we take into account words 'dog ate` and check what was most frequent tag for `ate` in training data where  `dog ate` appeared.

bigram tagger:

In [10]:
print("Calculating bigram tagger...", end="", flush=True)
start_time = time.time()
bigram_tagger = nltk.BigramTagger(train_data)
bigram_tagger_eval =   bigram_tagger.evaluate(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("bigram tagger: ", bigram_tagger_eval)

Calculating bigram tagger...

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  bigram_tagger_eval =   bigram_tagger.evaluate(test_data)


		 took 5.15385 seconds 
bigram tagger:  0.30751828925154756


trigram tagger:

In [11]:
print("Calculating trigram tagger...", end="", flush=True)
start_time = time.time()
trigram_tagger = nltk.TrigramTagger(train_data)
trigram_tagger_eval =   trigram_tagger.evaluate(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("trigram tagger: ", trigram_tagger_eval)

Calculating trigram tagger...		 took 5.71345 seconds 
trigram tagger:  0.14016882386043894


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  trigram_tagger_eval =   trigram_tagger.evaluate(test_data)


# Hidden Markov Model tagger

Hidden states: tags
Observations: sentences



![picture](https://raw.githubusercontent.com/lorek/MethodsClassDimRed/main/figures/hmm_speach_tag.png)

**NOTE: may take ~ 5 min**

In [12]:
print("Calculating Hmm tagger...", end="", flush=True)
start_time = time.time()
hmm_trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train_supervised(train_data)
hmm_tagger_eval = hmm_tagger.evaluate(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("hmm_tagger: ",hmm_tagger_eval )

Calculating Hmm tagger...

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  hmm_tagger_eval = hmm_tagger.evaluate(test_data)
  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])
  P[i] = self._priors.logprob(si)
  O[i, k] = self._output_logprob(si, self._symbols[k])


		 took 313.82923 seconds 
hmm_tagger:  0.7316038266741699


In [13]:

print("Comparison:")
print("bigram tagger: \t\t", bigram_tagger_eval)
print("trigram tagger: \t", trigram_tagger_eval)
print("hmm_tagger: \t\t",hmm_tagger_eval )

Comparison:
bigram tagger: 		 0.30751828925154756
trigram tagger: 	 0.14016882386043894
hmm_tagger: 		 0.7316038266741699


# Q9.1
* We performed speech tagging for `brown` dataset. Do the same for `treebank` dataset. Compare results with and without stemming.

In [14]:
all_data = list(treebank.tagged_sents())
print("len(all_data) = ", len(all_data))
train_size = int(0.9*len(all_data))
all_data[:2] # two sentences and corresponding speach tags

len(all_data) =  3914


[[('Pierre', 'NNP'),
  ('Vinken', 'NNP'),
  (',', ','),
  ('61', 'CD'),
  ('years', 'NNS'),
  ('old', 'JJ'),
  (',', ','),
  ('will', 'MD'),
  ('join', 'VB'),
  ('the', 'DT'),
  ('board', 'NN'),
  ('as', 'IN'),
  ('a', 'DT'),
  ('nonexecutive', 'JJ'),
  ('director', 'NN'),
  ('Nov.', 'NNP'),
  ('29', 'CD'),
  ('.', '.')],
 [('Mr.', 'NNP'),
  ('Vinken', 'NNP'),
  ('is', 'VBZ'),
  ('chairman', 'NN'),
  ('of', 'IN'),
  ('Elsevier', 'NNP'),
  ('N.V.', 'NNP'),
  (',', ','),
  ('the', 'DT'),
  ('Dutch', 'NNP'),
  ('publishing', 'VBG'),
  ('group', 'NN'),
  ('.', '.')]]

In [15]:
random.shuffle(all_data)

start_time = time.time()

train_data = [ [(word.lower(), tag) for word, tag in sent] for sent in all_data[:train_size]]
test_data = [ [(word.lower(), tag) for word, tag in sent] for sent in all_data[train_size:]]
print("\t\t took %s seconds " % round((time.time() - start_time),5))

print("Calculating bigram tagger...", end="", flush=True)
start_time = time.time()
bigram_tagger = nltk.BigramTagger(train_data)
bigram_tagger_eval =   bigram_tagger.evaluate(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("bigram tagger: ", bigram_tagger_eval)


print("Calculating trigram tagger...", end="", flush=True)
start_time = time.time()
trigram_tagger = nltk.TrigramTagger(train_data)
trigram_tagger_eval =   trigram_tagger.evaluate(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("trigram tagger: ", trigram_tagger_eval)

print("Calculating Hmm tagger...", end="", flush=True)
start_time = time.time()
hmm_trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train_supervised(train_data)
hmm_tagger_eval = hmm_tagger.evaluate(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("hmm_tagger: ",hmm_tagger_eval )


print("Comparison:")
print("bigram tagger: \t\t", bigram_tagger_eval)
print("trigram tagger: \t", trigram_tagger_eval)
print("hmm_tagger: \t\t",hmm_tagger_eval )

		 took 0.06001 seconds 
Calculating bigram tagger...		 took 0.90791 seconds 
bigram tagger:  0.16463474105139958
Calculating trigram tagger...

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  bigram_tagger_eval =   bigram_tagger.evaluate(test_data)


		 took 0.51982 seconds 
trigram tagger:  0.08456061640495464
Calculating Hmm tagger...

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  trigram_tagger_eval =   trigram_tagger.evaluate(test_data)
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  hmm_tagger_eval = hmm_tagger.evaluate(test_data)


		 took 3.91303 seconds 
hmm_tagger:  0.5199453818394616
Comparison:
bigram tagger: 		 0.16463474105139958
trigram tagger: 	 0.08456061640495464
hmm_tagger: 		 0.5199453818394616


In [None]:
random.shuffle(all_data)

start_time = time.time()

porter = PorterStemmer()
train_data = [ [(porter.stem(word.lower()), tag) for word, tag in sent] for sent in all_data[:train_size]]
test_data = [ [(porter.stem(word.lower()), tag) for word, tag in sent] for sent in all_data[train_size:]]
print("\t\t took %s seconds " % round((time.time() - start_time),5))

print("Calculating bigram tagger...", end="", flush=True)
start_time = time.time()
bigram_tagger = nltk.BigramTagger(train_data)
bigram_tagger_eval =   bigram_tagger.evaluate(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("bigram tagger: ", bigram_tagger_eval)


print("Calculating trigram tagger...", end="", flush=True)
start_time = time.time()
trigram_tagger = nltk.TrigramTagger(train_data)
trigram_tagger_eval =   trigram_tagger.evaluate(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("trigram tagger: ", trigram_tagger_eval)

print("Calculating Hmm tagger...", end="", flush=True)
start_time = time.time()
hmm_trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train_supervised(train_data)
hmm_tagger_eval = hmm_tagger.evaluate(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("hmm_tagger: ",hmm_tagger_eval )


print("Comparison:")
print("bigram tagger: \t\t", bigram_tagger_eval)
print("trigram tagger: \t", trigram_tagger_eval)
print("hmm_tagger: \t\t",hmm_tagger_eval )

		 took 1.56982 seconds 
Calculating bigram tagger...		 took 0.63214 seconds 
bigram tagger:  0.17774282343688558
Calculating trigram tagger...

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  bigram_tagger_eval =   bigram_tagger.evaluate(test_data)


		 took 0.68951 seconds 
trigram tagger:  0.09683444750294927
Calculating Hmm tagger...

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  trigram_tagger_eval =   trigram_tagger.evaluate(test_data)
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  hmm_tagger_eval = hmm_tagger.evaluate(test_data)


		 took 3.53523 seconds 
hmm_tagger:  0.5467951238694455
Comparison:
bigram tagger: 		 0.17774282343688558
trigram tagger: 	 0.09683444750294927
hmm_tagger: 		 0.5467951238694455
