# **References**
This tutorial is based on https://github.com/hb20007/hands-on-nltk-tutorial

# 1.1 **Downloading NLTK Libraries:** *Getting ready to start!*

In [1]:
# We install and import necessary python libraries

# !pip install nltk TwitterSearch unidecode langdetect langid gensim

import nltk # https://www.nltk.org/install.html
import numpy # https://www.scipy.org/install.html
import matplotlib.pyplot # https://matplotlib.org/downloads.html
import tweepy # https://github.com/tweepy/tweepy
import TwitterSearch # https://github.com/ckoepp/TwitterSearch
import unidecode # https://pypi.python.org/pypi/Unidecode
import langdetect # https://pypi.python.org/pypi/langdetect
import langid # https://github.com/saffsd/langid.py
import gensim # https://radimrehurek.com/gensim/install.html


# Nltk has many extra functionalities that can be downloaded

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('reuters')
# nltk.download('wordnet')
# nltk.download('words')
# nltk.download('brown')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('tagsets')
# nltk.download('maxent_ne_chunker')
# nltk.download('names')

# !apt-get install unzip
# !unzip /root/nltk_data/corpora/reuters.zip -d /root/nltk_data/corpora
# #!unzip /root/nltk_data/corpora/stopwords.zip -d /root/nltk_data/corpora
# #from nltk.corpus import reuters



# 3.3 **Creating a POS Tagger:** *Creating a Parts Of Speech tagger*

We can train a classifier to work out which suffixes are most informative for POS tagging. We can begin by finding out what the most common suffixes are

In [2]:
from nltk.corpus import brown
from nltk import FreqDist

suffix_fdist = FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
    
suffix_fdist

FreqDist({'e': 202946, ',': 175002, '.': 152999, 's': 128722, 'd': 105687, 't': 94459, 'he': 92084, 'n': 87889, 'a': 74912, 'of': 72978, ...})

In [3]:
word = brown.words()[0][-3:]
# print(word)
suffix_fdist


FreqDist({'e': 202946, ',': 175002, '.': 152999, 's': 128722, 'd': 105687, 't': 94459, 'he': 92084, 'n': 87889, 'a': 74912, 'of': 72978, ...})

In [4]:
suffix_fdist

FreqDist({'e': 202946, ',': 175002, '.': 152999, 's': 128722, 'd': 105687, 't': 94459, 'he': 92084, 'n': 87889, 'a': 74912, 'of': 72978, ...})

In [5]:
word = brown.words()[0]
word, word[-1:]

('The', 'e')

In [6]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
common_suffixes[:10]

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of']

Next, we'll define a feature extractor function which checks a given word for these suffixes:

In [7]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

for k, v in pos_features('test').items():
    if v == True:
        print(k, v)

endswith(t) True
endswith(st) True


In [8]:

for k, v in pos_features('noun').items():
    if v == True:
        print(k, v)

endswith(n) True


Now that we've defined our feature extractor, we can use it to train a new decision tree classifier:

In [9]:
brown.tagged_words(categories="news")

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [10]:
tagged_words = brown.tagged_words(categories='news')
tagged_words[:10]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN')]

In [11]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
featuresets[0]

({'endswith(e)': True,
  'endswith(,)': False,
  'endswith(.)': False,
  'endswith(s)': False,
  'endswith(d)': False,
  'endswith(t)': False,
  'endswith(he)': True,
  'endswith(n)': False,
  'endswith(a)': False,
  'endswith(of)': False,
  'endswith(the)': True,
  'endswith(y)': False,
  'endswith(r)': False,
  'endswith(to)': False,
  'endswith(in)': False,
  'endswith(f)': False,
  'endswith(o)': False,
  'endswith(ed)': False,
  'endswith(nd)': False,
  'endswith(is)': False,
  'endswith(on)': False,
  'endswith(l)': False,
  'endswith(g)': False,
  'endswith(and)': False,
  'endswith(ng)': False,
  'endswith(er)': False,
  'endswith(as)': False,
  'endswith(ing)': False,
  'endswith(h)': False,
  'endswith(at)': False,
  'endswith(es)': False,
  'endswith(or)': False,
  'endswith(re)': False,
  'endswith(it)': False,
  'endswith(``)': False,
  'endswith(an)': False,
  "endswith('')": False,
  'endswith(m)': False,
  'endswith(;)': False,
  'endswith(i)': False,
  'endswith(ly)': 

In [12]:
from nltk import DecisionTreeClassifier
from nltk.classify import accuracy

cutoff = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[cutoff:], featuresets[:cutoff]

In [13]:
len(featuresets), cutoff

(100554, 10055)

In [14]:
#classifier = DecisionTreeClassifier.train(train_set) # NLTK is a teaching toolkit which is not really optimized for speed. Therefore, this may take forever. For speed, use scikit-learn for the classifiers.

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
#bayes_classifier = SklearnClassifier(BernoulliNB(), sparse=False).train(train_set)
#svm_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
classifier = SklearnClassifier(DecisionTreeClassifier(), sparse=False).train(train_set)
#print(accuracy(classifier, test_set) , accuracy(bayes_classifier, test_set))
#print(accuracy(classifier, test_set) , accuracy(svm_classifier, test_set), accuracy(bayes_classifier, test_set))

In [15]:
#classifier = DecisionTreeClassifier.train(train_set) # NLTK is a teaching toolkit which is not really optimized for speed. Therefore, this may take forever. For speed, use scikit-learn for the classifiers.

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB

classifier = SklearnClassifier(DecisionTreeClassifier(), sparse=False).train(train_set)


In [16]:
accuracy(classifier, test_set)

0.6459472899055196

In [17]:
classifier.classify(pos_features('cats'))

'NNS'

In [18]:
classifier.classify(pos_features('Obama'))

'AT'

In [19]:
#classifier.pseudocode(depth=4)

## Assembling the classifier & Improving it

In [20]:
# Let's assemble the parts
from nltk.classify import SklearnClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import brown
from nltk import FreqDist
from nltk.classify import accuracy

""" Make the FreqDist """
suffix_fdist = FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

""" Get the Common Suffixes as a list """
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(200)]

""" Define a pos_features function """

def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features
    
""" Get tagged_words from Brown corpus and define featuresets """
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

""" Define a cutoff, and make the train and test sets """
cutoff = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[cutoff:], featuresets[:cutoff]

# This may take a few minutes!
""" Let's use the Decision Tree Classifier to train """
classifier = SklearnClassifier(DecisionTreeClassifier(), sparse=False).train(train_set)

In [21]:
accuracy(classifier, test_set)

0.6922923918448534

In [22]:
featuresets[0]

({'endswith(e)': True,
  'endswith(,)': False,
  'endswith(.)': False,
  'endswith(s)': False,
  'endswith(d)': False,
  'endswith(t)': False,
  'endswith(he)': True,
  'endswith(n)': False,
  'endswith(a)': False,
  'endswith(of)': False,
  'endswith(the)': True,
  'endswith(y)': False,
  'endswith(r)': False,
  'endswith(to)': False,
  'endswith(in)': False,
  'endswith(f)': False,
  'endswith(o)': False,
  'endswith(ed)': False,
  'endswith(nd)': False,
  'endswith(is)': False,
  'endswith(on)': False,
  'endswith(l)': False,
  'endswith(g)': False,
  'endswith(and)': False,
  'endswith(ng)': False,
  'endswith(er)': False,
  'endswith(as)': False,
  'endswith(ing)': False,
  'endswith(h)': False,
  'endswith(at)': False,
  'endswith(es)': False,
  'endswith(or)': False,
  'endswith(re)': False,
  'endswith(it)': False,
  'endswith(``)': False,
  'endswith(an)': False,
  "endswith('')": False,
  'endswith(m)': False,
  'endswith(;)': False,
  'endswith(i)': False,
  'endswith(ly)': 

In [23]:
# Let's see the accuracy
nltk.classify.accuracy(classifier, test_set)

0.6922923918448534

To improve the classifier, we can add contextual features:

Instead of working with tagged words, let's work with tagged sentences:
```py
tagged_sents = brown.tagged_sents(categories='news')
```

Now our input looks like:

tagged_sents[0] -> [('The', 'AT'), 
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'), ... ]

We can then improve this further by adding more features such as `prev-tag` etc.

In [24]:
""" Let's try incorporating some new features into a classifier
Make sure to use nltk's DecisionTree, so we can visualize the decision tree
"""

# Use the following imports:
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC

def pos_features(sentence, i):
    """ Code here """
    features = {}
    # For each word, let's add four types of features:
    # 1. The last character of the word 
    # 2. The last two characters of the word 
    # 3. The last three characters of the word 
    features = {"suffix(1)":sentence[i][-1:],
    "suffix(2)":sentence[i][-2:],
    "suffix(3)":sentence[i][-3:],
    }
    # 4. The current's word's previous word (in the sentence)
    

    return features

In [25]:
ex_sent = ['hello', 'my','name']
pos_features(ex_sent,0)

{'suffix(1)': 'o', 'suffix(2)': 'lo', 'suffix(3)': 'llo'}

In [26]:
def pos_features(sentence, i):
    """ Code here """
    features = {}
    # For each word, let's add four types of features:
    # 1. The last character of the word 
    
    # 2. The last two characters of the word
    
    # 3. The last three characters of the word
    features = {"suffix(1)":sentence[i][-1:],
    "suffix(2)":sentence[i][-2:],
    "suffix(3)":sentence[i][-3:],
    }
    # 4. The current's word's previous word (in the sentence)

    if i == 0:
      features['prev-word'] = '<START>'
    else:
      features['prev-word'] = sentence[i-1]

    if i == len(sentence)-1:
      features["next-word"] = '<END>'
    else:
      features['next-word'] = sentence[i+1]

    if i == 0 or i==1:
      features["prev2-word"] = '<START>'
    else:
      features['prev2-word'] = sentence[i-2]

    return features

print(ex_sent)
print(pos_features(ex_sent, 2))

['hello', 'my', 'name']
{'suffix(1)': 'e', 'suffix(2)': 'me', 'suffix(3)': 'ame', 'prev-word': 'my', 'next-word': '<END>', 'prev2-word': 'hello'}


In [27]:
""" Let's train using the new features  """
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
#from sklearn.tree import DecisionTreeClassifier

""" This time, we get tagged_sents instead of tagged words """
tagged_sents = brown.tagged_sents(categories='news')

""" Next, make featuresets with our new pos_features function"""
featuresets = []
for tagged_sent in tagged_sents:
  untagged_sent = nltk.tag.untag(tagged_sent)
  for i, (word, tag) in enumerate(tagged_sent):
    featuresets.append((pos_features(untagged_sent,i), tag))

""" Define a cutoff, and make the train and test sets """
cutoff = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[cutoff:], featuresets[:cutoff]


""" First use NLTK so we can see decision tree """
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7522625559423173

In [28]:
classifier.classify(pos_features(['i', 'like', 'cats'], 1))

'VB'

In [29]:
classifier.classify(pos_features(['president', 'Obama'], 0))

'NN'

In [30]:
featuresets[0]

({'suffix(1)': 'e',
  'suffix(2)': 'he',
  'suffix(3)': 'The',
  'prev-word': '<START>',
  'next-word': 'Fulton',
  'prev2-word': '<START>'},
 'AT')

In [31]:
# We can visualize nltk's decision tree!
print(classifier.pseudocode(depth=10))

if suffix(3) == '!': 
  if prev2-word == '$14': return '.-HL'
  if prev2-word == "''": return '.'
  if prev2-word == 'about': return '.'
  if prev2-word == 'blood': return '.'
  if prev2-word == 'burden': return '.'
  if prev2-word == 'Damascus': return '.'
  if prev2-word == 'de': return '.'
  if prev2-word == 'entertaining': return '.'
  if prev2-word == 'Fall': return '.'
  if prev2-word == 'for': return '.-HL'
  if prev2-word == 'goodness': return '.'
  if prev2-word == 'had': return '.'
  if prev2-word == 'it': return '.'
  if prev2-word == 'Jane': return '.'
  if prev2-word == 'luck': return '.'
  if prev2-word == 'of': return '.'
  if prev2-word == 'parties': return '.'
  if prev2-word == 'plans': return '.'
  if prev2-word == 'precedent': return '.'
  if prev2-word == 'read': return '.'
  if prev2-word == 'sake': return '.'
  if prev2-word == 'sang': return '.'
  if prev2-word == 'Sentry': return '.'
  if prev2-word == 'started': return '.'
  if prev2-word == 'Unconscionable': 

In [32]:
nltk.classify.accuracy(classifier, test_set)

0.7522625559423173

In [33]:
""" Try Sklearn's decision tree classifier - Does it perform better? """
sklearn_classifier = SklearnClassifier(DecisionTreeClassifier(), sparse=True).train(train_set)
nltk.classify.accuracy(sklearn_classifier, test_set)

0.8194927896568871

In [34]:
# Does it correctly tag "cat" now?
print(classifier.classify(pos_features(['there', 'was', 'a','cat'], 3)))
print(sklearn_classifier.classify(pos_features(['there', 'was', 'a','cat'], 3)))

.
JJ


In [35]:
print(classifier.classify(pos_features(['there', 'are', 'many','cats'], 3)))
print(sklearn_classifier.classify(pos_features(['there', 'are', 'many','cats'], 3)))

NNS
NNS-HL


In [36]:
print(classifier.classify(pos_features(['cat'], 0)))
print(sklearn_classifier.classify(pos_features(['cat'], 0)))

.
NP-HL


In [37]:
# Bonus example: Can you Incorporate the tag of the previous word as a feature too?
# Extra example: Incorporate history for sequential tagging


In [38]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

# Can you fill the following function?
def pos_features(sentence, i, history):
    #history=[feature1, featreu2, ...]
     # Reuse the features we used before
     features = features = {"suffix(1)":sentence[i][-1:],
    "suffix(2)":sentence[i][-2:],
    "suffix(3)":sentence[i][-3:],
    }


     # Here, we add additional features
     if i == 0: # This is the start of history
         features["prev-word"] = "<START>"
         features["prev-tag"] = "<START>"
     else: # What should go here?
         features["prev-word"] = sentence[i-1]
         features["prev-tag"] = history[i-1]

     return features

class ConsecutivePosTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)

            # We now keep track of a history
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = SklearnClassifier(DecisionTreeClassifier(), sparse=True).train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)


""" Define a cutoff, and make the train and test sets """
cutoff = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[cutoff:], featuresets[:cutoff]


""" Train & eval """
tagger = ConsecutivePosTagger(train_set)
print(tagger.evaluate(test_set))

ValueError: too many values to unpack (expected 2)

In [None]:
# Let's test it
list(tagger.tag('there was a cat'.split(' ')))

# 3.4 **Parts of Speech and Meaning (English Only):** *Exploring awesome features offered by WordNet*

In [39]:
t = "Cyprus, officially the Republic of Cyprus, is an island country in the Eastern Mediterranean and the third largest and third most populous island in the Mediterranean. Cyprus is located south of Turkey, west of Syria and Lebanon, northwest of Israel, north of Egypt, and southeast of Greece. Cyprus is a major tourist destination in the Mediterranean. With an advanced, high-income economy and a very high Human Development Index, the Republic of Cyprus has been a member of the Commonwealth since 1961 and was a founding member of the Non-Aligned Movement until it joined the European Union on 1 May 2004. On 1 January 2008, the Republic of Cyprus joined the eurozone."

## 1. Tokenization

In [40]:
from nltk import sent_tokenize, word_tokenize
sentences = sent_tokenize(t.lower())
sentences

['cyprus, officially the republic of cyprus, is an island country in the eastern mediterranean and the third largest and third most populous island in the mediterranean.',
 'cyprus is located south of turkey, west of syria and lebanon, northwest of israel, north of egypt, and southeast of greece.',
 'cyprus is a major tourist destination in the mediterranean.',
 'with an advanced, high-income economy and a very high human development index, the republic of cyprus has been a member of the commonwealth since 1961 and was a founding member of the non-aligned movement until it joined the european union on 1 may 2004. on 1 january 2008, the republic of cyprus joined the eurozone.']

In [44]:
tokens = word_tokenize(sentences[2])
type(tokens)

list

## 2. POS Tagging

In [42]:
from nltk import pos_tag
tags = pos_tag(tokens)
tags

[('cyprus', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('major', 'JJ'),
 ('tourist', 'NN'),
 ('destination', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mediterranean', 'NN'),
 ('.', '.')]

To access documentation for tags, for example for `NN`:

In [43]:
import nltk.help
nltk.help.upenn_tagset('NN')
nltk.help.upenn_tagset('JJ')
nltk.help.upenn_tagset()

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,1

## What is WordNet?

**WordNet** is the lexical database i.e. dictionary for the English language, specifically designed for natural language processing.

**Synset** is a special kind of a simple interface that is present in NLTK to look up words in WordNet. Synset instances are the groupings of synonymous words that express the same concept. Some of the words have only one Synset and some have several.

In [47]:
# nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/kky8822/nltk_data...


KeyboardInterrupt: 

In [48]:

from nltk.corpus import wordnet

# Understanding Synset
syn = wordnet.synsets('hello')[0]
  
print ("Synset name :  ", syn.name())
  
# Defining the word
print ("\nSynset meaning : ", syn.definition())
  
# list of phrases that use the word in context
print ("\nSynset example : ", syn.examples())

Synset name :   hello.n.01

Synset meaning :  an expression of greeting

Synset example :  ['every morning they exchanged polite hellos']


In [49]:
wordnet.synsets('water')

[Synset('water.n.01'),
 Synset('body_of_water.n.01'),
 Synset('water.n.03'),
 Synset('water_system.n.02'),
 Synset('urine.n.01'),
 Synset('water.n.06'),
 Synset('water.v.01'),
 Synset('water.v.02'),
 Synset('water.v.03'),
 Synset('water.v.04')]

### Hypernyms and Hyponyms –

**Hypernyms**: More abstract terms

**Hyponyms**: More specific terms.

Both come to picture as Synsets are organized in a structure similar to that of an inheritance tree. This tree can be traced all the way up to a root hypernym. Hypernyms provide a way to categorize and group words based on their similarity to each other.

In [50]:
from nltk.corpus import wordnet

# Understanding Hypernyms and Hyponyms
syn = wordnet.synsets('hello')[0]
  
print ("Synset name :  ", syn.name())
  
print ("\nHYPER abstract term :  ", syn.hypernyms())
  
print ("\nHYPO specific term :  ", 
       syn.hypernyms()[0].hyponyms())
  
syn.root_hypernyms()
  
print ("\nSynset root hypernerm :  ", syn.root_hypernyms())

Synset name :   hello.n.01

HYPER abstract term :   [Synset('greeting.n.01')]

HYPO specific term :   [Synset('calling_card.n.02'), Synset('good_afternoon.n.01'), Synset('good_morning.n.01'), Synset('hail.n.03'), Synset('hello.n.01'), Synset('pax.n.01'), Synset('reception.n.01'), Synset('regard.n.03'), Synset('salute.n.02'), Synset('salute.n.03'), Synset('welcome.n.02'), Synset('well-wishing.n.01')]

Synset root hypernerm :   [Synset('entity.n.01')]


In [51]:
wordnet.synsets('happy')[0].root_hypernyms()


[Synset('happy.a.01')]

In [52]:
# POS in Synset
syn = wordnet.synsets('hello')[0]
print ("Syn tag : ", syn.pos())
  
syn = wordnet.synsets('doing')[0]
print ("Syn tag : ", syn.pos())
  
syn = wordnet.synsets('beautiful')[0]
print ("Syn tag : ", syn.pos())
  
syn = wordnet.synsets('quickly')[0]
print ("Syn tag : ", syn.pos())

Syn tag :  n
Syn tag :  v
Syn tag :  a
Syn tag :  r


## 3. Word senses (for homonyms)

WordNet is a lexical database for the English language in the form of a semantic graph. 

WordNet groups English words into sets of synonyms called synsets, provides short definitions and usage examples, and records a number of relations among these synonym sets or their members.

NLTK provides an interface to the WordNet API.

In [53]:
from nltk.corpus import wordnet as wn
wn.synsets('human')

[Synset('homo.n.02'),
 Synset('human.a.01'),
 Synset('human.a.02'),
 Synset('human.a.03')]

In [54]:
wn.synsets('human')[0].definition()

'any living or extinct member of the family Hominidae characterized by superior intelligence, articulate speech, and erect carriage'

In [55]:
wn.synsets('human')[1].definition()

'characteristic of humanity'

In [58]:
human = wn.synsets('Human', pos=wn.NOUN)[0]
human

Synset('homo.n.02')

In [59]:
human.hypernyms() # A hypernym is a word with a broad meaning constituting a category into which words with more specific meanings fall; a superordinate. For example, colour is a hypernym of red.

[Synset('hominid.n.01')]

In [60]:
human.hyponyms()

[Synset('homo_erectus.n.01'),
 Synset('homo_habilis.n.01'),
 Synset('homo_sapiens.n.01'),
 Synset('homo_soloensis.n.01'),
 Synset('neandertal_man.n.01'),
 Synset('rhodesian_man.n.01'),
 Synset('world.n.08')]

In [61]:
bike = wn.synsets('bicycle')[0]
bike

Synset('bicycle.n.01')

In [62]:
girl = wn.synsets('girl')[1]
girl

Synset('female_child.n.01')

In [63]:
bike.wup_similarity(human) # The Wu-Palmer metric (WUP) is a measure of similarity based on distance in the graph. There are many other metrics too.

0.34782608695652173

In [64]:
girl.wup_similarity(human)

0.5217391304347826

In [None]:
synonyms = []
for syn in wn.synsets('water'):
    for lemma in syn.lemmas(): #  A lemma is basically the dictionary form or base form of a word, as opposed to the various inflected forms of a word. 
        synonyms.append(lemma.name())
synonyms

['water',
 'H2O',
 'body_of_water',
 'water',
 'water',
 'water_system',
 'water_supply',
 'water',
 'urine',
 'piss',
 'pee',
 'piddle',
 'weewee',
 'water',
 'water',
 'water',
 'irrigate',
 'water',
 'water',
 'water']

In [None]:
antonyms = []
for syn in wn.synsets("girl"):
    for l in syn.lemmas():
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
antonyms

## 4. Chunking and Entity Recognition

The goal of chunking is to divide a sentence into chunks. Usually each chunk contains a **head** and optionally additionally words and modifiers. Examples of chunks include noun groups and verb groups.

### 4.1. Chunking

In [None]:
from nltk.chunk import RegexpParser

In order to create a chunker, we need to first define a **chunk grammar**, consisting of rules that indicate how sentences should be chunked. 

We can define a simple grammar for a noun phrase (NP) chunker with a single regular-expression rule. This rule says that an NP chunk should be formed whenever the chunker finds an optional determiner (`DT`) followed by any number of adjectives (`JJ`) and then a noun (`NN`).

Note how grammatical structures which are not noun phrases are not chunked, which is totally fine:

In [None]:
grammar = "NP: {<DT>?<JJ>*<NN>}"

In [None]:
chunker = RegexpParser(grammar)
result = chunker.parse(tags)
pprint.pprint(result)

### 4.2. Entity Recognition

The goal of entity recogintion is to detect entities such as Person, Location, Time, etc.

In [None]:
from nltk.chunk import ne_chunk # ne = named entity
pprint.pprint(ne_chunk(tags))

Note `ne_chunk` was unable to detect any entities in our sentence. That is because it is quite limited, being able to recognize only the following entities: 
> FACILITY, GPE (Geo-Political Entity), GSP (Geo-Socio-Political group), LOCATION, ORGANIZATION, PERSON 

# 3.5 **Trying out Spacy**

In [65]:
import spacy
nlp = spacy.load('en_core_web_sm')



OSError: [E053] Could not read config file from /home/kky8822/.local/share/virtualenvs/2022_AI_expert-VdvU_stD/lib/python3.10/site-packages/en_core_web_sm/en_core_web_sm-2.2.0/config.cfg

In [None]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [None]:
print(doc)

In [None]:
print([w for w in doc])
print([w.pos_ for w in doc])

[The, quick, brown, fox, jumped, over, the, lazy, dog, 's, back, .]
['DET', 'ADJ', 'ADJ', 'NOUN', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'PART', 'NOUN', 'PUNCT']


In [None]:
[w for w in doc][4].tag_
#[w for w in "The quick brown fox jumped over the lazy dog's back."]

'VBD'

In [None]:
# a word's type: 
# tags could be found in 'https://spacy.io/api/annotation#pos-tagging'
doc[4].pos_, doc[4].tag_

In [None]:
# for tags explanation spacy.explain method can be used
spacy.explain('VBD')

'verb, past tense'

In [None]:
# iterating over tokens and printing details
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective
brown      ADJ        JJ         adjective
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


## Coarse-grained Part-of-speech Tags
Every token is assigned a POS Tag from the following list:


<table><tr><th>POS</th><th>DESCRIPTION</th><th>EXAMPLES</th></tr>
    
<tr><td>ADJ</td><td>adjective</td><td>*big, old, green, incomprehensible, first*</td></tr>
<tr><td>ADP</td><td>adposition</td><td>*in, to, during*</td></tr>
<tr><td>ADV</td><td>adverb</td><td>*very, tomorrow, down, where, there*</td></tr>
<tr><td>AUX</td><td>auxiliary</td><td>*is, has (done), will (do), should (do)*</td></tr>
<tr><td>CONJ</td><td>conjunction</td><td>*and, or, but*</td></tr>
<tr><td>CCONJ</td><td>coordinating conjunction</td><td>*and, or, but*</td></tr>
<tr><td>DET</td><td>determiner</td><td>*a, an, the*</td></tr>
<tr><td>INTJ</td><td>interjection</td><td>*psst, ouch, bravo, hello*</td></tr>
<tr><td>NOUN</td><td>noun</td><td>*girl, cat, tree, air, beauty*</td></tr>
<tr><td>NUM</td><td>numeral</td><td>*1, 2017, one, seventy-seven, IV, MMXIV*</td></tr>
<tr><td>PART</td><td>particle</td><td>*'s, not,*</td></tr>
<tr><td>PRON</td><td>pronoun</td><td>*I, you, he, she, myself, themselves, somebody*</td></tr>
<tr><td>PROPN</td><td>proper noun</td><td>*Mary, John, London, NATO, HBO*</td></tr>
<tr><td>PUNCT</td><td>punctuation</td><td>*., (, ), ?*</td></tr>
<tr><td>SCONJ</td><td>subordinating conjunction</td><td>*if, while, that*</td></tr>
<tr><td>SYM</td><td>symbol</td><td>*$, %, §, ©, +, −, ×, ÷, =, :), 😝*</td></tr>
<tr><td>VERB</td><td>verb</td><td>*run, runs, running, eat, ate, eating*</td></tr>
<tr><td>X</td><td>other</td><td>*sfpksdpsxmsa*</td></tr>
<tr><td>SPACE</td><td>space</td></tr>

___
## Fine-grained Part-of-speech Tags
Tokens are subsequently given a fine-grained tag as determined by morphology:
<table>
<tr><th>POS</th><th>Description</th><th>Fine-grained Tag</th><th>Description</th><th>Morphology</th></tr>
<tr><td>ADJ</td><td>adjective</td><td>AFX</td><td>affix</td><td>Hyph=yes</td></tr>
<tr><td>ADJ</td><td></td><td>JJ</td><td>adjective</td><td>Degree=pos</td></tr>
<tr><td>ADJ</td><td></td><td>JJR</td><td>adjective, comparative</td><td>Degree=comp</td></tr>
<tr><td>ADJ</td><td></td><td>JJS</td><td>adjective, superlative</td><td>Degree=sup</td></tr>
<tr><td>ADJ</td><td></td><td>PDT</td><td>predeterminer</td><td>AdjType=pdt PronType=prn</td></tr>
<tr><td>ADJ</td><td></td><td>PRP\$</td><td>pronoun, possessive</td><td>PronType=prs Poss=yes</td></tr>
<tr><td>ADJ</td><td></td><td>WDT</td><td>wh-determiner</td><td>PronType=int rel</td></tr>
<tr><td>ADJ</td><td></td><td>WP\$</td><td>wh-pronoun, possessive</td><td>Poss=yes PronType=int rel</td></tr>
<tr><td>ADP</td><td>adposition</td><td>IN</td><td>conjunction, subordinating or preposition</td><td></td></tr>
<tr><td>ADV</td><td>adverb</td><td>EX</td><td>existential there</td><td>AdvType=ex</td></tr>
<tr><td>ADV</td><td></td><td>RB</td><td>adverb</td><td>Degree=pos</td></tr>
<tr><td>ADV</td><td></td><td>RBR</td><td>adverb, comparative</td><td>Degree=comp</td></tr>
<tr><td>ADV</td><td></td><td>RBS</td><td>adverb, superlative</td><td>Degree=sup</td></tr>
<tr><td>ADV</td><td></td><td>WRB</td><td>wh-adverb</td><td>PronType=int rel</td></tr>
<tr><td>CONJ</td><td>conjunction</td><td>CC</td><td>conjunction, coordinating</td><td>ConjType=coor</td></tr>
<tr><td>DET</td><td>determiner</td><td>DT</td><td>determiner</td><td></td></tr>
<tr><td>INTJ</td><td>interjection</td><td>UH</td><td>interjection</td><td></td></tr>
<tr><td>NOUN</td><td>noun</td><td>NN</td><td>noun, singular or mass</td><td>Number=sing</td></tr>
<tr><td>NOUN</td><td></td><td>NNS</td><td>noun, plural</td><td>Number=plur</td></tr>
<tr><td>NOUN</td><td></td><td>WP</td><td>wh-pronoun, personal</td><td>PronType=int rel</td></tr>
<tr><td>NUM</td><td>numeral</td><td>CD</td><td>cardinal number</td><td>NumType=card</td></tr>
<tr><td>PART</td><td>particle</td><td>POS</td><td>possessive ending</td><td>Poss=yes</td></tr>
<tr><td>PART</td><td></td><td>RP</td><td>adverb, particle</td><td></td></tr>
<tr><td>PART</td><td></td><td>TO</td><td>infinitival to</td><td>PartType=inf VerbForm=inf</td></tr>
<tr><td>PRON</td><td>pronoun</td><td>PRP</td><td>pronoun, personal</td><td>PronType=prs</td></tr>
<tr><td>PROPN</td><td>proper noun</td><td>NNP</td><td>noun, proper singular</td><td>NounType=prop Number=sign</td></tr>
<tr><td>PROPN</td><td></td><td>NNPS</td><td>noun, proper plural</td><td>NounType=prop Number=plur</td></tr>
<tr><td>PUNCT</td><td>punctuation</td><td>-LRB-</td><td>left round bracket</td><td>PunctType=brck PunctSide=ini</td></tr>
<tr><td>PUNCT</td><td></td><td>-RRB-</td><td>right round bracket</td><td>PunctType=brck PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>,</td><td>punctuation mark, comma</td><td>PunctType=comm</td></tr>
<tr><td>PUNCT</td><td></td><td>:</td><td>punctuation mark, colon or ellipsis</td><td></td></tr>
<tr><td>PUNCT</td><td></td><td>.</td><td>punctuation mark, sentence closer</td><td>PunctType=peri</td></tr>
<tr><td>PUNCT</td><td></td><td>''</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>""</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>``</td><td>opening quotation mark</td><td>PunctType=quot PunctSide=ini</td></tr>
<tr><td>PUNCT</td><td></td><td>HYPH</td><td>punctuation mark, hyphen</td><td>PunctType=dash</td></tr>
<tr><td>PUNCT</td><td></td><td>LS</td><td>list item marker</td><td>NumType=ord</td></tr>
<tr><td>PUNCT</td><td></td><td>NFP</td><td>superfluous punctuation</td><td></td></tr>
<tr><td>SYM</td><td>symbol</td><td>#</td><td>symbol, number sign</td><td>SymType=numbersign</td></tr>
<tr><td>SYM</td><td></td><td>\$</td><td>symbol, currency</td><td>SymType=currency</td></tr>
<tr><td>SYM</td><td></td><td>SYM</td><td>symbol</td><td></td></tr>
<tr><td>VERB</td><td>verb</td><td>BES</td><td>auxiliary "be"</td><td></td></tr>
<tr><td>VERB</td><td></td><td>HVS</td><td>forms of "have"</td><td></td></tr>
<tr><td>VERB</td><td></td><td>MD</td><td>verb, modal auxiliary</td><td>VerbType=mod</td></tr>
<tr><td>VERB</td><td></td><td>VB</td><td>verb, base form</td><td>VerbForm=inf</td></tr>
<tr><td>VERB</td><td></td><td>VBD</td><td>verb, past tense</td><td>VerbForm=fin Tense=past</td></tr>
<tr><td>VERB</td><td></td><td>VBG</td><td>verb, gerund or present participle</td><td>VerbForm=part Tense=pres Aspect=prog</td></tr>
<tr><td>VERB</td><td></td><td>VBN</td><td>verb, past participle</td><td>VerbForm=part Tense=past Aspect=perf</td></tr>
<tr><td>VERB</td><td></td><td>VBP</td><td>verb, non-3rd person singular present</td><td>VerbForm=fin Tense=pres</td></tr>
<tr><td>VERB</td><td></td><td>VBZ</td><td>verb, 3rd person singular present</td><td>VerbForm=fin Tense=pres Number=sing Person=3</td></tr>
<tr><td>X</td><td>other</td><td>ADD</td><td>email</td><td></td></tr>
<tr><td>X</td><td></td><td>FW</td><td>foreign word</td><td>Foreign=yes</td></tr>
<tr><td>X</td><td></td><td>GW</td><td>additional word in multi-word expression</td><td></td></tr>
<tr><td>X</td><td></td><td>XX</td><td>unknown</td><td></td></tr>
<tr><td>SPACE</td><td>space</td><td>_SP</td><td>space</td><td></td></tr>
<tr><td></td><td></td><td>NIL</td><td>missing tag</td><td></td></tr>
</table>

https://spacy.io/api/annotation#pos-tagging

In [None]:
# comparing different tenses 
doc1 = nlp(u"I read books on NLP.")
token1 = doc1[1]
print(f"{token1.text:{10}} {token1.pos_:{10}} {token1.tag_:{10}} {spacy.explain(token1.tag_)}")

In [None]:
# comparing different tenses 
doc1 = nlp(u"I like to read books on NLP.")
token1 = doc1[3]
print(f"{token1.text:{10}} {token1.pos_:{10}} {token1.tag_:{10}} {spacy.explain(token1.tag_)}")

In [None]:
token1.pos

In [None]:
doc2 = nlp(u"I read a book on NLP.")
token2 = doc2[1]
print(f"{token2.text:{10}} {token2.pos_:{10}} {token2.tag_:{10}} {spacy.explain(token2.tag_)}")

In [None]:
# counting the words according to their type
pos_counts = doc.count_by(spacy.attrs.POS)
pos_counts

In [None]:
# type of a word by its POS
doc.vocab[84].text

In [None]:
# type of a word by its part of speech
doc[2].pos_

In [None]:
# listing the counts and type of each pos
for k,v in sorted(pos_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

In [None]:
# listing the counts and type of each tag
tag_counts = doc.count_by(spacy.attrs.TAG)
for k,v in sorted(tag_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

In [None]:
# listing the counts and type of each dep
dep_counts = doc.count_by(spacy.attrs.DEP)
for k,v in sorted(dep_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

___
## Fine-grained POS Tag Examples
These are some grammatical examples (shown in **bold**) of specific fine-grained tags. We've removed punctuation and rarely used tags:
<table>
<tr><th>POS</th><th>TAG</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>ADJ</td><td>AFX</td><td>affix</td><td>The Flintstones were a **pre**-historic family.</td></tr>
<tr><td>ADJ</td><td>JJ</td><td>adjective</td><td>This is a **good** sentence.</td></tr>
<tr><td>ADJ</td><td>JJR</td><td>adjective, comparative</td><td>This is a **better** sentence.</td></tr>
<tr><td>ADJ</td><td>JJS</td><td>adjective, superlative</td><td>This is the **best** sentence.</td></tr>
<tr><td>ADJ</td><td>PDT</td><td>predeterminer</td><td>Waking up is **half** the battle.</td></tr>
<tr><td>ADJ</td><td>PRP\$</td><td>pronoun, possessive</td><td>**His** arm hurts.</td></tr>
<tr><td>ADJ</td><td>WDT</td><td>wh-determiner</td><td>It's blue, **which** is odd.</td></tr>
<tr><td>ADJ</td><td>WP\$</td><td>wh-pronoun, possessive</td><td>We don't know **whose** it is.</td></tr>
<tr><td>ADP</td><td>IN</td><td>conjunction, subordinating or preposition</td><td>It arrived **in** a box.</td></tr>
<tr><td>ADV</td><td>EX</td><td>existential there</td><td>**There** is cake.</td></tr>
<tr><td>ADV</td><td>RB</td><td>adverb</td><td>He ran **quickly**.</td></tr>
<tr><td>ADV</td><td>RBR</td><td>adverb, comparative</td><td>He ran **quicker**.</td></tr>
<tr><td>ADV</td><td>RBS</td><td>adverb, superlative</td><td>He ran **fastest**.</td></tr>
<tr><td>ADV</td><td>WRB</td><td>wh-adverb</td><td>**When** was that?</td></tr>
<tr><td>CONJ</td><td>CC</td><td>conjunction, coordinating</td><td>The balloon popped **and** everyone jumped.</td></tr>
<tr><td>DET</td><td>DT</td><td>determiner</td><td>**This** is **a** sentence.</td></tr>
<tr><td>INTJ</td><td>UH</td><td>interjection</td><td>**Um**, I don't know.</td></tr>
<tr><td>NOUN</td><td>NN</td><td>noun, singular or mass</td><td>This is a **sentence**.</td></tr>
<tr><td>NOUN</td><td>NNS</td><td>noun, plural</td><td>These are **words**.</td></tr>
<tr><td>NOUN</td><td>WP</td><td>wh-pronoun, personal</td><td>**Who** was that?</td></tr>
<tr><td>NUM</td><td>CD</td><td>cardinal number</td><td>I want **three** things.</td></tr>
<tr><td>PART</td><td>POS</td><td>possessive ending</td><td>Fred**'s** name is short.</td></tr>
<tr><td>PART</td><td>RP</td><td>adverb, particle</td><td>Put it **back**!</td></tr>
<tr><td>PART</td><td>TO</td><td>infinitival to</td><td>I want **to** go.</td></tr>
<tr><td>PRON</td><td>PRP</td><td>pronoun, personal</td><td>**I** want **you** to go.</td></tr>
<tr><td>PROPN</td><td>NNP</td><td>noun, proper singular</td><td>**Kilroy** was here.</td></tr>
<tr><td>PROPN</td><td>NNPS</td><td>noun, proper plural</td><td>The **Flintstones** were a pre-historic family.</td></tr>
<tr><td>VERB</td><td>MD</td><td>verb, modal auxiliary</td><td>This **could** work.</td></tr>
<tr><td>VERB</td><td>VB</td><td>verb, base form</td><td>I want to **go**.</td></tr>
<tr><td>VERB</td><td>VBD</td><td>verb, past tense</td><td>This **was** a sentence.</td></tr>
<tr><td>VERB</td><td>VBG</td><td>verb, gerund or present participle</td><td>I am **going**.</td></tr>
<tr><td>VERB</td><td>VBN</td><td>verb, past participle</td><td>The treasure was **lost**.</td></tr>
<tr><td>VERB</td><td>VBP</td><td>verb, non-3rd person singular present</td><td>I **want** to go.</td></tr>
<tr><td>VERB</td><td>VBZ</td><td>verb, 3rd person singular present</td><td>He **wants** to go.</td></tr>
</table>

### visualizing POS

In [None]:
from spacy import displacy

In [None]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [None]:
# defaulşt syntactic visualization
displacy.render(doc, style='dep', jupyter=True)

In [None]:
# modifying the visual
options = {'distance':100, 'compact':'True','color':'white', 'bg':'#09a3d5', 'font':'Times'}
displacy.render(doc, style='dep', jupyter=True, options=options)

In [None]:
doc3 = nlp(u"This is a sentence. This is another sentence, possibly longer than other.")
spans = list(doc3.sents)

In [None]:
list(doc3.sents)[-1]

### named entity recognition (NER)

In [None]:
def show_ents(doc):
    if doc.ents:
        for entity in doc.ents:
            print(entity.text + " - " + entity.label_ + " - " + spacy.explain(entity.label_) )
    else:
        print("No entity found!")

In [None]:
doc4 = nlp(u"Hi, how are you?")
show_ents(doc4)

In [None]:
doc5 = nlp(u"May I go to Washington, DC next May to see the Washington Monument? or Can I have 500 dollars Microsoft stock?")
show_ents(doc5)

## Entity annotations
`Doc.ents` are token spans with their own set of annotations.
<table>
<tr><td>`ent.text`</td><td>The original entity text</td></tr>
<tr><td>`ent.label`</td><td>The entity type's hash value</td></tr>
<tr><td>`ent.label_`</td><td>The entity type's string description</td></tr>
<tr><td>`ent.start`</td><td>The token span's *start* index position in the Doc</td></tr>
<tr><td>`ent.end`</td><td>The token span's *stop* index position in the Doc</td></tr>
<tr><td>`ent.start_char`</td><td>The entity text's *start* index position in the Doc</td></tr>
<tr><td>`ent.end_char`</td><td>The entity text's *stop* index position in the Doc</td></tr>
</table>

In [None]:
doc6 = nlp(u"Google to build a U.K. facotyr for $6 million.")
show_ents(doc6)
# notice that 'Google' isn't recognized as an 'ORG' entity

## NER Tags
Tags are accessible through the `.label_` property of an entity.
<table>
<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>
<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>
<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>
<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>
<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>
<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>
<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>
<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>
<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>
<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>
<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>
<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>
<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>
<tr><td>`PERCENT`</td><td>Percentage, including "%".</td><td>*Eighty percent*</td></tr>
<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>
<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>
<tr><td>`ORDINAL`</td><td>"first", "second", etc.</td><td>*9th, Ninth*</td></tr>
<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>
</table>

In [None]:
# adding a word to the NER dictionary
from spacy.tokens import Span
# obtaining hash value of the entity label
ORG = doc6.vocab.strings[u"ORG"]
# creating a span for 'Google' in doc5 start=0, end=1
new_ent = Span(doc6, 0 , 1, label=ORG)
# adding entity to the entity list 
#doc6.ents = list(doc6.ents) + [new_ent]
show_ents(doc6)

In [None]:
doc7 = nlp(u"Our company created a brand new vacuum cleaner."
          u"This new vacuum-cleaner is the best in show.")
show_ents(doc7)

In [None]:
from spacy.matcher import PhraseMatcher
# creating matcher object
matcher = PhraseMatcher(nlp.vocab)
# creating phrase list
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
# creating phrase patterns for phrase list
phrase_patterns = [nlp(text) for text in phrase_list]
# adding phrase patterns to matcher
matcher.add('newproduct', None, *phrase_patterns)
# finding the matches
found_matches = matcher(doc7)
found_matches

In [None]:
from spacy.tokens import Span
PROD = doc7.vocab.strings[u"PRODUCT"]
new_ents = [Span(doc7, match[1], match[2], label=PROD) for match in found_matches]
doc7.ents = list(doc7.ents) + new_ents
show_ents(doc7)

In [None]:
doc8 = nlp(u"Originally I paid $29.95 for a toy and now it is marked as 15 dollars.")

In [None]:
money_entities = [ent for ent in doc8.ents if ent.label_ == "MONEY"]
money_entities

### visualizing NER

In [None]:
from spacy import displacy
doc9 = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million."
          u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [None]:
displacy.render(doc9, style = 'ent', jupyter=True)

In [None]:
# displaying the visual sentence by sentence
for sent in doc9.sents:
    displacy.render(sent, style = 'ent', jupyter=True)

In [None]:
# defining option s dictionary to change options of visual
colors = {'ORG':'red'}
options = {'ents':['PRODUCT', 'ORG'], 'colors' : colors}
displacy.render(doc9, style = 'ent', jupyter=True, options=options)

## **Applications and State of the Art**

[HMTL (Hierarchical Multi-Task Learning model) Demo](https://huggingface.co/hmtl/)

[HMTL (Hierarchical Multi-Task Learning model) Code](https://github.com/huggingface/hmtl)

![picture](https://github.com/huggingface/hmtl/raw/master/HMTL_architecture.png)

![picture](https://www.ibm.com/blogs/research/wp-content/uploads/2016/10/alexsmith2.jpg)