## Basic NLP Tasks with NLTK

### Counting vocabulary of words

In [1]:
import nltk
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [3]:
text9

<Text: The Man Who Was Thursday by G . K . Chesterton 1908>

In [4]:
sent9

['THE',
 'suburb',
 'of',
 'Saffron',
 'Park',
 'lay',
 'on',
 'the',
 'sunset',
 'side',
 'of',
 'London',
 ',',
 'as',
 'red',
 'and',
 'ragged',
 'as',
 'a',
 'cloud',
 'of',
 'sunset',
 '.']

In [5]:
len(sent9)

23

In [6]:
len(set(sent9))

19

In [9]:
list(set(sent9))[:10]

['THE', 'and', ',', 'London', 'suburb', 'a', 'as', 'cloud', 'lay', 'red']

In [14]:
list(set(sent7))[2:11]

['board', 'nonexecutive', 'join', 'Vinken', 'a', '29', 'will', ',', 'the']

In [26]:
dist = FreqDist(text7)
len(dist)

12408

In [27]:
len(set(text7))

12408

In [28]:
vocab1 = dist.keys()
list(vocab1)[:10]

['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']

In [29]:
list(dist)[0:10]

[',', 'the', '.', 'of', 'to', 'a', 'in', 'and', '*-1', '0']

In [30]:
dist['years']

115

In [32]:
dist.most_common()[0:10]

[(',', 4885),
 ('the', 4045),
 ('.', 3828),
 ('of', 2319),
 ('to', 2164),
 ('a', 1878),
 ('in', 1572),
 ('and', 1511),
 ('*-1', 1123),
 ('0', 1099)]

In [35]:
freqwords = [w for w in vocab1 if len(w) >5 and dist[w] > 100]

In [36]:
freqwords

['billion',
 'company',
 'president',
 'because',
 'market',
 'million',
 'shares',
 'trading',
 'program']

### Normalization and stemming

In [37]:
input1 = 'list, Listed, Listings, List, lists, Lists' # creating a list of strings with different casings and extensions

In [38]:
words1 = input1.lower().split(' ') # convert them to all lists 

In [39]:
words1

['list,', 'listed,', 'listings,', 'list,', 'lists,', 'lists']

In [40]:
porter = nltk.PorterStemmer()
[porter.stem(t) for t in words1]

['list,', 'listed,', 'listings,', 'list,', 'lists,', 'list']

### Lemmatization

In [41]:
udhr = nltk.corpus.udhr.words('English-Latin1')  # UN declaration of human rights corpus

In [42]:
udhr[:20]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'rights',
 'of']

In [44]:
[porter.stem(t) for t in udhr[:20]] # still stemming, to show difference

['univers',
 'declar',
 'of',
 'human',
 'right',
 'preambl',
 'wherea',
 'recognit',
 'of',
 'the',
 'inher',
 'digniti',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalien',
 'right',
 'of']

In [45]:
WNlemma = nltk.WordNetLemmatizer()
[WNlemma.lemmatize(t) for t in udhr[:20]]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'right',
 'of']

### Tokenization 
The python split doesn't work perfectly sometimes, cause it joins stop words to words as one word, we can use NLTK 
tokenizing function to do this better 

In [46]:
text11 = "Children shouldn't drink a sugary drink before bed."
text11.split(' ')

['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

In [49]:
tokenized = nltk.word_tokenize(text11)
tokenized

['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

we can also tokenize sentences using the sent_tokenize function instead 

In [51]:
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
sentences = nltk.sent_tokenize(text12)
len(sentences)

4

In [53]:
sentences

['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

## Advanced NLP Tasks with NLTK


#### POS Tagging

In [56]:
nltk.help.upenn_tagset("NN") # to check the meaning of a particular tagset

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [57]:
text13 = nltk.word_tokenize(text11)

In [58]:
nltk.pos_tag(text13)

[('Children', 'NNP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('drink', 'VB'),
 ('a', 'DT'),
 ('sugary', 'JJ'),
 ('drink', 'NN'),
 ('before', 'IN'),
 ('bed', 'NN'),
 ('.', '.')]

#### Ambuiguous sentences.  

In [61]:
text14 = nltk.word_tokenize("Visiting uncles can be a nuissance") # creating and tokenizing an ambiguous sentence

In [62]:
nltk.pos_tag(text14)

[('Visiting', 'VBG'),
 ('uncles', 'NNS'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('nuissance', 'NN')]

In [63]:
nltk.help.upenn_tagset("DT")

DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those


In [64]:
nltk.help.upenn_tagset("NNS")

NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...


### Parsing a sentence structure, literally, creating an abstract syntax tree 

In [66]:
# Parsing sentence structure 

text15 = nltk.word_tokenize("Alice loves Bob")
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> 'Alice' | 'Bob' 
V -> 'loves'
""")

parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text15)
for tree in trees:
    print(tree)

(S (NP Alice) (VP (V loves) (NP Bob)))


In [71]:
text16 = nltk.word_tokenize("I saw the man with the telescope")
grammar1 = nltk.data.load('mygrammar.cfg')
grammar1

<Grammar with 13 productions>

In [72]:
parser = nltk.ChartParser(grammar1)
trees = parser.parse_all(text16)

In [73]:
for tree in trees:
    print(tree)

(S
  (NP I)
  (VP
    (VP (V saw) (NP (Det the) (N man)))
    (PP (P with) (NP (Det the) (N telescope)))))
(S
  (NP I)
  (VP
    (V saw)
    (NP (Det the) (N man) (PP (P with) (NP (Det the) (N telescope))))))


In [75]:
from nltk.corpus import treebank
text17 = treebank.parsed_sents('wsj_0001.mrg')[0]
print(text17)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


In [77]:
nltk.help.upenn_tagset("IN")

IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...


### POS Tagging and Parsing Ambiguity 

In [78]:
text18 = nltk.word_tokenize("The old man the boat")
nltk.pos_tag(text18)

[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]

In [79]:
text19 = nltk.word_tokenize("Colorless greenless idease sleep furiously")
nltk.pos_tag(text19)

[('Colorless', 'NNP'),
 ('greenless', 'NN'),
 ('idease', 'NN'),
 ('sleep', 'NN'),
 ('furiously', 'RB')]