In [1]:
import nltk

In [2]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [3]:
text1  # choosing text 1

<Text: Moby Dick by Herman Melville 1851>

In [4]:
sents()  # displaying the sentences

sent1: Call me Ishmael .
sent2: The family of Dashwood had long been settled in Sussex .
sent3: In the beginning God created the heaven and the earth .
sent4: Fellow - Citizens of the Senate and of the House of Representatives :
sent5: I have a problem with people PMing me to lol JOIN
sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !
sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .
sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .


In [5]:
sent1  # displaying sentence 1 (a list of tokens)

['Call', 'me', 'Ishmael', '.']

# Counting words and vocabulary

In [3]:
text7

<Text: Wall Street Journal>

In [4]:
sent7

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [5]:
len(sent7) # length of the sentence

18

In [9]:
len(text7) # length of the entire text

100676

In [10]:
len(set(text7)) # number of distinct tokens (words, numbers, punctuation) in the whole text, not in one sentence

12408

In [11]:
list(set(text7))[:10] # listing some of the words (a set is unordered, so which ten appear can differ between runs)

['Reliance',
 '27-year',
 'durable',
 'improper',
 'systematic',
 'conditions',
 'privilege',
 'A-D',
 'proteins',
 'autions']

# Frequency of the words

In [6]:
dist = FreqDist(text7)  # frequency distribution: essentially a dictionary whose keys are the words and whose values are how often each word occurs in text7

In [7]:
len(dist)

12408

In [8]:
vocab1 = dist.keys()

In [15]:
list(vocab1)[:10]

['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']

In [16]:
dist['four'] # checking frequency of word four

20

In [17]:
# Long words (more than 5 characters) that occur more than 100 times in the text.
freqwords = [
    token
    for token in vocab1
    if len(token) > 5 and dist[token] > 100
]

In [18]:
freqwords

['billion',
 'company',
 'president',
 'because',
 'market',
 'million',
 'shares',
 'trading',
 'program']

# Normalizing and stemming words

Different forms of the same word

In [19]:
input1 = 'List listed listing lists listings' 

In [20]:
words1 = input1.lower().split(' ')

In [21]:
words1 # it is the normalization

['list', 'listed', 'listing', 'lists', 'listings']

In [22]:
# Stemming reduces each word to its stem (root form); the stem need not be a valid word.

In [23]:
porter = nltk.PorterStemmer()

In [24]:
[porter.stem(t) for t in words1]

['list', 'list', 'list', 'list', 'list']

# Lemmatization

Lemmatization is like stemming, except that the words that come out are actual, meaningful words.

In [25]:
new = nltk.corpus.udhr.words('English-Latin1')  # words of the Universal Declaration of Human Rights (English-Latin1 file of the udhr corpus)

In [26]:
new[:10]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the']

In [27]:
[porter.stem(t) for t in new[:10]]

['univers',
 'declar',
 'of',
 'human',
 'right',
 'preambl',
 'wherea',
 'recognit',
 'of',
 'the']

As we can see, several of these stems (e.g. 'univers', 'declar') are not valid words, so we will use lemmatization instead.
Sometimes we want to normalize words while preserving their actual meaning or context.

In [28]:
wleam = nltk.WordNetLemmatizer()

In [29]:
[wleam.lemmatize(t) for t in new[:20]]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'right',
 'of']

# Tokenization

In [30]:
# Naive approach used so far: split the string on single spaces.
text11 = "Children shouldn't drink a sugary drink before bed."
text11.split(' ')

['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

As you can see, the full stop is not handled correctly: it stays attached to 'bed' when it should be a separate token. Therefore, we will use NLTK's built-in tokenizer.

In [31]:
nltk.word_tokenize(text11)

['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

In [32]:
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"

In [33]:
nltk.sent_tokenize(text12)

['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

# Advanced tasks

## POS (part-of-speech) tagging: the word classes we learn in school
<table>
<tr>
<th>Tag</th>
<th>Meaning</th>
</tr>
<tr>
<td>CC</td>
<td>Conjunction</td>
</tr>
<tr>
<td>CD</td>
<td>Cardinal</td>
</tr>
<tr>
<td>DT</td>
<td>Determiner</td>
</tr>
<tr>
<td>IN</td>
<td>Preposition</td>
</tr>
<tr>
<td>JJ</td>
<td>Adjective</td>
</tr>
<tr>
<td>MD</td>
<td>Modal</td>
</tr>
<tr>
<td>NN</td>
<td>Noun</td>
</tr>
<tr>
<td>POS</td>
<td>Possessive</td>
</tr>
<tr>
<td>PRP</td>
<td>Pronoun</td>
</tr>
<tr>
<td>RB</td>
<td>Adverb</td>
</tr>
<tr>
<td>SYM</td>
<td>Symbol</td>
</tr>
<tr>
<td>VB</td>
<td>Verb</td>
</tr>
</table>

In [34]:
nltk.help.upenn_tagset('MD')

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


In [35]:
text11

"Children shouldn't drink a sugary drink before bed."

In [36]:
text12

'This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!'

In [39]:
# Tokenize text11 first, then tag each token with its part of speech.
words = nltk.word_tokenize(text11)
nltk.pos_tag(words)

[('Children', 'NNP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('drink', 'VB'),
 ('a', 'DT'),
 ('sugary', 'JJ'),
 ('drink', 'NN'),
 ('before', 'IN'),
 ('bed', 'NN'),
 ('.', '.')]

In [41]:
# Ambiguity in POS tagging
text14 = nltk.word_tokenize("Visiting aunts can be a nuisance")
nltk.pos_tag(text14)

[('Visiting', 'VBG'),
 ('aunts', 'NNS'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('nuisance', 'NN')]

This is an ambiguous sentence, as it has two meanings: either the aunts who are visiting are a nuisance, or the act of visiting aunts is a nuisance. The POS tagging above tags 'Visiting' as a verb (VBG). An alternative POS tagging would tag 'Visiting' as an adjective. NLTK returns the verb version because 'visiting' is used more often as a verb than as an adjective.

# Parsing the sentence structure

Making sense of sentences is easy if they follow a well-defined grammatical structure.
1. A sentence is made up of a noun phrase and a verb phrase.
2. The verb phrase is itself made up of a verb and a noun phrase.


In [44]:
text15 = nltk.word_tokenize("Alice loves Bob")
# Toy context-free grammar: a sentence (S) is a noun phrase followed by a verb
# phrase, and a verb phrase is a verb followed by a noun phrase. The quoted
# words are the only terminals this grammar accepts.
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> 'Alice' | 'Bob'
V -> 'loves'
""")

# Build a chart parser for the grammar and print every parse tree it finds for the tokens.
parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text15)
for tree in trees:
    print(tree)

(S (NP Alice) (VP (V loves) (NP Bob)))


In [45]:
text16 = nltk.word_tokenize("I saw the man with a telescope")
# NOTE(review): 'mygrammar.cfg' is not defined anywhere in this notebook; the file must be
# available to nltk.data.load for this cell to run — confirm where it lives.
grammar1 = nltk.data.load('mygrammar.cfg')
grammar1

<Grammar with 13 productions>

In [46]:
# Rebinds `parser` and `trees` (used for the "Alice loves Bob" example) to grammar1 and text16,
# then prints every parse tree found for the sentence.
parser = nltk.ChartParser(grammar1)
trees = parser.parse_all(text16)
for tree in trees:
    print(tree)

(S
  (NP I)
  (VP
    (VP (V saw) (NP (Det the) (N man)))
    (PP (P with) (NP (Det a) (N telescope)))))
(S
  (NP I)
  (VP
    (V saw)
    (NP (Det the) (N man) (PP (P with) (NP (Det a) (N telescope))))))


The parser has returned two trees because the sentence is structurally ambiguous. In the second tree, 'saw' is the verb and the rest, 'the man with a telescope', is a single noun phrase (the man has the telescope). In the first tree, 'saw the man' is a verb phrase and the rest, 'with a telescope', is a prepositional phrase attached to it (the seeing was done with the telescope). In both trees, 'I' is the subject noun phrase.

In [47]:
from nltk.corpus import treebank
# Take the first parsed sentence from the treebank file 'wsj_0001.mrg' and print its parse tree
# (the tree printed below is the same "Pierre Vinken ..." sentence as sent7).
text17 = treebank.parsed_sents('wsj_0001.mrg')[0]
print(text17)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))
