In [1]:
import nltk
from nltk.book import *
from nltk.stem.porter import *
from nltk.stem import * 

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter


from nltk.tokenize import RegexpTokenizer #regular expression tokenizer
import re  # regular expression module

from nltk import FreqDist

from nltk.corpus import gutenberg as g
from nltk.corpus import brown
from nltk.corpus import nps_chat



*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
nltk.help.upenn_tagset('VBZ')

VBZ: verb, present tense, 3rd person singular
    bases reconstructs marks mixes displeases seals carps weaves snatches
    slumps stretches authorizes smolders pictures emerges stockpiles
    seduces fizzes uses bolsters slaps speaks pleads ...


In [3]:
nltk.help.upenn_tagset('NN.*') # regular expression

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NNPS: noun, proper, plural
    Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists
    Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques
    Apache Apaches Apocrypha ...
NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...


In [4]:
nltk.help.brown_tagset('TO') 

TO: infinitival to
    to t'


In [5]:
nltk.corpus.brown.readme()

'BROWN CORPUS\n\nA Standard Corpus of Present-Day Edited American\nEnglish, for use with Digital Computers.\n\nby W. N. Francis and H. Kucera (1964)\nDepartment of Linguistics, Brown University\nProvidence, Rhode Island, USA\n\nRevised 1971, Revised and Amplified 1979\n\nhttp://www.hit.uib.no/icame/brown/bcm.html\n\nDistributed with the permission of the copyright holder,\nredistribution permitted.\n'

### Tagging with pos_tag

In [6]:
text = "Hello welcome to the world of learning the categorization and POS Tagging with NLTK and Python"
tok_text = word_tokenize(text)
nltk.pos_tag(tok_text)

[('Hello', 'NNP'),
 ('welcome', 'NN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('of', 'IN'),
 ('learning', 'VBG'),
 ('the', 'DT'),
 ('categorization', 'NN'),
 ('and', 'CC'),
 ('POS', 'NNP'),
 ('Tagging', 'NNP'),
 ('with', 'IN'),
 ('NLTK', 'NNP'),
 ('and', 'CC'),
 ('Python', 'NNP')]

In [7]:
# access individual (token,tag)
tg_text = nltk.pos_tag(tok_text)
tg_text



[('Hello', 'NNP'),
 ('welcome', 'NN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('of', 'IN'),
 ('learning', 'VBG'),
 ('the', 'DT'),
 ('categorization', 'NN'),
 ('and', 'CC'),
 ('POS', 'NNP'),
 ('Tagging', 'NNP'),
 ('with', 'IN'),
 ('NLTK', 'NNP'),
 ('and', 'CC'),
 ('Python', 'NNP')]

In [8]:
# Tagged Corpora

tagged_token = nltk.tag.str2tuple('Learn/VB')

print(tagged_token[0])
print(tagged_token[1])

print(tagged_token)

Learn
VB
('Learn', 'VB')


### Corpus reader functions are named based on the type of information they return. 


#### Tagged words method

In [9]:
# Reading a tagged corpora

nltk.corpus.brown.tagged_words(categories='romance')[1:50]

[('neither', 'CC'),
 ('liked', 'VBD'),
 ('nor', 'CC'),
 ('disliked', 'VBD'),
 ('the', 'AT'),
 ('Old', 'JJ-TL'),
 ('Man', 'NN-TL'),
 ('.', '.'),
 ('To', 'IN'),
 ('them', 'PPO'),
 ('he', 'PPS'),
 ('could', 'MD'),
 ('have', 'HV'),
 ('been', 'BEN'),
 ('the', 'AT'),
 ('broken', 'VBN'),
 ('bell', 'NN'),
 ('in', 'IN'),
 ('the', 'AT'),
 ('church', 'NN'),
 ('tower', 'NN'),
 ('which', 'WDT'),
 ('rang', 'VBD'),
 ('before', 'IN'),
 ('and', 'CC'),
 ('after', 'IN'),
 ('Mass', 'NN-TL'),
 (',', ','),
 ('and', 'CC'),
 ('at', 'IN'),
 ('noon', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('at', 'IN'),
 ('six', 'CD'),
 ('each', 'DT'),
 ('evening', 'NN'),
 ('--', '--'),
 ('its', 'PP$'),
 ('tone', 'NN'),
 (',', ','),
 ('repetitive', 'JJ'),
 (',', ','),
 ('monotonous', 'JJ'),
 (',', ','),
 ('never', 'RB'),
 ('breaking', 'VBG'),
 ('the', 'AT'),
 ('boredom', 'NN')]

In [10]:
# Reading a tagged corpora

nltk.corpus.brown.tagged_words(categories='romance',tagset='universal')[1:50]

[('neither', 'CONJ'),
 ('liked', 'VERB'),
 ('nor', 'CONJ'),
 ('disliked', 'VERB'),
 ('the', 'DET'),
 ('Old', 'ADJ'),
 ('Man', 'NOUN'),
 ('.', '.'),
 ('To', 'ADP'),
 ('them', 'PRON'),
 ('he', 'PRON'),
 ('could', 'VERB'),
 ('have', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('broken', 'VERB'),
 ('bell', 'NOUN'),
 ('in', 'ADP'),
 ('the', 'DET'),
 ('church', 'NOUN'),
 ('tower', 'NOUN'),
 ('which', 'DET'),
 ('rang', 'VERB'),
 ('before', 'ADP'),
 ('and', 'CONJ'),
 ('after', 'ADP'),
 ('Mass', 'NOUN'),
 (',', '.'),
 ('and', 'CONJ'),
 ('at', 'ADP'),
 ('noon', 'NOUN'),
 (',', '.'),
 ('and', 'CONJ'),
 ('at', 'ADP'),
 ('six', 'NUM'),
 ('each', 'DET'),
 ('evening', 'NOUN'),
 ('--', '.'),
 ('its', 'DET'),
 ('tone', 'NOUN'),
 (',', '.'),
 ('repetitive', 'ADJ'),
 (',', '.'),
 ('monotonous', 'ADJ'),
 (',', '.'),
 ('never', 'ADV'),
 ('breaking', 'VERB'),
 ('the', 'DET'),
 ('boredom', 'NOUN')]

In [11]:
nltk.corpus.brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [12]:
nltk.corpus.brown.tagged_words(tagset='universal')

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [13]:

brown_tagged = brown.tagged_words(categories='adventure')
brown_tagged[1:10]

[('Morgan', 'NP'),
 ('told', 'VBD'),
 ('himself', 'PPL'),
 ('he', 'PPS'),
 ('would', 'MD'),
 ('forget', 'VB'),
 ('Ann', 'NP'),
 ('Turner', 'NP'),
 ('.', '.')]

In [14]:
brown_tagged = brown.tagged_words(tagset='universal')
brown_tagged[0:10]

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP')]

In [15]:
brown.fileids()

['ca01',
 'ca02',
 'ca03',
 'ca04',
 'ca05',
 'ca06',
 'ca07',
 'ca08',
 'ca09',
 'ca10',
 'ca11',
 'ca12',
 'ca13',
 'ca14',
 'ca15',
 'ca16',
 'ca17',
 'ca18',
 'ca19',
 'ca20',
 'ca21',
 'ca22',
 'ca23',
 'ca24',
 'ca25',
 'ca26',
 'ca27',
 'ca28',
 'ca29',
 'ca30',
 'ca31',
 'ca32',
 'ca33',
 'ca34',
 'ca35',
 'ca36',
 'ca37',
 'ca38',
 'ca39',
 'ca40',
 'ca41',
 'ca42',
 'ca43',
 'ca44',
 'cb01',
 'cb02',
 'cb03',
 'cb04',
 'cb05',
 'cb06',
 'cb07',
 'cb08',
 'cb09',
 'cb10',
 'cb11',
 'cb12',
 'cb13',
 'cb14',
 'cb15',
 'cb16',
 'cb17',
 'cb18',
 'cb19',
 'cb20',
 'cb21',
 'cb22',
 'cb23',
 'cb24',
 'cb25',
 'cb26',
 'cb27',
 'cc01',
 'cc02',
 'cc03',
 'cc04',
 'cc05',
 'cc06',
 'cc07',
 'cc08',
 'cc09',
 'cc10',
 'cc11',
 'cc12',
 'cc13',
 'cc14',
 'cc15',
 'cc16',
 'cc17',
 'cd01',
 'cd02',
 'cd03',
 'cd04',
 'cd05',
 'cd06',
 'cd07',
 'cd08',
 'cd09',
 'cd10',
 'cd11',
 'cd12',
 'cd13',
 'cd14',
 'cd15',
 'cd16',
 'cd17',
 'ce01',
 'ce02',
 'ce03',
 'ce04',
 'ce05',
 'ce06',
 

In [16]:
brown.sents()

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [17]:
brown_tagged = brown.tagged_words(categories='science_fiction', tagset='universal')
print("First 10 word-tag pairs: ", brown_tagged[1:10])

tag_fd = nltk.FreqDist(tag for (word, tag) in brown_tagged)

print("\n Most common tags: ", tag_fd.most_common())
print("  \n Keys: ", tag_fd.keys())

First 10 word-tag pairs:  [('that', 'ADP'), ('he', 'PRON'), ('knew', 'VERB'), ('himself', 'PRON'), ('to', 'PRT'), ('be', 'VERB'), ('self', 'NOUN'), ('he', 'PRON'), ('was', 'VERB')]

 Most common tags:  [('NOUN', 2747), ('VERB', 2579), ('.', 2428), ('DET', 1582), ('ADP', 1451), ('PRON', 934), ('ADJ', 929), ('ADV', 828), ('PRT', 483), ('CONJ', 416), ('NUM', 79), ('X', 14)]
  
 Keys:  dict_keys(['ADV', 'ADP', 'PRON', 'VERB', 'PRT', 'NOUN', 'ADJ', 'DET', '.', 'CONJ', 'NUM', 'X'])


#### Tagged sentences method

In [18]:
brown_sents_tag = brown.tagged_sents('ca05')
brown_sents_tag[0:2]

[[('East', 'JJ-TL'),
  ('Providence', 'NP-TL'),
  ('should', 'MD'),
  ('organize', 'VB'),
  ('its', 'PP$'),
  ('civil', 'JJ'),
  ('defense', 'NN'),
  ('setup', 'NN'),
  ('and', 'CC'),
  ('begin', 'VB'),
  ('by', 'IN'),
  ('appointing', 'VBG'),
  ('a', 'AT'),
  ('full-time', 'JJ'),
  ('director', 'NN'),
  (',', ','),
  ('Raymond', 'NP'),
  ('H.', 'NP'),
  ('Hawksley', 'NP'),
  (',', ','),
  ('the', 'AT'),
  ('present', 'JJ'),
  ('city', 'NN'),
  ('CD', 'NN'),
  ('head', 'NN'),
  (',', ','),
  ('believes', 'VBZ'),
  ('.', '.')],
 [('Mr.', 'NP'),
  ('Hawksley', 'NP'),
  ('said', 'VBD'),
  ('yesterday', 'NR'),
  ('he', 'PPS'),
  ('would', 'MD'),
  ('be', 'BE'),
  ('willing', 'JJ'),
  ('to', 'TO'),
  ('go', 'VB'),
  ('before', 'IN'),
  ('the', 'AT'),
  ('city', 'NN'),
  ('council', 'NN'),
  ('``', '``'),
  ('or', 'CC'),
  ('anyone', 'PN'),
  ('else', 'RB'),
  ('locally', 'RB'),
  ("''", "''"),
  ('to', 'TO'),
  ('outline', 'VB'),
  ('his', 'PP$'),
  ('proposal', 'NN'),
  ('at', 'IN'),
  ('t

In [19]:
chat_tags = nltk.corpus.nps_chat.tagged_words()

chat_tags[100:120]

[('my', 'PRP$'),
 ('cousin', 'NN'),
 ('drew', 'VBD'),
 ('a', 'DT'),
 ('messed', 'VBD'),
 ('up', 'RP'),
 ('pic', 'NN'),
 ('on', 'IN'),
 ('my', 'PRP$'),
 ('cast', 'NN'),
 ('PART', 'VB'),
 ('24', 'CD'),
 ('/', 'CC'),
 ('m', 'NN'),
 ('boo', 'UH'),
 ('.', '.'),
 ('26', 'CD'),
 ('/', 'CC'),
 ('m', 'NN'),
 ('and', 'CC')]

In [20]:
text = nltk.Text(word.lower() for word in brown.words())
type(text)

nltk.text.Text

# Unigrams - Bigrams - Trigrams - Ngrams

In [38]:
text = "Toto, I've a feeling we're not in Kansas anymore." # The Wizard of Oz (1939)

#tokenize first
text_toks = word_tokenize(text)
print(text_toks)

list(nltk.bigrams(text_toks))

['Toto', ',', 'I', "'ve", 'a', 'feeling', 'we', "'re", 'not', 'in', 'Kansas', 'anymore', '.']


[('Toto', ','),
 (',', 'I'),
 ('I', "'ve"),
 ("'ve", 'a'),
 ('a', 'feeling'),
 ('feeling', 'we'),
 ('we', "'re"),
 ("'re", 'not'),
 ('not', 'in'),
 ('in', 'Kansas'),
 ('Kansas', 'anymore'),
 ('anymore', '.')]

In [22]:
list(nltk.trigrams(text_toks))

[('Toto', ',', 'I'),
 (',', 'I', "'ve"),
 ('I', "'ve", 'a'),
 ("'ve", 'a', 'feeling'),
 ('a', 'feeling', 'we'),
 ('feeling', 'we', "'re"),
 ('we', "'re", 'not'),
 ("'re", 'not', 'in'),
 ('not', 'in', 'Kansas'),
 ('in', 'Kansas', 'anymore'),
 ('Kansas', 'anymore', '.')]

In [41]:
list(nltk.ngrams(text_toks, 3))

[('Toto', ',', 'I'),
 (',', 'I', "'ve"),
 ('I', "'ve", 'a'),
 ("'ve", 'a', 'feeling'),
 ('a', 'feeling', 'we'),
 ('feeling', 'we', "'re"),
 ('we', "'re", 'not'),
 ("'re", 'not', 'in'),
 ('not', 'in', 'Kansas'),
 ('in', 'Kansas', 'anymore'),
 ('Kansas', 'anymore', '.')]

## Let’s inspect some tagged text to see what parts-of-speech occur before a noun, with the most frequent ones first. To begin with, we construct a list of bigrams whose members are themselves word-tag pairs, such as (('The', 'DET'), ('Fulton', 'NP')) and (('Fulton', 'NP'), ('County', 'N')).

In [24]:
# Repeat previous example

brown_tagged = brown.tagged_words(categories='news', tagset='universal')
print("First 10 word-tag pairs: ", brown_tagged[0:10])

tag_fd = nltk.FreqDist(tag for (word, tag) in brown_tagged)

print("\n Most common tags: ", tag_fd.most_common(10))
# print("  \n Keys: ", tag_fd.keys())

First 10 word-tag pairs:  [('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP')]

 Most common tags:  [('NOUN', 30654), ('VERB', 14399), ('ADP', 12355), ('.', 11928), ('DET', 11389), ('ADJ', 6706), ('ADV', 3349), ('CONJ', 2717), ('PRON', 2535), ('PRT', 2264)]


In [25]:
word_tag_pairs = nltk.bigrams(brown_tagged)
# print(list(word_tag_pairs)[0:10])
freq = nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] =='NOUN')

list(freq.most_common(10))

[('NOUN', 7959),
 ('DET', 7373),
 ('ADJ', 4761),
 ('ADP', 3781),
 ('.', 2796),
 ('VERB', 1842),
 ('CONJ', 938),
 ('NUM', 894),
 ('ADV', 186),
 ('PRT', 94)]

In [26]:
# getting bigrams out of a gutenberg text

sample = g.fileids()[0]

emma_text = g.raw(sample)
print(type(text))

emma_tokens = word_tokenize(text)

emma_pairs = nltk.trigrams(emma_tokens)
list(emma_pairs)[1:10]

<class 'str'>


[(',', 'I', "'ve"),
 ('I', "'ve", 'a'),
 ("'ve", 'a', 'feeling'),
 ('a', 'feeling', 'we'),
 ('feeling', 'we', "'re"),
 ('we', "'re", 'not'),
 ("'re", 'not', 'in'),
 ('not', 'in', 'Kansas'),
 ('in', 'Kansas', 'anymore')]

In [27]:
# getting bigrams out of a brown text

sample_brown = brown.fileids()[2]

brown_text = brown.raw(sample_brown)

brown_tokens = word_tokenize(brown_text) # you can use split() here too

brown_pairs  = nltk.bigrams(brown_tokens)

list(brown_pairs)[0:10]

[('Several/ap', 'defendants/nns'),
 ('defendants/nns', 'in/in'),
 ('in/in', 'the/at'),
 ('the/at', 'Summerdale/np'),
 ('Summerdale/np', 'police/nn'),
 ('police/nn', 'burglary/nn'),
 ('burglary/nn', 'trial/nn'),
 ('trial/nn', 'made/vbd'),
 ('made/vbd', 'statements/nns'),
 ('statements/nns', 'indicating/vbg')]

In [28]:
brown_text.split()

['Several/ap',
 'defendants/nns',
 'in/in',
 'the/at',
 'Summerdale/np',
 'police/nn',
 'burglary/nn',
 'trial/nn',
 'made/vbd',
 'statements/nns',
 'indicating/vbg',
 'their/pp$',
 'guilt/nn',
 'at/in',
 'the/at',
 'time/nn',
 'of/in',
 'their/pp$',
 'arrest/nn',
 ',/,',
 'Judge/nn-tl',
 'James/np',
 'B./np',
 'Parsons/np',
 'was/bedz',
 'told/vbn',
 'in/in',
 'Criminal/jj-tl',
 'court/nn',
 'yesterday/nr',
 './.',
 'The/at',
 'disclosure/nn',
 'by/in',
 'Charles/np',
 'Bellows/np',
 ',/,',
 'chief/jjs',
 'defense/nn',
 'counsel/nn',
 ',/,',
 'startled/vbd',
 'observers/nns',
 'and/cc',
 'was/bedz',
 'viewed/vbn',
 'as/cs',
 'the/at',
 'prelude/nn',
 'to/in',
 'a/at',
 'quarrel/nn',
 'between/in',
 'the/at',
 'six/cd',
 'attorneys/nns',
 'representing/vbg',
 'the/at',
 'eight/cd',
 'former/ap',
 'policemen/nns',
 'now/rb',
 'on/in',
 'trial/nn',
 './.',
 'Bellows/np',
 'made/vbd',
 'the/at',
 'disclosure/nn',
 'when/wrb',
 'he/pps',
 'asked/vbd',
 'Judge/nn-tl',
 'Parsons/np',
 'to/to

In [29]:
brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [30]:
brown.tagged_words(tagset='universal')[1:10]

[('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP')]

In [31]:
g.fileids()

moby_sample = g.fileids()[13]

moby_text = g.raw(moby_sample)

moby_tokens = word_tokenize(moby_text) # you can use split() here too

moby_tri  = nltk.trigrams(moby_tokens)

list(moby_tri)[0:10]

[('[', 'Moby', 'Dick'),
 ('Moby', 'Dick', 'by'),
 ('Dick', 'by', 'Herman'),
 ('by', 'Herman', 'Melville'),
 ('Herman', 'Melville', '1851'),
 ('Melville', '1851', ']'),
 ('1851', ']', 'ETYMOLOGY'),
 (']', 'ETYMOLOGY', '.'),
 ('ETYMOLOGY', '.', '('),
 ('.', '(', 'Supplied')]

In [32]:
moby_ngrams  = nltk.ngrams(moby_tokens, 1) # Unigram
# list(moby_ngrams)

# Create a Frequency Distribution to view what words are in the text

# Apply a list of tokens to the FreqDist object
moby_dick_freqdist = FreqDist(moby_tokens)

# View the most frequent tokens and corresponding counts in descending order
moby_dick_freqdist.most_common(10)

In [194]:
# moby_dick_freqdist.tabulate()

moby_dick_freqdist.most_common()[0:10]


[(',', 19204),
 ('the', 13715),
 ('.', 7306),
 ('of', 6513),
 ('and', 6010),
 ('a', 4545),
 ('to', 4515),
 (';', 4173),
 ('in', 3908),
 ('that', 2978)]

In [213]:
moby_tags = nltk.pos_tag(moby_tokens)
list(nltk.trigrams(moby_tags))

[(('[', 'JJ'), ('Moby', 'NNP'), ('Dick', 'NNP')),
 (('Moby', 'NNP'), ('Dick', 'NNP'), ('by', 'IN')),
 (('Dick', 'NNP'), ('by', 'IN'), ('Herman', 'NNP')),
 (('by', 'IN'), ('Herman', 'NNP'), ('Melville', 'NNP')),
 (('Herman', 'NNP'), ('Melville', 'NNP'), ('1851', 'CD')),
 (('Melville', 'NNP'), ('1851', 'CD'), (']', 'NNP')),
 (('1851', 'CD'), (']', 'NNP'), ('ETYMOLOGY', 'NNP')),
 ((']', 'NNP'), ('ETYMOLOGY', 'NNP'), ('.', '.')),
 (('ETYMOLOGY', 'NNP'), ('.', '.'), ('(', '(')),
 (('.', '.'), ('(', '('), ('Supplied', 'VBN')),
 (('(', '('), ('Supplied', 'VBN'), ('by', 'IN')),
 (('Supplied', 'VBN'), ('by', 'IN'), ('a', 'DT')),
 (('by', 'IN'), ('a', 'DT'), ('Late', 'JJ')),
 (('a', 'DT'), ('Late', 'JJ'), ('Consumptive', 'NNP')),
 (('Late', 'JJ'), ('Consumptive', 'NNP'), ('Usher', 'NNP')),
 (('Consumptive', 'NNP'), ('Usher', 'NNP'), ('to', 'TO')),
 (('Usher', 'NNP'), ('to', 'TO'), ('a', 'DT')),
 (('to', 'TO'), ('a', 'DT'), ('Grammar', 'NNP')),
 (('a', 'DT'), ('Grammar', 'NNP'), ('School', 'NNP')

In [248]:
# moby_dick_freqdist.keys()

In [160]:
# Normalize the frequency by dividing each word's frequency by the total number
# of words in the corpus

# Obtain a total word count in the corpus
total_word_count = sum(moby_dick_freqdist.values())

# View the top 25 words by normalize frequency distribution
moby_dick_top_25 = moby_dick_freqdist.most_common(25)
print("Word\t\t\tNormalized Frequency")
for word in moby_dick_top_25:
    normalized_frequency = word[1] / total_word_count
    print("{} \t\t\t {:.4}".format(word[0], normalized_frequency))

Word			Normalized Frequency
, 			 0.0753
the 			 0.05378
. 			 0.02865
of 			 0.02554
and 			 0.02357
a 			 0.01782
to 			 0.0177
; 			 0.01636
in 			 0.01532
that 			 0.01168
his 			 0.009642
it 			 0.00861
I 			 0.008285
! 			 0.006928
is 			 0.006752
-- 			 0.006717
with 			 0.006505
he 			 0.006501
was 			 0.006426
as 			 0.006352
'' 			 0.006332
's 			 0.006215
`` 			 0.005709
all 			 0.005662
for 			 0.00554


In [220]:
print(total_word_count)

255038
