# Analyzing Moby Dick

Use nltk to explore the Herman Melville novel Moby Dick.

In [32]:
import nltk
import pandas as pd
import numpy as np
import operator

with open('moby.txt', 'r') as f:
    moby_raw = f.read()    
moby_tokens = nltk.word_tokenize(moby_raw)
text1 = nltk.Text(moby_tokens)

### Example 1

How many tokens (words and punctuation symbols) are in text1?

In [4]:
def example_one():    
    return len(moby_tokens) 
example_one()

255018

### Example 2

How many unique tokens (unique words and punctuation) does text1 have?

In [5]:
def example_two():    
    return len(set(moby_tokens))
example_two()

20754

### Example 3

After lemmatizing the verbs, how many unique tokens does text1 have?

In [8]:
from nltk.stem import WordNetLemmatizer

def example_three():
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(w,'v') for w in text1]
    return len(set(lemmatized))
example_three()

16899

### Question 1

What is the lexical diversity of the given text input? (i.e. ratio of unique tokens to the total number of tokens)

In [10]:
def answer_one():        
    return example_two()/example_one()
answer_one()

0.08138249064771899

### Question 2

What percentage of tokens is 'whale'or 'Whale'?

In [13]:
def answer_two():        
    return len([w for w in text1 if (w == 'whale' or w =='Whale')])/example_one()
answer_two()

0.004125199005560392

### Question 3

What are the 20 most frequently occurring (unique) tokens in the text? What is their frequency?

In [37]:
def answer_three():    
    return sorted(text1.vocab().items(), key = operator.itemgetter(1),reverse = True)[:20]
answer_three()

[(',', 19204),
 ('the', 13715),
 ('.', 7308),
 ('of', 6513),
 ('and', 6010),
 ('a', 4545),
 ('to', 4515),
 (';', 4173),
 ('in', 3908),
 ('that', 2978),
 ('his', 2459),
 ('it', 2196),
 ('I', 2111),
 ('!', 1767),
 ('is', 1722),
 ('--', 1713),
 ('with', 1659),
 ('he', 1658),
 ('was', 1639),
 ('as', 1620)]

### Question 4

What tokens have a length of greater than 5 and frequency of more than 150?

In [42]:
def answer_four():    
    return set(sorted([k for k, v in text1.vocab().items() if len(k) > 5 and v >150])) 
answer_four()

{'Captain',
 'Pequod',
 'Queequeg',
 'Starbuck',
 'almost',
 'before',
 'himself',
 'little',
 'seemed',
 'should',
 'though',
 'through',
 'whales',
 'without'}

### Question 5

Find the longest word in text1 and that word's length.

In [57]:
def answer_five():
    longest_word = max(moby_tokens, key = lambda x: len(x))
    return longest_word, len(longest_word)
answer_five()

("twelve-o'clock-at-night", 23)

### Question 6

What unique words have a frequency of more than 2000? What is their frequency?

In [60]:
def answer_six():    
    return sorted([(v, k) for (k,v) in text1.vocab().items() if (v > 2000) and k.isalpha()], reverse = True)
answer_six()

[(13715, 'the'),
 (6513, 'of'),
 (6010, 'and'),
 (4545, 'a'),
 (4515, 'to'),
 (3908, 'in'),
 (2978, 'that'),
 (2459, 'his'),
 (2196, 'it'),
 (2111, 'I')]

### Question 7

What is the average number of tokens per sentence?

In [75]:
def answer_seven(): 
    return np.mean([len(nltk.word_tokenize(s)) for s in nltk.sent_tokenize(moby_raw)])
answer_seven()

25.88489646772229

### Question 8

What are the 5 most frequent parts of speech in this text? What is their frequency?

In [82]:
def answer_eight():
    freq_dict = {}
    pos = [v for k, v in nltk.pos_tag(moby_tokens)]
    for p in pos:
        freq_dict[p] = freq_dict.get(p, 0) + 1    
    return sorted(freq_dict.items(), key = operator.itemgetter(1), reverse = True)[:5]
answer_eight()

[('NN', 32730), ('IN', 28658), ('DT', 25870), (',', 19204), ('JJ', 17619)]