In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
import nltk

# Make sure the "gutenberg" corpus package is available locally before it is read below.
nltk.download("gutenberg")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\jesst\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [3]:
# Make sure the "stopwords" corpus package is available locally (imported from nltk.corpus below).
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jesst\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import gutenberg, stopwords

In [5]:
# List every text shipped with the Gutenberg corpus, then pull the two
# novels used in this analysis as raw strings.
print(gutenberg.fileids())

persuasion, alice = (
    gutenberg.raw(file_id)
    for file_id in ("austen-persuasion.txt", "carroll-alice.txt")
)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [6]:
# Peek at the first 100 characters of the text before any cleaning.
print("\nRaw:\n", alice[:100])


Raw:
 [Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


In [7]:
# Remove bracketed spans such as the "[Title by Author Year]" header line.
# The pattern is a raw string so that "\[" and "\]" reach the regex engine
# verbatim instead of being parsed as (invalid) string escape sequences,
# which emit a DeprecationWarning/SyntaxWarning on recent Python versions.
# "." does not match newlines, so only brackets that open and close on the
# same line are removed.
pattern = r"[\[].*?[\]]"
persuasion = re.sub(pattern, "", persuasion)
alice = re.sub(pattern, "", alice)

print("Title removed:\n", alice[:100])

Title removed:
 

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on


In [8]:
# Strip chapter headings. The two books format them differently:
#   - Alice: "CHAPTER" followed by the rest of that line (number and title)
#   - Persuasion: "Chapter" followed by a run of digits
alice = re.compile(r"CHAPTER .*").sub("", alice)
persuasion = re.compile(r"Chapter \d+").sub("", persuasion)

print("Chapter headings removed:\n", alice[:100])


Chapter headings removed:
 



Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothin


In [9]:
# Collapse every run of whitespace (newlines included) into a single space
# and trim both ends, for each book in turn.
persuasion, alice = (" ".join(book.split()) for book in (persuasion, alice))

print("Extra whitespace removed:\n", alice[:100])

Extra whitespace removed:
 Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to


In [10]:
# Show NLTK's English stop-word list for reference.
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
import spacy

In [12]:
# Load the English pipeline by its full package name. The bare "en" shortcut
# link only exists in spaCy 2.x and was removed in spaCy 3, whereas the
# package name loads on both.
# NOTE(review): assumes "en" was linked to en_core_web_sm (the default for
# `python -m spacy download en`) — confirm against the original environment.
nlp = spacy.load("en_core_web_sm")

In [13]:
# Run the spaCy pipeline over each cleaned text (Alice first, then Persuasion).
alice_doc, persuasion_doc = map(nlp, (alice, persuasion))

In [14]:
# Quick look at what the pipeline returned: container type, token count,
# the leading tokens, and the type of a single token.
print(
    "The alice_doc object is a {} object.".format(type(alice_doc)),
    "It is {} tokens long.".format(len(alice_doc)),
    "The first three tokens are '{}'".format(alice_doc[:3]),
    "The type of each token is {}".format(type(alice_doc[0])),
    sep="\n",
)

The alice_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 34430 tokens long.
The first three tokens are 'Alice was beginning'
The type of each token is <class 'spacy.tokens.token.Token'>


In [15]:
from collections import Counter

In [16]:
def word_frequencies(text, include_stop=True):
    """Count how often each token's surface text occurs in ``text``.

    Args:
        text: Iterable of token objects exposing ``is_punct``, ``is_stop``
            and ``text`` attributes (e.g. a parsed spaCy document).
        include_stop: When False, tokens flagged ``is_stop`` are left out.

    Returns:
        A ``Counter`` mapping token text to its number of occurrences.
        Punctuation tokens are never counted.
    """
    return Counter(
        token.text
        for token in text
        # Skip punctuation always; skip stop words only when asked to.
        if not (token.is_punct or (token.is_stop and not include_stop))
    )

# Ten most frequent tokens in each book, stop words included.
alice_freq, persuasion_freq = (
    word_frequencies(doc).most_common(10)
    for doc in (alice_doc, persuasion_doc)
)
print("Alice:", alice_freq)
print("Persuasion:", persuasion_freq)

Alice: [('the', 1524), ('and', 796), ('to', 724), ('a', 611), ('I', 534), ('it', 524), ('she', 508), ('of', 499), ('said', 453), ('Alice', 394)]
Persuasion: [('the', 3120), ('to', 2775), ('and', 2738), ('of', 2563), ('a', 1529), ('in', 1346), ('was', 1329), ('had', 1177), ('her', 1159), ('I', 1121)]


In [17]:
# Same top-10 comparison, this time with stop words filtered out.
alice_freq, persuasion_freq = [
    word_frequencies(doc, include_stop=False).most_common(10)
    for doc in (alice_doc, persuasion_doc)
]
print("Alice:", alice_freq)
print("Persuasion:", persuasion_freq)

Alice: [('I', 534), ('said', 453), ('Alice', 394), ("n't", 215), ("'s", 190), ('little', 124), ('The', 102), ('like', 84), ('went', 83), ('know', 83)]
Persuasion: [('I', 1121), ('Anne', 497), ("'s", 485), ('She', 326), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('Mr', 255), ('He', 225), ('Wentworth', 217)]


In [18]:
# Keep only the words from the (word, count) pairs, then show which of the
# top words appear in one book's list but not the other's.
alice_common = [word for word, _ in alice_freq]
persuasion_common = [word for word, _ in persuasion_freq]

print("Unique to Alice:", set(alice_common) - set(persuasion_common))
print("Unique to Persuasion:", set(persuasion_common) - set(alice_common))

Unique to Alice: {'The', 'went', 'know', 'like', "n't", 'Alice', 'said', 'little'}
Unique to Persuasion: {'He', 'She', 'Elliot', 'Wentworth', 'Anne', 'Captain', 'Mr', 'Mrs'}


In [19]:
# Just the words from Alice's (word, count) pairs, in frequency order.
[word for word, _ in alice_freq]

['I', 'said', 'Alice', "n't", "'s", 'little', 'The', 'like', 'went', 'know']

In [20]:
def lemma_frequencies(text, include_stop=True):
    """Count how often each lemma occurs in ``text``.

    Args:
        text: Iterable of token objects exposing ``is_punct``, ``is_stop``
            and ``lemma_`` attributes (e.g. a parsed spaCy document).
        include_stop: When False, tokens flagged ``is_stop`` are left out.

    Returns:
        A ``Counter`` mapping each lemma string to its number of
        occurrences. Punctuation tokens are never counted.
    """
    counts = Counter()
    for token in text:
        if token.is_punct:
            continue
        if token.is_stop and not include_stop:
            continue
        counts[token.lemma_] += 1
    return counts

In [21]:
# Ten most frequent lemmas per book, with stop words filtered out.
alice_lemma_freq, persuasion_lemma_freq = (
    lemma_frequencies(doc, include_stop=False).most_common(10)
    for doc in (alice_doc, persuasion_doc)
)

print("\nAlice:", alice_lemma_freq)
print("Persuasion:", persuasion_lemma_freq)




Alice: [('-PRON-', 758), ('say', 476), ('alice', 396), ('be', 254), ('not', 231), ('go', 133), ('think', 131), ('little', 126), ('the', 109), ('look', 105)]
Persuasion: [('-PRON-', 2241), ('anne', 497), ("'s", 466), ('captain', 303), ('elliot', 295), ('mrs', 291), ('good', 289), ('know', 258), ('think', 256), ('mr', 255)]


In [22]:
# Keep only the lemmas from the (lemma, count) pairs and compare the two
# top-10 lists against each other.
alice_lemma_common = [lemma for lemma, _ in alice_lemma_freq]
persuasion_lemma_common = [lemma for lemma, _ in persuasion_lemma_freq]
print("Unique to Alice:", set(alice_lemma_common) - set(persuasion_lemma_common))
print("Unique to Persuasion:", set(persuasion_lemma_common) - set(alice_lemma_common))

Unique to Alice: {'be', 'look', 'go', 'not', 'alice', 'little', 'the', 'say'}
Unique to Persuasion: {'know', "'s", 'mr', 'anne', 'captain', 'mrs', 'good', 'elliot'}


In [23]:
# Materialise the sentence spans so they can be counted and indexed.
sentences = [sentence for sentence in alice_doc.sents]
print("Alice in Wonderland has {} sentences.".format(len(sentences)))

# Index 2 picks the third sentence as a worked example for the cells below.
example_sentence = sentences[2]
print("Here is an example: \n{}\n".format(example_sentence))

Alice in Wonderland has 1678 sentences.
Here is an example: 
There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!



In [24]:
# The same surface word is tagged differently depending on context:
# "want" at index 1 of the first sentence, "wants" at index 3 of the second.
print(
    nlp("I want a hippopotamus")[1].pos_,
    nlp("I have several wants")[3].pos_,
    sep="\n",
)

VERB
NOUN


In [None]:
# Length of the example sentence (presumably its token count, as len(alice_doc) is reported above).
# NOTE(review): this cell has no execution count and its saved output is text rather than an
# integer, so the output looks stale — re-run to confirm.
len(example_sentence)

There was nothing so VERY remarkable in that; nor

In [25]:
# Print each of the first nine tokens of the example sentence next to its
# part-of-speech tag (orth_ is the token's text, pos_ its POS label).
print("\nParts of Speech:")
for token in example_sentence[:9]:
    print(token.orth_, token.pos_)


Parts of Speech:
There ADV
was VERB
nothing NOUN
so ADV
VERY ADV
remarkable ADJ
in ADP
that DET
; PUNCT


In [26]:
# For the same nine tokens, print: token text, its dependency label (dep_),
# and the text of the token it attaches to (head). The root's head is itself.
print("\nDependencies:")
for token in example_sentence[:9]:
    print(token.orth_, token.dep_, token.head.orth_)


Dependencies:
There expl was
was ROOT was
nothing attr was
so advmod remarkable
VERY advmod remarkable
remarkable amod nothing
in prep nothing
that pobj in
; punct was


In [27]:
# Show the first ten named entities: label, then the entity's tokens
# joined with single spaces.
entities = list(alice_doc.ents)[:10]
for entity in entities:
    print(entity.label_, " ".join(tok.orth_ for tok in entity))

PERSON Alice
DATE the hot day
PERSON Alice
PRODUCT Rabbit
PRODUCT Rabbit
PRODUCT WAISTCOAT - POCKET
PERSON Alice
PERSON Alice
PERSON Alice
ORDINAL First


In [28]:
# Collect the text of every entity labelled PERSON, then print the distinct
# values (set display order is arbitrary).
people = [
    ent.text
    for ent in alice_doc.ents
    if ent.label_ == "PERSON"
]
print(set(people))

{"the Mock Turtle: '", 'The White Rabbit', 'Gryphon', 'Begin', 'M--', 'Prizes', 'Fish-Footman', 'indeed:--', 'Soup', 'Mary Ann', 'Canary', 'Shakespeare', 'Run', 'Shall', 'Beau', 'the Mock Turtle', 'INSIDE', 'Edgar Atheling', 'Sentence', 'HAD', 'Mabel', 'The Fish-Footman', 'Turn', 'Hush', 'Mock Turtle', 'Elsie', "Don't", 'a Lobster Quadrille', 'Idiot', 'the March Hare', 'the Lobster Quadrille?', 'the King', 'Frog-Footman', 'Fifteenth', 'Bill', 'Panther', 'Sixteenth', '--or', 'Morcar', 'Serpent', 'The Queen', 'Said', 'Rabbit', 'Beautiful Soup', 'm--', 'Soles', 'FUL SOUP', 'Latitude', 'Seaography', 'Crab', 'Stolen', 'Lacie', 'Adventures', 'Pat', 'Drink', 'Ma', 'Duchess', 'Cheshire Puss', 'YOURS', 'The Mock Turtle', 'Pinch', "Dinah'll", 'Brandy', 'Tut', 'William the Conqueror', 'Tillie', 'Fury', 'William', "the Duchess: '", 'Jack', 'Treacle', 'Stupid', 'Repeat', "the King: '", 'WILLIAM', 'Duck', 'the Queen of Hearts', 'Ou', 'the Duchess', 'Queen', 'Alice', 'Longitude', 'Majesty', 'Kings', 