### Investigating texts from Project Gutenberg

![](images/gutenberg.png)

In [68]:
%matplotlib inline
import matplotlib.pyplot as plt
import requests

In [69]:
from bs4 import BeautifulSoup

In [None]:
url = "http://www.gutenberg.org/files/15784/15784-0.txt"

In [None]:
response = requests.get(url)

In [None]:
type(response)

In [None]:
response

In [None]:
soup_dos = BeautifulSoup(response.content, "html.parser")

In [None]:
len(soup_dos)

In [None]:
dos_text = soup_dos.get_text()

In [None]:
type(dos_text)

In [None]:
len(dos_text)

In [1]:
dos_text[:100]

NameError: name 'dos_text' is not defined

### Analyzing the Text with NLTK

The Natural Language Toolkit is a popular Python library for text analysis.  We will use it to split the text into individual words(tokens), and create a plot of the frequency distribution of the tokens.  

In [80]:
import nltk
from nltk import word_tokenize
tokens = word_tokenize(dos_text)

In [81]:
tokens[:10]

['Project',
 'Gutenberg',
 '’',
 's',
 'Alice',
 '’',
 's',
 'Adventures',
 'in',
 'Wonderland']

In [82]:
text = nltk.Text(tokens)

In [83]:
text[:10]

['Project',
 'Gutenberg',
 '’',
 's',
 'Alice',
 '’',
 's',
 'Adventures',
 'in',
 'Wonderland']

In [84]:
fdist = nltk.FreqDist(text)

In [85]:
type(fdist)

nltk.probability.FreqDist

In [86]:
fdist.most_common(50)

[(',', 2565),
 ('’', 1769),
 ('the', 1681),
 ('‘', 1116),
 ('.', 901),
 ('and', 863),
 ('to', 794),
 ('a', 671),
 ('of', 603),
 ('I', 543),
 ('it', 531),
 ('she', 509),
 ('said', 456),
 ('!', 451),
 ('in', 411),
 ('Alice', 398),
 ('you', 396),
 ('was', 352),
 ('that', 285),
 ('--', 266),
 (':', 256),
 ('as', 256),
 ('her', 243),
 ('with', 221),
 ('at', 217),
 ('t', 216),
 ('s', 204),
 ('?', 202),
 ('on', 198),
 (';', 194),
 ('all', 189),
 ('had', 177),
 ('be', 162),
 ('for', 160),
 ('not', 151),
 ('this', 150),
 ('or', 148),
 ('but', 137),
 ('they', 131),
 ('very', 127),
 ('little', 125),
 ('so', 124),
 ('is', 120),
 ('The', 119),
 ('out', 112),
 ('he', 103),
 ('about', 101),
 ('down', 99),
 ('up', 97),
 ('one', 96)]

In [None]:
fdist['blood']

In [None]:
fdist = nltk.FreqDist(word.lower() for word in word_tokenize(dos_text))

In [None]:
plt.figure()
fdist.plot(30)

In [None]:
plt.figure()
fdist.plot(30, cumulative=True)

In [None]:
tagged = nltk.pos_tag(text)

In [None]:
tagged[:10]

In [None]:
text.similar("god")

In [None]:
text.common_contexts(["king", "lord"])

In [None]:
text.dispersion_plot(tokens[500:510])

In [None]:
len(text)

In [None]:
sorted(set(text))

### Lexical Richness of Text

In [None]:
len(set(text))/len(text)

In [None]:
text.count("Kings")

In [None]:
100*text.count("Kings")/len(text)

### Stopwords

In [None]:
from nltk.corpus import stopwords

In [None]:
set(stopwords.words('english'))

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
filter_text = [w for w in text if not w in stop_words ]

In [None]:
filter_text[:10]

### Lists Reminders

In [None]:
sent = ['Call', 'me', 'Ishmael', '.']

In [None]:
sent

In [None]:
sent[0]

In [None]:
sent.append('stever')

In [None]:
sent

In [None]:
sent.index('stever')

### `set` and tokenizing

In [None]:
saying = ['After', 'all', 'is', 'said', 'and', 'done', 'more', 'is', 'said',
         'than', 'done']

In [None]:
tokens = set(saying)

In [None]:
tokens

In [None]:
tokens = sorted(tokens)

In [None]:
tokens[-2:]

In [None]:
long_words = [w for w in text if len(w)>15]

In [None]:
long_words

In [None]:
list(nltk.bigrams(['more', 'is', 'said', 'than', 'done']))

In [None]:
text.collocations()

In [None]:
nltk.corpus.gutenberg.fileids()

### Task

1. Scrape and tokenize a text from project Gutenberg.

2. Compare the most frequent occurring words with and without stopwords removed.

3. Compare the top bigrams and trigrams for a second book.

**Further Reading**:  http://www.nltk.org/book/

In [70]:
url = "https://www.gutenberg.org/files/11/11-0.txt"

In [71]:
response = requests.get(url)

In [72]:
type(response)

requests.models.Response

In [73]:
response

<Response [200]>

In [74]:
soup_dos = BeautifulSoup(response.content, "html.parser")

In [75]:
len(soup_dos)

1

In [76]:
dos_text = soup_dos.get_text()

In [77]:
type(dos_text)

str

In [78]:
len(dos_text)

167552

In [79]:
dos_text[:100]

'Project Gutenberg’s Alice’s Adventures in Wonderland, by Lewis Carroll\r\n\r\nThis eBook is for the use '

In [104]:
fdist = nltk.FreqDist(text)

In [105]:
type(fdist)

nltk.probability.FreqDist

In [106]:
fdist.most_common(10)

[(',', 2565),
 ('’', 1769),
 ('the', 1681),
 ('‘', 1116),
 ('.', 901),
 ('and', 863),
 ('to', 794),
 ('a', 671),
 ('of', 603),
 ('I', 543)]

In [101]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [92]:
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [93]:
stop_words = set(stopwords.words('english'))

In [102]:
filter_text = [w for w in text if not w in stop_words ]

In [103]:
filter_text[:10]

TypeError: 'RegexpTokenizer' object is not subscriptable

In [97]:
filter_dist = nltk.FreqDist(filter_text)

In [99]:
type(filter_dist)

nltk.probability.FreqDist

In [100]:
filter_dist.most_common(10)

[(',', 2565),
 ('’', 1769),
 ('‘', 1116),
 ('.', 901),
 ('I', 543),
 ('said', 456),
 ('!', 451),
 ('Alice', 398),
 ('--', 266),
 (':', 256)]