In [None]:
import nltk

In [None]:
# Fetch only the resources this notebook uses. A bare nltk.download() opens the
# interactive downloader and waits for user input, which stalls "Run All".
# 'punkt' / 'punkt_tab' hold the tokenizer models used by nltk.word_tokenize;
# which of the two is needed depends on the installed NLTK version.
for resource in ('gutenberg', 'webtext', 'nps_chat', 'brown', 'reuters',
                 'inaugural', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Gutenberg Corpus

- NLTK includes a small selection of texts from the Project Gutenberg electronic text archive (the archive itself holds about 42,000 texts).


In [None]:
from nltk.corpus import gutenberg

In [None]:
# Show the identifiers of the texts bundled in the Gutenberg selection.
gutenberg.fileids()

### Look into Austen's Emma

- Look at the first 10 words, then count the number of word tokens and word types

In [None]:
# Word-level view of the text; the slice prints only the first ten tokens.
emma_words = gutenberg.words('austen-emma.txt')
print(emma_words[:10])

In [None]:
# Tokens are all word occurrences; types are the distinct word forms among them.
n_tokens = len(emma_words)
n_types = len(set(emma_words))
print(n_tokens, n_types)

- You can get it as a long single string

In [None]:
# Same text as one string: print its first ten characters and its total length.
emma_str = gutenberg.raw('austen-emma.txt')
print(emma_str[:10])
print(len(emma_str))

- Or as a list of sentences

In [None]:
# Same text split into sentences: print the first three and the sentence count.
emma_sents = gutenberg.sents('austen-emma.txt')
print(emma_sents[0:3])
print(len(emma_sents))

### Compute statistics for each text

Print the following three statistics:

1. Average number of characters per word
2. Average number of words per sentence
3. Average number of occurrences per vocabulary item (word tokens divided by distinct word types)

In [None]:
for fileid in gutenberg.fileids():
    # Sizes of the text at four granularities: characters, words, sentences
    # and distinct word forms.
    char_total = len(gutenberg.raw(fileid))
    word_total = len(gutenberg.words(fileid))
    sent_total = len(gutenberg.sents(fileid))
    vocab_total = len(set(gutenberg.words(fileid)))

    # characters per word, words per sentence, occurrences per distinct word
    ratios = (
        char_total / word_total,
        word_total / sent_total,
        word_total / vocab_total,
    )
    print(*ratios, fileid)

### Why is the average sentence length in Milton's Paradise Lost so long?

In [None]:
paradise_sents = gutenberg.sents('milton-paradise.txt')

# Number of tokens in each sentence, then the largest of those counts.
sentence_lengths = [len(sent) for sent in paradise_sents]
maxlen = max(sentence_lengths)
print(maxlen)

In [None]:
# Taking max() over (length, sentence) pairs yields the longest sentence
# together with its length.
longest_pair = max((len(sent), sent) for sent in paradise_sents)
maxlen, maxsent = longest_pair
print(maxsent)

In [None]:
# Join the tokens with spaces so the sentence reads as running text.
print(' '.join(maxsent))

# Web Text Corpus

In [None]:
from nltk.corpus import webtext

In [None]:
# Print each file id followed by the first 65 characters of its raw text.
for fileid in webtext.fileids():
    preview = webtext.raw(fileid)[:65]
    print(fileid, preview, '...')

# Chat Corpus

In [None]:
from nltk.corpus import nps_chat

In [None]:
# Load the posts of one chat-session file and print the post at index 123.
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])

# Brown Corpus

- A corpus created in 1961 at Brown University
- An exemplar of the brown corpus
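- The Brown Corpus itself contains about one million word tokens (counted below); the figures of 583 million tokens and 293,181 types belong to a different, much larger corpus collected by Brown et al. (1992), not to the Brown Corpus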
- It has been categorized by genre and is convenient for studying differences between genres.

In [None]:
from nltk.corpus import brown

In [None]:
# List the category (genre) labels used by the corpus.
brown.categories()

In [None]:
# Sentences restricted to the 'news' category.
brown.sents(categories='news')

In [None]:
# Passing every category selects the words of the whole corpus; the cell
# displays the total number of word tokens.
allwords = brown.words(categories=brown.categories())
len(allwords)

In [None]:
# Number of distinct word forms (case-sensitive, since no normalisation is applied).
len(set(allwords))

# Reuters Corpus

- A corpus of Reuters news documents

In [None]:
from nltk.corpus import reuters

In [None]:
# Echoing every file id floods the output, so show how many documents there
# are and only a short sample of their ids.
reuters_ids = reuters.fileids()
print(len(reuters_ids))
reuters_ids[:10]

In [None]:
# Print all category labels defined for the corpus.
print(reuters.categories())

In [None]:
# Sentences of a single document, selected by its file id.
reuters.sents('test/14826')

In [None]:
# Categories assigned to that same document.
reuters.categories('test/14826')

In [None]:
# The reverse lookup: file ids filed under the 'trade' category.
print(reuters.fileids('trade'))

# Inaugural Address Corpus

- Inaugural addresses delivered by U.S. presidents

In [None]:
from nltk.corpus import inaugural

In [None]:
# Print the file ids of the addresses; the ids used below follow a 'year-name.txt' pattern.
print(inaugural.fileids())

In [None]:
# Sentences of one address, selected by file id.
inaugural.sents('2009-Obama.txt')

# There are more corpora available

Refer to http://www.nltk.org/nltk_data/

### Every Corpus in NLTK holds Three Forms of Text

- A long single string (raw contents)
- A sequence of words
- A sequence of sentences (and each sentence is a sequence of words)

In [None]:
# Form 1: the raw contents as one string (first 20 characters shown).
raw = inaugural.raw('2009-Obama.txt')
print(raw[:20])

In [None]:
# Form 2: a sequence of words (first 20 tokens shown).
words = inaugural.words('2009-Obama.txt')
print(words[:20])

In [None]:
# Form 3: a sequence of sentences, each a sequence of words (first 20 sentences shown).
sents = inaugural.sents('2009-Obama.txt')
print(sents[:20])

# Getting Text from The Web

- `urllib` provides an easy way to access the website and get the contents
- However, the web text is usually written in HTML, which contains a lot of tags that specify the layout, the style, images, etc.

In [None]:
from urllib.request import urlopen
url = 'http://www.bbc.com/news/world-us-canada-36439151'
# The context manager closes the HTTP response once the body has been read,
# even if read() raises.
with urlopen(url) as response:
    html = response.read()
# html holds the bytes of the whole page; print only the beginning so the
# output stays readable.
print(html[:1000])

### `BeautifulSoup`

- `BeautifulSoup` is a library that provides functions to extract contents you want to use
- First, we should find the sentences that we want to extract in the HTML
- Then, we try to figure out the HTML tags that surround the target sentences
  - In this case, `<p></p>` tags seem to be a key structure to characterize the body of the contents

In [None]:
from bs4 import BeautifulSoup
# Parse the downloaded page with Python's built-in HTML parser and collect
# every <p> element.
soup = BeautifulSoup(html, 'html.parser')
ptags = soup.find_all('p')
print(ptags)

In [None]:
# Keep the text of paragraphs whose .string is set, then join the pieces
# with single spaces.
paragraph_texts = [p.string for p in ptags if p.string is not None]
doc = ' '.join(paragraph_texts)
print(doc)

- It seems to be good, but some undesired sentences are getting in.
  - `"Share this with Email Facebook Messenger ..."`
  - `"Cuba has a leader who is not a Castro"`, which is an anchor text to another news article
- So we look more closely for a rule that distinguishes them from the target sentences
  - In this case, we can find that these messages are surrounded by `p` tags with special attributes such as `<p aria-hidden="true" class="twite__title">`, `<p aria-hidden="true" class="twite__channel-text">`, and `<p class="top-stories-promo-story__summary ">`
  - Here we try to extract only `p` tags with no class specification

In [None]:
# class_=None restricts the match to <p> elements without a class attribute.
ptags = soup.find_all('p', class_=None)
print(ptags)

In [None]:
# Read each paragraph's .string once, drop the missing ones, and join the
# rest with single spaces.
strings = (p.string for p in ptags)
doc = ' '.join(text for text in strings if text is not None)
print(doc)

### It seems perfect

Now we can analyze the text in any way

In [None]:
# Split the extracted article text into word tokens and show the first ten.
# NOTE: this rebinds `words`, which an earlier cell used for the inaugural corpus.
words = nltk.word_tokenize(doc)
print(words[:10])

In [None]:
# Number of tokens and number of distinct tokens in the article.
print(len(words), len(set(words)))

### Define a function to process other articles in the same way

In [None]:
def extract_bbc_text(url):
    """Download a BBC news page and return its body text as one string.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The text of every class-less ``<p>`` element whose ``.string`` is
        set, joined with single spaces.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # The context manager closes the HTTP response even if read() raises.
    with urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # class_=None keeps only <p> tags without a class attribute, which drops
    # the share widgets and promo links that the page marks up with classes.
    ptags = soup.find_all('p', class_=None)
    # Tag.string is None when a paragraph has more than one child node
    # (e.g. text mixed with links); such paragraphs are skipped.
    return ' '.join([p.string for p in ptags if p.string is not None])

In [None]:
# Apply the same extraction to a second article.
doc2 = extract_bbc_text('http://www.bbc.com/news/election-us-2016-37918303')
print(doc2)

### Don't send so many requests that they could be regarded as an attack on the website

- Insert `sleep()` between requests as a courtesy

In [None]:
from time import sleep
urls = ['http://www.bbc.com/news/world-us-canada-36439151',
        'http://www.bbc.com/news/election-us-2016-37918303',
        'http://www.bbc.com/news/election-us-2016-37468751']

docs = []
for i, url in enumerate(urls):
    # Pause between consecutive requests only; waiting after the final page
    # would delay the notebook without sparing the server anything.
    if i > 0:
        sleep(10)
    docs.append(extract_bbc_text(url))
    # Progress indicator: number of pages fetched so far.
    print(len(docs))

In [None]:
# Sanity check: the first ten tokens of every downloaded article.
print([nltk.word_tokenize(d)[:10] for d in docs])

### You should also save the text to a file to avoid repeated access to the website

In [None]:
# One article per line. The with-block closes the file even if write() raises,
# so the handle is never leaked.
with open('bbc.txt', 'w') as out:
    out.write('\n'.join(docs))

### More information to get to know how to use `BeautifulSoup`

https://www.crummy.com/software/BeautifulSoup/bs4/doc/

# Open a Local File

In [None]:
# Read the saved articles back, one per line (each line keeps its trailing
# newline). The with-block closes the file even if reading fails.
with open('bbc.txt') as f:
    docs = f.readlines()

In [None]:
# Display the lines that were read back from the file.
docs

In [None]:
# Length of each loaded line in characters.
[len(d) for d in docs]

In [None]:
# Tokenize every article and show the token count of each.
# NOTE(review): this rebinds `docs` from strings to token lists, so running the
# cell a second time without reloading the file would pass lists to word_tokenize.
docs = [nltk.word_tokenize(d) for d in docs]
[len(d) for d in docs]