In [7]:
import nltk

# Working with Corpora

## Gutenberg Corpus
- Based on 25K free electronic books.
- Available from www.gutenberg.org

In [8]:
nltk.corpus.gutenberg.fileids()
# from nltk.corpus import gutenberg
# gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [9]:
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma)

192427

In [12]:
raw_txt = nltk.corpus.gutenberg.raw('austen-emma.txt')
words_txt = nltk.corpus.gutenberg.words('austen-emma.txt')
sents_txt = nltk.corpus.gutenberg.sents('austen-emma.txt')

In [13]:
print('-----'*5)
print('Raw Text :')
print(raw_txt[:10])
print('-----'*5)
print('Word Text :')
print(words_txt[:10])
print('-----'*5)
print('Sent Text :')
print(sents_txt[:3])
print('-----'*5)

-------------------------
Raw Text :
[Emma by J
-------------------------
Word Text :
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER']
-------------------------
Sent Text :


LookupError: 
**********************************************************************
  Resource 'tokenizers/punkt/PY3/english.pickle' not found.
  Please use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/Users/huleilei/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************

## Web and Chat Text
-    Web Text : Firefox discussion forum, conversations overheard in New York, the movie script of Pirates of the Carribean, personal adver- tisements, and wine reviews
-    IM Chat : Naval Postgraduate School from 10K posts

In [14]:
from nltk.corpus import webtext
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

['i',
 'do',
 "n't",
 'want',
 'hot',
 'pics',
 'of',
 'a',
 'female',
 ',',
 'I',
 'can',
 'look',
 'in',
 'a',
 'mirror',
 '.']

## Brown Corpus
- Million-word electronic corpus of English, created in 1961 at Brown University
- Contains text from 500 sources, and the sources have been categorized by genre, such as news, editorial, and so on

!['Common dictionary literals and operations'](img/pg-43.png)

In [13]:
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [14]:
brown.words(categories='news')

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

## Reuters Corpus
- Contains 10,788 news documents totaling 1.3 million words
- Classified into 90 topics, and grouped into two sets, called “training” and “test”

In [16]:
from nltk.corpus import reuters
reuters.fileids()[:5]

['test/14826', 'test/14828', 'test/14829', 'test/14832', 'test/14833']

In [19]:
reuters.categories()[:10]

['acq',
 'alum',
 'barley',
 'bop',
 'carcass',
 'castor-oil',
 'cocoa',
 'coconut',
 'coconut-oil',
 'coffee']

In [20]:
reuters.words(categories='coffee')

['INDONESIAN', 'COMMODITY', 'EXCHANGE', 'MAY', ...]

## Inaugural Address Corpus
- Collection of 55 texts, one for each presidential ad- dress.

In [23]:
from nltk.corpus import inaugural
inaugural.fileids()[-5:]

['1993-Clinton.txt',
 '1997-Clinton.txt',
 '2001-Bush.txt',
 '2005-Bush.txt',
 '2009-Obama.txt']

!['Common dictionary literals and operations'](img/pg-47-1.png)

!['Common dictionary literals and operations'](img/pg-47-2.png)

!['Common dictionary literals and operations'](img/pg-47-3.png)

# Methods Summary

!['Common dictionary literals and operations'](img/pg-50.png)

## WordList Corpora
- Contain words from english dict. Use in some Spellcheckers

In [25]:
words = nltk.corpus.words.words() 
words[:20]

['A',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'Aani',
 'aardvark',
 'aardwolf',
 'Aaron',
 'Aaronic',
 'Aaronical',
 'Aaronite',
 'Aaronitic',
 'Aaru',
 'Ab',
 'aba',
 'Ababdeh',
 'Ababua',
 'abac']

In [27]:
from nltk.corpus import stopwords
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

# Obama - Words per Sentence

In [53]:
from nltk.corpus import inaugural
inaugural.fileids()[-5:]

['1993-Clinton.txt',
 '1997-Clinton.txt',
 '2001-Bush.txt',
 '2005-Bush.txt',
 '2009-Obama.txt']

In [54]:
'2009-Obama.txt'.split('.')

['2009-Obama', 'txt']

In [55]:
inaugural.words('2009-Obama.txt')

['My', 'fellow', 'citizens', ':', 'I', 'stand', 'here', ...]

In [56]:
inaugural.sents('2009-Obama.txt')

[['My', 'fellow', 'citizens', ':'], ['I', 'stand', 'here', 'today', 'humbled', 'by', 'the', 'task', 'before', 'us', ',', 'grateful', 'for', 'the', 'trust', 'you', 'have', 'bestowed', ',', 'mindful', 'of', 'the', 'sacrifices', 'borne', 'by', 'our', 'ancestors', '.'], ...]

In [57]:
inaugural.sents('2009-Obama.txt')

[['My', 'fellow', 'citizens', ':'], ['I', 'stand', 'here', 'today', 'humbled', 'by', 'the', 'task', 'before', 'us', ',', 'grateful', 'for', 'the', 'trust', 'you', 'have', 'bestowed', ',', 'mindful', 'of', 'the', 'sacrifices', 'borne', 'by', 'our', 'ancestors', '.'], ...]

In [58]:
count = len(inaugural.words('2009-Obama.txt'))/len(inaugural.sents('2009-Obama.txt'))

In [59]:
print('words per sentence: ', count)

words per sentence:  24.339285714285715


# how many presidents form 

In [47]:
from nltk.corpus import inaugural
all_presidents = inaugural.fileids() # 再去掉前后多余的字符

In [49]:
clean_names = []
for name in all_presidents:
    clean_name = name[5:-4]
    clean_names.append(clean_name)
unique_names = set(clean_names)

In [50]:
print('Number of presidents: ', len(unique_names)+1)

Number of presidents:  35


In [52]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True