# 03 - Processing Raw Text

Resource: https://www.nltk.org/book/ch03.html

In [2]:
import nltk
from nltk import word_tokenize

#### Table of Content
1. Accessing Text from the Web and from Disk

2. Strings: Text Processing at the Lowest Level

3. Text Processing with Unicode

4. Regular Expressions for Detecting Word Patterns

5. Useful Applications of Reguler Expressions

6. Normalizing Text

7. Reguler Expressiong for Tokenizing Text

8. Segmentation

9. Formatting: From Lists to Strings

In [3]:
from urllib import request
url = "https://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
print(raw[:75])

The Project Gutenberg eBook of Crime and Punishment, by Fyodor Dostoevsky


In [4]:
tokens = word_tokenize(raw)
len(tokens)

257059

In [5]:
text = nltk.Text(tokens)
text[1052:1062]

['came', 'to', 'him', 'from', 'nature', ',', 'this', 'he', 'won', 'for']

In [6]:
text.collocations()

Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; young man; Nikodim Fomitch; Project Gutenberg; Ilya
Petrovitch; Andrey Semyonovitch; Hay Market; Dmitri Prokofitch; Good
heavens


In [7]:
raw.find("PART I")

5575

In [8]:
raw.rfind("End of Project Gutenberg's Crime")

-1

In [9]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = request.urlopen(url).read().decode('utf8')
print(html[:60])

<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN


In [49]:
from bs4 import BeautifulSoup
raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)
tokens[:100:10]

['BBC',
 '200',
 'SEARCH',
 'Americas',
 'Science/Nature',
 '--',
 '-',
 '--',
 'SERVICES',
 '--']

In [11]:
tokens = tokens[110:390]
text = nltk.Text(tokens)
text.concordance('gene')

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin


In [12]:
import feedparser
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']

'Language Log'

In [13]:
len(llog.entries)

13

In [14]:
post = llog.entries[2]
print(post.title[6:-6])

tion rec


In [15]:
content = post.content[0].value
content[:70]

'<p><br />\n<span id="more-67158"></span></p>\n<p>Links for those interes'

In [16]:
raw = BeautifulSoup(content, 'html.parser').get_text()
word_tokenize(raw)

['Links',
 'for',
 'those',
 'interested',
 'in',
 'becoming',
 'collectors',
 'of',
 'LLOG',
 'posts',
 'related',
 'to',
 'recursion…']

In [51]:
from nltk.corpus import stopwords

path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
raw = open(path, 'r').read()
tokens = word_tokenize(raw)
words = [word.lower() for word in tokens if word.isalpha()]
types = [type for type in words if type not in stopwords.words('english')]
types[:100:10]

# vocab = sorted(set(words))
# vocab
# 
# len_alpha_type = len(set(word.lower() for word in text1 if word.isalpha()))

['moby',
 'school',
 'dusting',
 'known',
 'mortality',
 'ignorance',
 'hackluyt',
 'arched',
 'wallow',
 'wal']

In [18]:
monty = "Monty Python"
grail = "Holy Grail"
print(monty + 'and' + grail, end='\n')

Monty PythonandHoly Grail


In [19]:
sent = "colorless green ideas sleep furiously"
for char in sent:
  print(char, end=' ')

c o l o r l e s s   g r e e n   i d e a s   s l e e p   f u r i o u s l y 

#### Useful String Methods
|Method|Functionality|
|:-----|:-----------:|
|s.find(t)|index of first instance of string t inside s (-1 if not found)|
|s.rfind(t)|index of last instance of string t inside s (-1 if not found)|
|s.index(t)|like s.find(t) except it raises ValueError if not found|
|s.rindex(t)|like s.rfind(t) except it raises ValueError if not found|
|s.join(text)|combine the words of the text into a string using s as the glue|
|s.split(t)|split s into a list wherever a t is found (whitespace by default)|
|s.splitlines()|split s into a list of strings, one per line|
|s.lower()|a lowercased version of the string s|
|s.upper()|an uppercased version of the string s|
|s.title()|a titlecased version of the string s|
|s.strip()|a copy of s without leading or trailing whitespace|
|s.replace(t, u)|replace instances of t with u inside s|

In [20]:
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')
f = open(path, encoding='latin2')
for line in f:
  line = line.strip()
  print(line)

Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą
"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez
Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały
odnalezione po 1945 r. na terytorium Polski. Trafiły do Biblioteki
Jagiellońskiej w Krakowie, obejmują ponad 500 tys. zabytkowych
archiwaliów, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.


In [21]:
f = open(path, encoding='latin2')
for line in f:
  line = line.strip()
  print(line.encode('unicode_escape'))

b'Pruska Biblioteka Pa\\u0144stwowa. Jej dawne zbiory znane pod nazw\\u0105'
b'"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez'
b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y'
b'odnalezione po 1945 r. na terytorium Polski. Trafi\\u0142y do Biblioteki'
b'Jagiello\\u0144skiej w Krakowie, obejmuj\\u0105 ponad 500 tys. zabytkowych'
b'archiwali\\xf3w, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.'


In [22]:
print("Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą")

Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą


In [23]:
ord('ń')

324

In [24]:
nacute = '\u0144'
nacute

'ń'

In [25]:
import re
word_list = [w for w in nltk.corpus.words.words('en') if w.islower()]
len(word_list)

210687

In [52]:
[w for w in word_list if re.search('ed$', w)][:100:10]

['abaissed',
 'absorbed',
 'accredited',
 'acquainted',
 'addlepated',
 'adreamed',
 'affined',
 'agazed',
 'aiguilletted',
 'alleyed']

In [27]:
[w for w in word_list if re.search('^..j..t..$', w)]

['abjectly',
 'adjuster',
 'dejected',
 'dejectly',
 'injector',
 'majestic',
 'objectee',
 'objector',
 'rejecter',
 'rejector',
 'unjilted',
 'unjolted',
 'unjustly']

In [28]:
[w for w in word_list if re.search('^[ghi][mno][jlk][def]$', w)]

['gold', 'golf', 'hold', 'hole']

In [29]:
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
[w for w in chat_words if re.search('^m+i+n+e+$', w)]

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee',
 'miiiiiinnnnnnnnnneeeeeeee',
 'mine',
 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']

In [30]:
[w for w in chat_words if re.search('^[ha]+$', w)]

['a',
 'aaaaaaaaaaaaaaaaa',
 'aaahhhh',
 'ah',
 'ahah',
 'ahahah',
 'ahh',
 'ahhahahaha',
 'ahhh',
 'ahhhh',
 'ahhhhhh',
 'ahhhhhhhhhhhhhh',
 'h',
 'ha',
 'haaa',
 'hah',
 'haha',
 'hahaaa',
 'hahah',
 'hahaha',
 'hahahaa',
 'hahahah',
 'hahahaha',
 'hahahahaaa',
 'hahahahahaha',
 'hahahahahahaha',
 'hahahahahahahahahahahahahahahaha',
 'hahahhahah',
 'hahhahahaha']

#### Basic Regular Expression Meta-Characters
|Operator|Behavior|
|:-------|:------:|
|.|Wildcard, matches any character|
|^abc|Matches some pattern abc at the start of a string|
|abc$|Matches some pattern abc at the end of a string|
|[abc]|Matches one of a set of characters|
|[A-Z0-9]|Matches one of a range of characters|
|ed\|ing\|s|Matches one of the specified strings (disjunction)|
|*|Zero or more of previous item, e.g. a*, [a-z]* (also known as Kleene Closure)|
|+|One or more of previous item, e.g. a+, [a-z]+|
|?|Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]?|
|{n}|Exactly n repeats where n is a non-negative integer|
|{n,}|At least n repeats|
|{,n}|No more than n repeats|
|{m,n}|At least m and no more than n repeats|
|a(b\|c)+|Parentheses that indicate the scope of the operators|

In [31]:
word = 'supercalifragilisticexpialidocious'
len(re.findall(r'[aeiou]', word))

16

In [32]:
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj
                   for vs in re.findall(r'[aeious]{2,}', word))
fd.most_common(12)

[('es', 588),
 ('io', 483),
 ('ea', 399),
 ('is', 277),
 ('ou', 270),
 ('ai', 251),
 ('ia', 239),
 ('ie', 221),
 ('ee', 193),
 ('se', 193),
 ('oo', 158),
 ('su', 147)]

In [33]:
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj
                      for vs in re.findall(r'[aeiou]{2,}', word))
fd.most_common(15)

[('io', 549),
 ('ea', 476),
 ('ie', 331),
 ('ou', 329),
 ('ai', 261),
 ('ia', 253),
 ('ee', 217),
 ('oo', 174),
 ('ua', 109),
 ('au', 106),
 ('ue', 105),
 ('ui', 95),
 ('ei', 86),
 ('oi', 65),
 ('oa', 59)]

In [34]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [35]:
cv_word_pairs = [(cv, w) for w in rotokas_words
                 for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index = nltk.Index(cv_word_pairs)
cv_index['su']

['kasuari']

In [54]:
cv_index['po'][:50:5]

['kaapo', 'kapo', 'kapokapoa', 'kapokaporo', 'kapooto', 'kaporopa', 'kepo']

In [37]:
from nltk.corpus import gutenberg
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r'<a> (<.*>) <man>')

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave


In [38]:
from nltk.corpus import nps_chat
chat = nltk.Text(nps_chat.words())
chat.findall(r'<.*> <.*> <bro>')

you rule bro; telling you bro; u twizted bro


In [39]:
chat.findall(r"<l.*>{3,}")

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [40]:
from nltk.corpus import brown

hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r'<\w*> <and> <other> <\w*s>')

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


In [41]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government.  Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)

In [55]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

[porter.stem(t) for t in tokens][:100:10]

['[', '(', 'school', 'heart', 'now', 'grammar', 'all', 'the', ';', '``']

In [56]:
[lancaster.stem(t) for t in tokens][:100:10]

['[', '(', 'school', 'heart', 'now', 'gramm', 'al', 'the', ';', '``']

In [57]:
wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens][:100:10]

['[', '(', 'School', 'heart', 'now', 'grammar', 'all', 'the', ';', '``']

In [45]:
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
well without--Maybe it's always pepper that makes people hot-tempered,'..."""

In [None]:
re.split(r'\s+', raw)[:100:10]   # [ \t\n]+ ==> \s (all whitespace characters)

['[Moby', 'Late', 'in', 'He', 'a', 'of', 'to', 'of', 'others,', 'is']

In [47]:
re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw)[:100:10]

['[Moby', 'Supplied', ')', ',', '.', ',', 'the', 'world', 'it', 'you']

#### Regular Expression Symbols

|Symbol|Function|
|:-----|:-------|
|\b|Word boundary (zero width)|
|\d|Any decimal digit (equivalent to [0-9])|
|\D|Any non-digit character (equivalent to [^0-9])|
|\s|Any whitespace character (equivalent to [ \t\n\r\f\v])|
|\S|Any non-whitespace character (equivalent to [^ \t\n\r\f\v])|
|\w|Any alphanumeric character (equivalent to [a-zA-Z0-9_])|
|\W|Any non-alphanumeric character (equivalent to [^a-zA-Z0-9_])|
|\t|The tab character|
|\n|The newline character|

In [62]:
text = 'That U.S.A. poster-print costs $12.40...'
pattern = r'''(?x)     # set flag to allow verbose regexps
    (?:[A-Z]\.)+       # abbreviations, e.g. U.S.A.
  | \w+(?:-\w+)*       # words with optional internal hyphens
  | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
  | \.\.\.             # ellipsis
  | [][.,;"'?():-_`]   # these are separate tokens; includes ], ['''
nltk.regexp_tokenize(text, pattern)

['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']

In [63]:
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])

# Determine the maximum word length for alignment
max_word_length = max(len(word) for word in fdist)

# Print the output in aligned columns
for word in sorted(fdist):
    print(f"{word:<{max_word_length}} ==> {fdist[word]:>3}")

cat   ==>   3
dog   ==>   4
snake ==>   1
