In [24]:
import nltk, re, pprint
from nltk import word_tokenize

## 4.1 Back to the Basics

### Assignment

In [1]:
foo = 'Monty'
bar = foo
foo = 'Python'
bar

'Monty'

In [2]:
foo = ['Monty', 'Python']
bar = foo
foo[1] = 'Bodkin'
bar

['Monty', 'Bodkin']

In [3]:
empty = []
nested = [empty, empty, empty]
nested

[[], [], []]

In [5]:
nested[1].append('Python')
nested

[['Python', 'Python'], ['Python', 'Python'], ['Python', 'Python']]

In [3]:
nested = [[]] * 3
nested[1].append('Python')
nested[1] = ['Monty']
nested

[['Python'], ['Monty'], ['Python']]

### Equality

In [4]:
size = 5
python = ['Python']
snake_nest = [python] * size
snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4]

True

In [5]:
snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4]

True

In [6]:
import random
position = random.choice(range(size))
snake_nest[position] = ['Python']
snake_nest

[['Python'], ['Python'], ['Python'], ['Python'], ['Python']]

In [7]:
snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4]

True

In [8]:
snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4]

False

In [9]:
[id(snake) for snake in snake_nest]

[3044202892, 3044202892, 3044202892, 3061972652, 3044202892]

### Conditionals

In [10]:
mixed = ['cat', '', ['dog'], []]
for element in mixed:
    if element:
        print(element)

cat
['dog']


In [11]:
animals = ['cat', 'dog']
if 'cat' in animals: 
    print(1)
elif 'dog' in animals:
    print(2)
    

1


In [12]:
sent = ['No', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '.']
all(len(w) > 4 for w in sent)

False

In [13]:
any(len(w) > 4 for w in sent)

True

## 4.2 Sequences

In [14]:
t = 'walk', 'fem', 3
t

('walk', 'fem', 3)

In [15]:
t[0]

'walk'

In [16]:
t[1:]

('fem', 3)

In [17]:
len(t)

3

In [18]:
raw = 'I turned off the spectroroute'
text = ['I', 'turned', 'off', 'the', 'spectroroute']
pair = (6, 'turned')
raw[2], text[3], pair[1]

('t', 'the', 'turned')

In [19]:
raw[-3:], text[-3:], pair[-3:]

('ute', ['off', 'the', 'spectroroute'], (6, 'turned'))

In [20]:
len(raw), len(text), len(pair)

(29, 5, 2)

### Operating on Sequence Types

In [25]:
raw = 'Red lorry, yellow lorry, red lorry, yellow lorry'
text = word_tokenize(raw)
fdist = nltk.FreqDist(text)
sorted(fdist)

[',', 'Red', 'lorry', 'red', 'yellow']

In [26]:
for key in fdist: 
    print(key + ':', fdist[key], end='; ')

lorry: 4; Red: 1; ,: 3; red: 1; yellow: 2; 

In [27]:
words = ['I', 'turned', 'off', 'the', 'spectroroute']
words[2], words[3], words[4] = words[3], words[4], words[2]
words

['I', 'turned', 'the', 'spectroroute', 'off']

In [28]:
tmp = words[2]
words[2] = words[3]
words[3] = words[4]
words[4] = tmp

In [29]:
words = ['I', 'turned', 'off', 'the', 'spectroroute']
tags = ['noun', 'verb', 'prep', 'det', 'noun']
zip(words, tags)

<zip at 0xa97cd94c>

In [30]:
list(zip(words, tags))

[('I', 'noun'),
 ('turned', 'verb'),
 ('off', 'prep'),
 ('the', 'det'),
 ('spectroroute', 'noun')]

In [31]:
list(enumerate(words))

[(0, 'I'), (1, 'turned'), (2, 'off'), (3, 'the'), (4, 'spectroroute')]

In [34]:
text = nltk.corpus.nps_chat.words()
cut = int(0.9 * len(text))
training_data, test_data = text[:cut], text[cut:]
text == training_data + test_data

True

In [35]:
len(training_data) / len(test_data)

9.0

### Combining Different Sequence Types

In [40]:
# Decorate each word with its length, sort the (length, word) pairs, then
# strip the lengths again to get the words ordered from shortest to longest.
words = 'I turned off the spectroroute'.split()
wordlens = sorted((len(word), word) for word in words)
' '.join(word for (_, word) in wordlens)

'I off the turned spectroroute'

In [42]:
# Each entry is a tuple holding a word, a tag string and a list of strings.
# Tuples are immutable, but the list that holds them can still be sorted,
# have an element replaced, and have an element deleted.
lexicon = [
    ('the', 'det', ['Di:', 'D@']),
    ('off', 'prep', ['Qf', 'O:f'])
]
lexicon.sort()  # tuples compare element by element, so the 'off' entry now comes first
lexicon[1] = ('turned', 'VBD', ['t3:nd', 't3`nd'])  # replace the second entry (the 'the' tuple)
del lexicon[0]  # drop the first entry; only the 'turned' tuple remains

### Generator Expressions

In [43]:
text = '''"When I use a word," Humpty Dumpty said in rather scornful 
tone. "it means just what I choose it to mean - neither more nor less"'''
[w.lower() for w in word_tokenize(text)]

['``',
 'when',
 'i',
 'use',
 'a',
 'word',
 ',',
 "''",
 'humpty',
 'dumpty',
 'said',
 'in',
 'rather',
 'scornful',
 'tone',
 '.',
 '``',
 'it',
 'means',
 'just',
 'what',
 'i',
 'choose',
 'it',
 'to',
 'mean',
 '-',
 'neither',
 'more',
 'nor',
 'less',
 "''"]

In [44]:
max([w.lower() for w in word_tokenize(text)])

'word'

In [45]:
max(w.lower() for w in word_tokenize(text))

'word'

## 4.3 Questions of Style

### Procedural vs Declarative Style

In [47]:
tokens = nltk.corpus.brown.words(categories='news')
count = 0
total = 0
for token in tokens:
    count += 1
    total += len(token)
total / count
                                 

4.401545438271973

In [48]:
total = sum(len(t) for t in tokens)
print(total / len(tokens))

4.401545438271973


In [50]:
# Build a sorted list of the distinct tokens by hand: for every token, scan
# word_list for its insertion point and insert it unless it is already there.
# The next cell produces the same list with sorted(set(tokens)).
# NOTE: each token triggers a linear scan of word_list, and the recorded run
# of this cell was interrupted (KeyboardInterrupt) before it finished.
word_list = []
i = 0
while i < len(tokens):
    j = 0
    # advance j past every entry that sorts at or before the current token
    while j < len(word_list) and word_list[j] <= tokens[i]:
        j += 1
    # insert only when the entry just before position j is not this same token
    if j == 0 or tokens[i] != word_list[j-1]:
        word_list.insert(j, tokens[i])
    i += 1

KeyboardInterrupt: 

In [None]:
word_list = sorted(set(tokens))

In [51]:
# Walk the Brown corpus vocabulary from the most frequent word downwards,
# printing the running share of all tokens covered so far, and stop as soon
# as that share exceeds 25%.
fd = nltk.FreqDist(nltk.corpus.brown.words())
cumulative = 0.0
most_common_words = [word for (word, count) in fd.most_common()]  # keep the words, drop the counts
for rank, word in enumerate(most_common_words):
    cumulative += fd.freq(word)  # add this word's relative frequency to the running total
    print("%3d %6.2f%% %s" % (rank + 1, cumulative * 100, word))  # rank is 0-based; shown 1-based
    if cumulative > 0.25:
        break

  1   5.40% the
  2  10.42% ,
  3  14.67% .
  4  17.78% of
  5  20.19% and
  6  22.40% to
  7  24.29% a
  8  25.97% in


In [53]:
text = nltk.corpus.gutenberg.words('milton-paradise.txt')
longest = ''
for word in text:
     if len(word) > len(longest):
         longest = word
longest

'unextinguishable'

In [54]:
maxlen = max(len(word) for word in text)
[word for word in text if len(word) == maxlen]

['unextinguishable',
 'transubstantiate',
 'inextinguishable',
 'incomprehensible']

### Some Legitimate Uses for Counters

In [56]:
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
n = 3
[sent[i:i+n] for i in range(len(sent)-n+1)]

[['The', 'dog', 'gave'],
 ['dog', 'gave', 'John'],
 ['gave', 'John', 'the'],
 ['John', 'the', 'newspaper']]

In [57]:
m, n = 3, 7
array = [[set() for i in range(n)] for j in range(m)]
array[2][5].add('Alice')
pprint.pprint(array)

[[set(), set(), set(), set(), set(), set(), set()],
 [set(), set(), set(), set(), set(), set(), set()],
 [set(), set(), set(), set(), set(), {'Alice'}, set()]]


In [59]:
array = [[set()] * n ] * m
array[2][5].add(7)
pprint.pprint(array)

[[{7}, {7}, {7}, {7}, {7}, {7}, {7}],
 [{7}, {7}, {7}, {7}, {7}, {7}, {7}],
 [{7}, {7}, {7}, {7}, {7}, {7}, {7}]]


## 4.4 Functions: The Foundation of Structured Programming

In [60]:
import re
def get_text(file):
    """Read text from a file, normalizing whitespace and stripping HTML markup.

    :param file: path of the file to read (anything accepted by ``open``)
    :return: the file's text with every ``<...>`` tag replaced by a space and
        every run of whitespace collapsed to a single space
    :rtype: str
    """
    # A context manager closes the file handle even if reading fails.
    with open(file) as source:
        text = source.read()
    # Non-greedy match so each tag is replaced separately; '.' does not match
    # a newline, so a tag that spans several lines is left untouched.
    text = re.sub(r'<.*?>', ' ', text)
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    return text

help(get_text)

Help on function get_text in module __main__:

get_text(file)
    Read text from a file, normalizing whitespace and stripping HTML markup.



### Function Inputs and Output

In [61]:
def repeat(msg, num):
    """Return ``msg`` repeated ``num`` times, with a single space between copies."""
    copies = (msg for _ in range(num))
    return ' '.join(copies)
monty = 'Monty Python'
repeat(monty, 3)

'Monty Python Monty Python Monty Python'

In [63]:
def monty():
    """Return the fixed string 'Monty Python'."""
    name = 'Monty Python'
    return name
monty()

'Monty Python'

In [64]:
repeat(monty(), 3)

'Monty Python Monty Python Monty Python'

In [65]:
def my_sort1(mylist): # good: modifies its argument, no return value
    """Sort ``mylist`` in place; nothing is returned (the result is ``None``)."""
    mylist.sort()
    
def my_sort2(mylist): # good: doesn't touch its argument, returns value
    """Return a new sorted list built from ``mylist``; ``mylist`` itself is left unchanged."""
    ordered = list(mylist)
    ordered.sort()
    return ordered

def my_sort3(mylist): # bad: modifies its argument and also returns it
    """Sort ``mylist`` in place and return that same list object."""
    mylist.sort()
    return mylist


### Parameter Passing

In [66]:
def set_up(word, properties):
    """Show how rebinding and mutating parameters affect the caller.

    :param word: any value; only the local name is rebound inside the function
    :param properties: an object with an ``append`` method (a list in the call
        below); the item ``'noun'`` is appended to it
    :return: None
    """
    word = 'lolcat'  # rebinds the local name only; the caller's variable is unchanged
    properties.append('noun')  # mutates the object the caller passed in
    properties = 5  # rebinds the local name; the caller still holds the original object
    
w = ''
p = []
set_up(w, p)
w

''

In [67]:
p

['noun']

### Checking Parameter Types

In [68]:
def tag(word):
    """Return 'det' when ``word`` is 'a', 'the' or 'all', and 'noun' otherwise.

    No type checking is done: any argument that does not equal one of the
    three determiners, including a non-string such as a list, yields 'noun'.
    """
    determiners = ('a', 'the', 'all')
    return 'det' if word in determiners else 'noun'
# Here the function assumes that its argument will always be a string
tag('the')

'det'

In [69]:
tag('knight')

'noun'

In [70]:
tag(["'Tis", 'but', 'a', 'scratch'])

'noun'

In [71]:
def tag(word):
    """Return 'det' for the determiners 'a', 'the' and 'all', and 'noun' otherwise.

    :param word: the word to tag
    :type word: str
    :return: 'det' or 'noun'
    :rtype: str
    :raises AssertionError: if ``word`` is not a string
    """
    # ``basestring`` exists only on Python 2; on Python 3 every text string is
    # a ``str`` and referencing ``basestring`` raises NameError.
    assert isinstance(word, str), "argument to tag() must be a string"
    if word in ['a', 'the', 'all']:
        return 'det'
    else:
        return 'noun'
    

### Functional Decomposition

In [75]:
from urllib import request
from bs4 import BeautifulSoup

def freq_words(url, freqdist, n):
    """Add the page at ``url`` to ``freqdist`` and print its ``n`` most common words.

    :param url: address of the page to download
    :param freqdist: frequency distribution updated in place with the page's
        lowercased token counts; counts already in it are kept
    :param n: how many of the most common entries to print
    :return: None -- the list of words is printed rather than returned
    """
    # Use the response as a context manager so the connection is closed once
    # the body has been read, instead of being left to garbage collection.
    with request.urlopen(url) as response:
        html = response.read().decode('utf-8')
    raw = BeautifulSoup(html, "lxml").get_text()
    for word in word_tokenize(raw):
        freqdist[word.lower()] += 1  # side effect: mutates the caller's distribution
    # Build the list in one pass rather than re-creating it for every word.
    result = [word for (word, _) in freqdist.most_common(n)]
    print(result)

constitution = "http://www.archives.gov/exhibits/charters/constitution_transcript.html"
fd = nltk.FreqDist()
freq_words(constitution, fd, 30)

["''", ',', 'the', ':', ':1', ';', '{', 'of', '}', ')', '(', '#', 'archives', "'", '.', 'national', 'and', '[', ']', '``', 'a', 'constitution', 'documents', 'declaration', 'to', 'charters', 'rights', 'freedom', '.section-theme', 'founding']


In [77]:
from urllib import request
from bs4 import BeautifulSoup

def freq_words(url, n):
    """Return the ``n`` most frequent lowercased tokens of the page at ``url``.

    :param url: address of the page to download
    :param n: how many of the most common tokens to return
    :return: the tokens, most frequent first
    :rtype: list
    """
    # Use the response as a context manager so the connection is closed once
    # the body has been read, instead of being left to garbage collection.
    with request.urlopen(url) as response:
        html = response.read().decode('utf8')
    text = BeautifulSoup(html, 'lxml').get_text()
    freqdist = nltk.FreqDist(word.lower() for word in word_tokenize(text))
    return [word for (word, _) in freqdist.most_common(n)]

freq_words(constitution, 30)

["''",
 ',',
 'the',
 ':',
 ':1',
 ';',
 '{',
 'of',
 '}',
 ')',
 '(',
 '#',
 'archives',
 "'",
 '.',
 'national',
 'and',
 '[',
 ']',
 '``',
 'a',
 'constitution',
 'documents',
 'declaration',
 'to',
 'charters',
 'rights',
 'freedom',
 '.section-theme',
 'founding']

### Documenting Functions

In [78]:
def accuracy(reference, test):
    """
    Calculate the fraction of test items that equal the corresponding reference items.

    Given a list of reference values and a corresponding list of test values,
    return the fraction of corresponding values that are equal.
    In particular, return the fraction of indexes
    {0<=i<len(test)} such that C{test[i] == reference[i]}.

        >>> accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])
        0.5

    :param reference: An ordered list of reference values
    :type reference: list
    :param test: A list of values to compare against the corresponding
        reference values
    :type test: list
    :return: the accuracy score
    :rtype: float
    :raises ValueError: If reference and test do not have the same length
    :raises ZeroDivisionError: If both lists are empty
    """

    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    # Count the positions at which the two lists agree.
    num_correct = sum(1 for x, y in zip(reference, test) if x == y)
    # True division already yields a float on Python 3.
    return num_correct / len(reference)


## 4.5 Doing More with Functions

### Functions as Arguments

In [79]:
sent = [
    'Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the',
    'sounds', 'will', 'take', 'care', 'of', 'themselves', '.',
]
def extract_property(prop):
    """Apply ``prop`` to every word of the module-level list ``sent``.

    :param prop: a function taking a single word
    :return: list of the ``prop(word)`` results, in the order of ``sent``
    """
    return list(map(prop, sent))

extract_property(len)

[4, 4, 2, 3, 5, 1, 3, 3, 6, 4, 4, 4, 2, 10, 1]

In [80]:
def last_letter(word):
    """Return the final element of ``word`` (its last character for a string)."""
    final = word[-1]
    return final
extract_property(last_letter)

['e', 'e', 'f', 'e', 'e', ',', 'd', 'e', 's', 'l', 'e', 'e', 'f', 's', '.']

#### Lambda Expressions

In [81]:
extract_property(lambda w: w[-1])

['e', 'e', 'f', 'e', 'e', ',', 'd', 'e', 's', 'l', 'e', 'e', 'f', 's', '.']

In [88]:
sorted(sent)

[',',
 '.',
 'Take',
 'and',
 'care',
 'care',
 'of',
 'of',
 'sense',
 'sounds',
 'take',
 'the',
 'the',
 'themselves',
 'will']

In [93]:
sorted(sent, key=lambda x: len(x), reverse=True)

['themselves',
 'sounds',
 'sense',
 'Take',
 'care',
 'will',
 'take',
 'care',
 'the',
 'and',
 'the',
 'of',
 'of',
 ',',
 '.']

### Accumulative Functions

In [94]:
def search1(substring, words):
    """Return a list of every item in ``words`` that contains ``substring``.

    The whole list is built before anything is returned.
    """
    return [candidate for candidate in words if substring in candidate]

def search2(substring, words):
    """Yield, one at a time, every item in ``words`` that contains ``substring``.

    This is a generator: matches are produced lazily as the caller iterates.
    """
    for match in filter(lambda candidate: substring in candidate, words):
        yield match

for item in search1('zz', nltk.corpus.brown.words()):
    print(item, end = " ")

Grizzlies' fizzled Rizzuto huzzahs dazzler jazz Pezza Pezza Pezza embezzling embezzlement pizza jazz Ozzie nozzle drizzly puzzle puzzle dazzling Sizzling guzzle puzzles dazzling jazz jazz Jazz jazz Jazz jazz jazz Jazz jazz jazz jazz Jazz jazz dizzy jazz Jazz puzzler jazz jazzmen jazz jazz Jazz Jazz Jazz jazz Jazz jazz jazz jazz Jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz Jazz Jazz jazz jazz nozzles nozzle puzzle buzz puzzle blizzard blizzard sizzling puzzled puzzle puzzle muzzle muzzle muezzin blizzard Neo-Jazz jazz muzzle piazzas puzzles puzzles embezzle buzzed snazzy buzzes puzzled puzzled muzzle whizzing jazz Belshazzar Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie's Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie blizzard blizzards blizzard blizzard fuzzy Lazzeri Piazza piazza palazzi Piazza Piazza Palazzo Palazzo Palazzo Piazza Piazza Palazzo palazzo palazzo Palazzo Palazzo Piazza piazza piazza piazza Piazza Piazza Palazzo palazzo Piazza piazz

In [95]:
for item in search2('zz', nltk.corpus.brown.words()):
    print(item, end = " ")

Grizzlies' fizzled Rizzuto huzzahs dazzler jazz Pezza Pezza Pezza embezzling embezzlement pizza jazz Ozzie nozzle drizzly puzzle puzzle dazzling Sizzling guzzle puzzles dazzling jazz jazz Jazz jazz Jazz jazz jazz Jazz jazz jazz jazz Jazz jazz dizzy jazz Jazz puzzler jazz jazzmen jazz jazz Jazz Jazz Jazz jazz Jazz jazz jazz jazz Jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz Jazz Jazz jazz jazz nozzles nozzle puzzle buzz puzzle blizzard blizzard sizzling puzzled puzzle puzzle muzzle muzzle muezzin blizzard Neo-Jazz jazz muzzle piazzas puzzles puzzles embezzle buzzed snazzy buzzes puzzled puzzled muzzle whizzing jazz Belshazzar Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie's Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie blizzard blizzards blizzard blizzard fuzzy Lazzeri Piazza piazza palazzi Piazza Piazza Palazzo Palazzo Palazzo Piazza Piazza Palazzo palazzo palazzo Palazzo Palazzo Piazza piazza piazza piazza Piazza Piazza Palazzo palazzo Piazza piazz

In [96]:
def permutations(seq):
    """Yield every ordering of the sliceable sequence ``seq``.

    The first element is inserted at each possible position of every
    permutation of the remaining elements; results have the same type as
    ``seq`` slices (lists for a list, strings for a string).
    """
    # Base case: zero or one element has exactly one ordering, itself.
    if len(seq) <= 1:
        yield seq
        return
    head, tail = seq[0:1], seq[1:]
    for perm in permutations(tail):
        for position in range(len(perm) + 1):
            yield perm[:position] + head + perm[position:]

list(permutations(['police', 'fish', 'buffalo']))

[['police', 'fish', 'buffalo'],
 ['fish', 'police', 'buffalo'],
 ['fish', 'buffalo', 'police'],
 ['police', 'buffalo', 'fish'],
 ['buffalo', 'police', 'fish'],
 ['buffalo', 'fish', 'police']]

### Higher-Order Functions

In [97]:
def is_content_word(word):
    """Return True unless the lowercased ``word`` is one of the listed function words or punctuation marks."""
    excluded = ('a', 'of', 'the', 'and', 'will', ',', '.')
    return not (word.lower() in excluded)
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the',
         'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']
list(filter(is_content_word, sent))

['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']

In [98]:
[w for w in sent if is_content_word(w)]

['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']

In [99]:
lengths = list(map(len, nltk.corpus.brown.sents(categories='news')))
sum(lengths) / len(lengths)

21.75081116158339

In [100]:
lengths = [len(sent) for sent in nltk.corpus.brown.sents(categories='news')]
sum(lengths) / len(lengths)

21.75081116158339

In [105]:
# Count the vowels in each word of sent, first with map/filter and then with
# a list comprehension. On Python 3 len() cannot be applied to a filter
# object or to a generator expression, so the matches are materialised in the
# first form and counted with sum() in the second.
list(map(lambda w: len(list(filter(lambda c: c.lower() in "aeiou", w))), sent))
[sum(1 for c in w if c.lower() in "aeiou") for w in sent]

In [106]:
def repeat(msg='<empty>', num=1):
    """Return ``msg`` multiplied by ``num`` (for a string: repeated with no separator).

    Both parameters have defaults, so either may be omitted or passed by keyword.
    """
    repeated = msg * num
    return repeated
repeat(num=3)

'<empty><empty><empty>'

In [107]:
repeat(msg='Alice')

'Alice'

In [108]:
repeat(num=5, msg='Alice')

'AliceAliceAliceAliceAlice'

In [111]:
def generic(*args, **kwargs):
    """Print the tuple of positional arguments, then the dict of keyword arguments."""
    for collected in (args, kwargs):
        print(collected)
generic(1, "African swallow", monty="python")

(1, 'African swallow')
{'monty': 'python'}


In [112]:
song = [['four', 'calling', 'birds'],
         ['three', 'French', 'hens'],
         ['two', 'turtle', 'doves']]
list(zip(song[0], song[1], song[2]))

[('four', 'three', 'two'),
 ('calling', 'French', 'turtle'),
 ('birds', 'hens', 'doves')]

In [113]:
list(zip(*song))

[('four', 'three', 'two'),
 ('calling', 'French', 'turtle'),
 ('birds', 'hens', 'doves')]