# Counting words

Because what's a parallel computing demo without counting words?

In [None]:
from __future__ import print_function

Some utilities for excluding common phrases and normalizing words

In [None]:
import re
# Matches any run of non-word characters and/or digits.
non_word = re.compile(r'[\W\d]+', re.UNICODE)


def normalize_word(word):
    """Return ``word`` lower-cased, with non-word characters and digits removed.

    The result is an empty string when ``word`` consists only of such
    characters. Underscores are kept, since the pattern does not match them.
    """
    return non_word.sub('', word.lower())

# Words treated as "common"; ngrams() checks membership in this set to
# decide whether an n-gram should be left out of the counts.
common_words = set("""
    the of and in to a is it that which as on by
    be this with are from will at you not for no have
    i or if his its they but their one all he when
    than so these them may see other was has an there
    more we footnote who had been she do what
    her him my me would could said am were very
    your did
""".split())

def yield_words(filename):
    """Generate the normalized words of a text file, in reading order.

    Every line is split on whitespace and each token is passed through
    ``normalize_word``; tokens that normalize to an empty string are
    skipped. No encoding is passed to ``io.open``, so its default applies,
    and undecodable bytes are replaced instead of raising
    (``errors='replace'``).
    """
    import io
    with io.open(filename, errors='replace') as handle:
        for line in handle:
            normalized = (normalize_word(token) for token in line.split())
            for word in normalized:
                if not word:
                    # nothing left after normalization
                    continue
                yield word


A function that reads a file and returns a dictionary
whose keys are phrases of `n` words (as strings)
and whose values are the number of times each phrase occurs in the file.

In [None]:
def ngrams(filename, n=1):
    """Compute n-gram counts for the contents of a file.

    Words are read with ``yield_words``. Every run of ``n`` consecutive
    words is joined with single spaces to form a key. A run is left out
    when more than half of its words are in ``common_words``.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    n : int, optional
        Number of words per n-gram (default 1).

    Returns
    -------
    dict
        Maps each counted n-gram string to its number of occurrences.
        Empty when ``n < 1`` (the file is not read in that case) or when
        the file holds fewer than ``n`` words.
    """
    # Local import keeps this function independent of notebook-level imports.
    from collections import deque

    counts = {}
    if n < 1:
        # There are no n-grams shorter than one word. Without this guard,
        # n == 0 would count the empty string once per word.
        return counts

    # A bounded deque discards its oldest word automatically on append,
    # which gives a sliding window of the last n words.
    window = deque(maxlen=n)
    for word in yield_words(filename):
        window.append(word)
        if len(window) < n:
            # still collecting the first n words
            continue
        common = sum(w in common_words for w in window)
        if common > n / 2.0:
            # skip n-grams in which more than half of the words are common
            continue
        gram = ' '.join(window)
        counts[gram] = counts.get(gram, 0) + 1
    return counts
            

In [None]:
%%writefile cathat.txt
the cat in the hat is a cat whose hat is big.

In [None]:
ngrams('cathat.txt', 1)

In [None]:
ngrams('cathat.txt', 2)

Now fetch some interesting data from Project Gutenberg:

In [None]:
# os is needed for the existence check below; importing it in this cell
# keeps the cell runnable on a fresh kernel.
import os

try:
    from urllib.request import urlretrieve  # py3
except ImportError:
    from urllib import urlretrieve  # py2

davinci_url = "http://www.gutenberg.org/files/5000/5000-8.txt"

# Only download when there is no local copy yet, so re-running is cheap.
if not os.path.exists('davinci.txt'):
    # download from project gutenberg
    print("Downloading Da Vinci's notebooks from Project Gutenberg")
    urlretrieve(davinci_url, 'davinci.txt')

In [None]:
import sys

def print_common(freqs, n=10):
    """Print the ``n`` entries of ``freqs`` with the highest counts.

    ``freqs`` maps words (strings) to counts. Entries are ordered by
    descending count, ties broken by descending word. Each line shows the
    word, right-justified to the longest word printed, followed by its count.
    """
    ranked = sorted(
        ((count, word) for word, count in freqs.items()),
        reverse=True,
    )
    top = ranked[:n]
    # the trailing [0] keeps max() valid when there is nothing to print
    width = max([len(word) for _count, word in top] + [0])
    for count, word in top:
        print(word.rjust(width), count)


In [None]:
# Run the serial version
print("Serial word frequency count:")
# Count single words (n=1) in the whole file in this process; %time reports the duration.
%time counts = ngrams('davinci.txt', 1)
# Show the 10 entries with the highest counts.
print_common(counts, 10)


Let's split the file into several smaller files that can be processed independently.

In [None]:
# split davinci.txt into n files of (nearly) equal line counts:
with open('davinci.txt', encoding='latin1', errors='replace') as f:
    text = f.read()
lines = text.splitlines()
nlines = len(lines)
n = 10

# Round the chunk size up so that n chunks always cover every line;
# floor division would silently drop up to n - 1 trailing lines.
block = (nlines + n - 1) // n
for i in range(n):
    # Slicing past the end is safe: trailing chunks are just shorter or empty.
    chunk = lines[i*block:(i+1)*block]
    with open('davinci%i.txt' % i, 'w') as f:
        f.write('\n'.join(chunk))

In [None]:
import os
# Absolute paths of the n split files (davinci0.txt ... davinci<n-1>.txt),
# anchored at the current working directory.
cwd = os.path.abspath(os.getcwd())
fnames = []
for i in range(n):
    fnames.append(os.path.join(cwd, 'davinci%i.txt' % i))

In [None]:
import ipyparallel as ipp
# Client() is called without arguments, so connection settings come from
# ipyparallel's defaults. Assumes a cluster is already running; TODO confirm
# for your setup.
rc = ipp.Client()

In [None]:
# Load-balanced view; it is passed to ngrams_parallel in later cells.
view = rc.load_balanced_view()
# rc[:] selects the client's engines as one view (presumably all of them;
# see the ipyparallel docs).
eall = rc[:]
# Send the helpers defined earlier in this notebook to the engines under the
# same names. ngrams itself is not in this dict.
eall.push(dict(
    non_word=non_word,
    yield_words=yield_words,
    common_words=common_words,
    normalize_word=normalize_word,
))

### Exercise: parallel ngrams

Write a version of ngrams that runs in parallel,
rejoining the results into a single count dict.

In [None]:
def ngrams_parallel(view, fnames, n=1):
    """Compute ngrams in parallel

    Exercise stub: the body is intentionally empty (``pass``), so calling
    it as written returns ``None``. A finished version is expected to
    return a single count dict combining the results for all files.

    view - An IPython View
    fnames - The filenames containing the split data.
    n - Words per ngram, as in ngrams() (default 1).
    """
    pass

In [None]:
%load ../soln/ngrams.py

In [None]:
print("Parallel ngrams")
# Count 3-word phrases across the split files listed in fnames; %time reports the duration.
# NOTE(review): the serial cell counts 1-grams on the unsplit file, so the
# two timings are not a like-for-like comparison.
%time pcounts = ngrams_parallel(view, fnames, 3)
print_common(pcounts, 10)

### A bit more data

Download some Project Gutenberg samples from nltk (to avoid rate-limiting on Project Gutenberg itself)

In [None]:
import glob
import zipfile

gutenberg_samples = 'http://nltk.github.com/nltk_data/packages/corpora/gutenberg.zip'
# Download and unpack only when the extracted directory is missing.
if not os.path.isdir('gutenberg'):
    if not os.path.exists('gutenberg.zip'):
        urlretrieve(gutenberg_samples, 'gutenberg.zip')
    # Extract with the standard library rather than shelling out to `unzip`,
    # so the cell does not depend on an external program being installed.
    with zipfile.ZipFile('gutenberg.zip') as archive:
        archive.extractall()

# remove the bible, because it's too big relative to the rest
# (filtering instead of list.remove() means a missing file is not an error)
bible = os.path.abspath(os.path.join('gutenberg', 'bible-kjv.txt'))
gutenberg_files = [
    path
    for path in glob.glob(os.path.abspath(os.path.join('gutenberg', '*.txt')))
    if path != bible
]

In [None]:
ls gutenberg

In [None]:
print("Parallel ngrams across several books")
# 3-word phrases over all selected books; only this call is timed.
%time pcounts = ngrams_parallel(view, gutenberg_files, 3)
print()
print_common(pcounts, 10)
# 4-word phrases; this rebinds pcounts, so the 3-gram result is discarded.
pcounts = ngrams_parallel(view, gutenberg_files, 4)
print()
print_common(pcounts, 10)
