In [14]:
#!/usr/bin/env python3
# 
# We'll recreate a real-time streaming system, a la Apache Storm,
# using Python's generators. Why? Because it's more interesting this
# way.
#
# Quick terminology:
#  Stream: Data input.
#  Spout: Stream creator.
#  Bolt: Stream processor. Outputs a modified stream.

In [83]:
from collections import defaultdict, OrderedDict
import logging
import string

# Logging was being weird inside my iPython notebook, so I resorted to print()
# LOGGER = logging.getLogger('WordCount')
# LOGGER.setLevel(logging.INFO)
# LOGGER.addHandler(logging.StreamHandler())
# LOGGER.propagate = False

In [50]:
import nltk.data

class Tokenizer:
    """Split the text of a file into sentence tokens using NLTK's Punkt model.

    :ivar str filename: Name of file to read from.
    :ivar str delimiter: Stored exactly as passed in. Nothing in this class
        consults it; it is kept so existing ``Tokenizer(name, delimiter)``
        calls keep working.
    :ivar tokenizer: Object returned by ``nltk.data.load`` for
        ``PUNKT_RESOURCE``; its ``tokenize(text)`` method does the splitting.
    """

    # Resource name handed to nltk.data.find / nltk.data.load.
    # NOTE(review): newer NLTK releases distribute Punkt as 'punkt_tab' --
    # confirm this name against the installed NLTK version.
    PUNKT_RESOURCE = 'tokenizers/punkt/english.pickle'

    def __init__(self, filename, delimiter='.'):
        """Remember the input file and load the English Punkt tokenizer.

        :param str filename: Path of the text file that :meth:`tokenize` reads.
        :param str delimiter: Accepted for backward compatibility; unused.
        :raises Exception: Whatever ``nltk.data.load`` raises is logged and
            re-raised unchanged.
        """
        self.filename = filename
        self.delimiter = delimiter
        self.setup()
        try:
            self.tokenizer = nltk.data.load(self.PUNKT_RESOURCE)
        except Exception:
            # Log with traceback, then let the caller see the original error.
            logging.getLogger(__name__).exception(
                'Failed to load tokenizer data from %s', self.PUNKT_RESOURCE)
            raise

    @staticmethod
    def setup():
        """Download the Punkt data when NLTK cannot locate it.

        Asking NLTK for the resource itself is more reliable than checking
        for an ``nltk_data`` directory, which may exist without Punkt in it.
        """
        try:
            nltk.data.find(Tokenizer.PUNKT_RESOURCE)
        except LookupError:
            nltk.download('punkt')

    def tokenize(self):
        """Read ``self.filename`` and return its contents as a list of tokens.

        :returns: The result of :meth:`tokenize_text` on the file's full text.
        """
        # This is dangerous if the file is very large. If memory was a
        # bottleneck, we could read in bytes until we hit the delimiter, then
        # pass that to the Mapper's input queue.
        with open(self.filename, 'r') as data:
            text = data.read()
        return self.tokenize_text(text)

    def tokenize_text(self, text):
        """Tokenize an in-memory string with the loaded tokenizer.

        :param str text: Text to split.
        :returns: Whatever ``self.tokenizer.tokenize(text)`` returns.
        """
        return self.tokenizer.tokenize(text)

    def stream_tokens(self):
        """Yield the tokens of ``self.filename`` one at a time."""
        yield from self.tokenize()

In [51]:
# Fixture text for exercising sentence tokenization. It deliberately contains
# abbreviations ("Mr.", "Ms.") whose periods do not end a sentence.
# It is not referenced by any other cell visible in this notebook.
test_paragraph = """As Mr. Smith walked towards the edge of the cliff, 
he recalled what his father had said to him. "Boy,", his father had started, 
"I must tell you this next thing before I go." But Mr. Smith had never 
gotten to hear what his father had to say, as at that moment his father
had fallen over the side of the cliff. The very same cliff, Mr. Smith mused,
that he himself was walking towards this very moment. How Ms. Smith would
laugh, he thought to himself, if she were reading an account of his present
actions.
"""
# The same passage split by hand into sentences; list position is the
# sentence index reported in the concordance.
# NOTE: the last entry adds quotation marks that test_paragraph does not have.
test_sentences = [
    'As Mr. Smith walked towards the edge of the cliff, he recalled what his father had said to him.', 
    '"Boy,", his father had started, "I must tell you this next thing before I go."',
    'But Mr. Smith had never gotten to hear what his father had to say, as at that moment his father had fallen over the side of the cliff.',
    'The very same cliff, Mr. Smith mused, that he himself was walking towards this very moment.', 
    '"How Ms. Smith would laugh," he thought to himself, "if she were reading an account of his present actions."'
]

In [80]:
def split_sentences(stream):
    """Split sentences into lowercase word tokens stripped of punctuation.

    :param stream: Iterable of ``(sentence, index)`` pairs.
    :returns: Generator of ``(word, index)`` pairs, one per whitespace-separated
        token, with leading/trailing ``string.punctuation`` removed and the
        remainder lowercased. Tokens that consist only of punctuation are
        dropped rather than emitted as empty strings.
    """
    for sentence, idx in stream:
        for raw_token in sentence.split():
            word = raw_token.strip(string.punctuation).lower()
            # A token such as "--" or "..." strips down to ''; counting the
            # empty string as a word would pollute the concordance.
            if word:
                yield (word, idx)
            
def tally_words(stream, table):
    """Record each word's sentence index and report its running total.

    :param stream: Iterable of ``(word, index)`` pairs.
    :param table: Mapping from word to a list of indices; it is mutated in
        place and must supply a list for unseen words (e.g. a
        ``defaultdict(list)``).
    :returns: Generator of ``(word, occurrences so far, index list)`` tuples.
        The index list is the live list stored in ``table``, not a copy.
    """
    for pair in stream:
        word, idx = pair
        table[word].append(idx)
        seen_at = table[word]
        yield (word, len(seen_at), seen_at)
        
def count_words(stream):
    """Drain the tally stream and keep the latest entry seen for each word.

    :param stream: Iterable of ``(word, count, indices)`` tuples.
    :returns: Dict mapping each word to the ``(count, indices)`` pair from its
        last appearance in the stream.

    Design note: to stay with the stream-processing theme, this could itself
    have been a stream that maintains an ordered structure of words and emits
    the most recently updated (word, count, sentence indices) tuple. That
    would need a structure with fast lookup and ordered insertion, such as a
    binary tree; the `bisect` library provides a working example of the idea.
    """
    return {word: (count, indices) for word, count, indices in stream}


def pretty_streamer(word_hash):
    """Yield one ``word: count [i, j, ...]`` string per entry of ``word_hash``.

    :param word_hash: Mapping from word to a sequence whose first item is the
        count and whose second item is an iterable of sentence indices.
    :returns: Generator of formatted strings, in the mapping's iteration
        order, with no trailing newline.
    """
    for word, entry in word_hash.items():
        count = entry[0]
        index_text = ', '.join(map(str, entry[1]))
        yield '{}: {} [{}]'.format(word, count, index_text)
                     
def write_to_file(inputs, filename='concordance.txt'):
    """Write each string from ``inputs`` to ``filename`` on its own line.

    :param inputs: Iterable of strings.
    :param str filename: Destination path; defaults to ``'concordance.txt'``
        in the current working directory. The file is overwritten.
    """
    with open(filename, 'w') as fp:
        for line in inputs:
            # The formatter upstream yields strings without a line terminator;
            # add one so entries do not run together, but do not double up
            # when a caller already supplied it.
            fp.write(line if line.endswith('\n') else line + '\n')

In [85]:
def test():
    """Run the word-count pipeline over ``test_sentences``.

    Prints the alphabetised concordance and also hands it to
    ``write_to_file``.
    """
    # The module-level LOGGER setup is commented out, so the former
    # LOGGER.setLevel(...) call here raised NameError on a fresh kernel.

    # Mock text stream
    spout = ((s, i) for i, s in enumerate(test_sentences))
    # First bolt: Convert sentences into (word, sentence #)
    split_bolt = split_sentences(spout)
    # Second bolt: Tally words up
    tally_hash = defaultdict(list)
    tally_bolt = tally_words(split_bolt, tally_hash)
    # Terminal: Maintain a structure of words and their counts
    word_count = count_words(tally_bolt)
    ordered_count = OrderedDict(sorted(word_count.items(), key=lambda x: x[0]))

    # pretty_streamer is a generator, so build a fresh one for each consumer.
    print('\n'.join(pretty_streamer(ordered_count)))
    write_to_file(pretty_streamer(ordered_count))

test()

account: 1 [4]
actions: 1 [4]
an: 1 [4]
as: 2 [0, 2]
at: 1 [2]
before: 1 [1]
boy: 1 [1]
but: 1 [2]
cliff: 3 [0, 2, 3]
edge: 1 [0]
fallen: 1 [2]
father: 4 [0, 1, 2, 2]
go: 1 [1]
gotten: 1 [2]
had: 5 [0, 1, 2, 2, 2]
he: 3 [0, 3, 4]
hear: 1 [2]
him: 1 [0]
himself: 2 [3, 4]
his: 5 [0, 1, 2, 2, 4]
how: 1 [4]
i: 2 [1, 1]
if: 1 [4]
laugh: 1 [4]
moment: 2 [2, 3]
mr: 3 [0, 2, 3]
ms: 1 [4]
mused: 1 [3]
must: 1 [1]
never: 1 [2]
next: 1 [1]
of: 3 [0, 2, 4]
over: 1 [2]
present: 1 [4]
reading: 1 [4]
recalled: 1 [0]
said: 1 [0]
same: 1 [3]
say: 1 [2]
she: 1 [4]
side: 1 [2]
smith: 4 [0, 2, 3, 4]
started: 1 [1]
tell: 1 [1]
that: 2 [2, 3]
the: 5 [0, 0, 2, 2, 3]
thing: 1 [1]
this: 2 [1, 3]
thought: 1 [4]
to: 4 [0, 2, 2, 4]
towards: 2 [0, 3]
very: 2 [3, 3]
walked: 1 [0]
walking: 1 [3]
was: 1 [3]
were: 1 [4]
what: 2 [0, 2]
would: 1 [4]
you: 1 [1]
