Chapter 4. Processing large datasets with lazy workflows
====
### Mastering Large Datasets with Python by JT Wolohan 



### Poem Puzzle
Remember to run the poem generation script before you run this code!

In [None]:
import toolz
import re, itertools
from glob import iglob

In [None]:
def word_ratio(d):
    """Return the ratio of "a" counts to "the" counts in a frequency mapping.

    d is a mapping from words to counts. A missing "a" entry counts as 0; a
    missing "the" entry is replaced by 0.0001 so the division does not fail
    when no "the" was counted.
    """
    indefinite = float(d.get("a", 0))
    definite = float(d.get("the", 0.0001))
    return indefinite / definite

In [None]:
class PoemCleaner:
    """Read poem files and turn them into lists of lowercase words.

    The punctuation pattern is compiled once in ``__init__`` and reused for
    every poem cleaned by the same instance.
    """

    def __init__(self):
        # Characters stripped from each poem: . , ; : ! and -
        self.r = re.compile(r'[.,;:!-]')

    def clean_poem(self, fp):
        """Open the poem at filepath ``fp`` and return it as a clean word list.

        A clean poem is a punctuation-less sequence of lowercase words, in
        the order that the author of the poem placed them. The listed
        punctuation characters are deleted (not replaced by spaces), so a
        hyphenated word becomes a single word.
        """
        with open(fp) as poem:
            no_punc = self.r.sub("", poem.read())
        return no_punc.lower().split()

In [None]:
def word_is_desired(w):
    """Return True when w is exactly "a" or "the", otherwise False.

    Intended as the predicate for filter, reducing a sequence of words to
    just its indefinite and definite articles.
    """
    return w in ("a", "the")

In [None]:
def analyze_poems(poems, cleaner):
    """Return the a-to-the ratio across all of the given poems.

    poems is an iterable of file paths; each one is passed to
    cleaner.clean_poem, which must return the poem's words. The words of all
    poems are chained together, filtered down to "a" and "the", counted with
    toolz.frequencies, and the counts are handed to word_ratio.
    """
    # chain.from_iterable pulls poems from the map one at a time; unpacking
    # the map with * would read and clean every poem before counting starts.
    words = itertools.chain.from_iterable(map(cleaner.clean_poem, poems))
    articles = filter(word_is_desired, words)
    return word_ratio(toolz.frequencies(articles))

In [None]:
# One cleaner instance is shared by both analyses.
Cleaner = PoemCleaner()
# iglob returns iterators of matching paths; each can be consumed only once.
author_a_poems = iglob("author_a/*.txt")
author_b_poems = iglob("author_b/*.txt")

# Compute the a-to-the ratio over every poem found for each author.
author_a_ratio = analyze_poems(author_a_poems, Cleaner)
author_b_ratio = analyze_poems(author_b_poems, Cleaner)

# Show both authors' ratios (two decimals) under the hard-coded 0.3 figure
# printed for the original poem.
print("""
Original_Poem:  0.3
Author A:     {:.2f}
Author B:     {:.2f}
""".format(author_a_ratio, author_b_ratio))

### Fishing village simulation

In [None]:
import random, itertools
from operator import methodcaller

In [None]:
class Village:
    """A fishing village with a population and a probability of cheating."""

    def __init__(self):
        # Population is drawn first, then the cheat rate, both uniformly.
        self.population = random.uniform(1000, 5000)
        self.cheat_rate = random.uniform(.05, .15)

    def update(self, sim):
        """Advance the village by one year.

        If sim.cheaters is at least 2, the cheat rate rises by .05. The
        population always grows by 2.5% and is truncated to an int.
        """
        grown = int(self.population * 1.025)
        if sim.cheaters >= 2:
            self.cheat_rate += .05
        self.population = grown

    def go_fishing(self):
        """Return (fish_taken, cheat) for one year of fishing.

        With probability cheat_rate the village cheats: cheat is 1 and it
        takes twice its population in fish. Otherwise cheat is 0 and it takes
        exactly its population.
        """
        cheated = random.uniform(0, 1) < self.cheat_rate
        multiplier = 2 if cheated else 1
        return self.population * multiplier, int(cheated)

In [None]:
class LakeSimulation:
    """Simulate several villages fishing a shared lake, one year at a time.

    State set in ``__init__``: ``villages`` (four Village objects), ``fish``
    (current stock, starting at 80000), ``year`` (starting at 1) and
    ``cheaters`` (how many villages cheated in the most recent year).
    """

    def __init__(self):
        self.villages = [Village() for _ in range(4)]
        self.fish = 80000
        self.year = 1
        self.cheaters = 0

    def simulate(self):
        """Run until the lake is overfished or more than 1000 years pass.

        Each year every village goes fishing. If the stock covers the total
        catch, the remaining fish grow by 15%, every village is updated, and
        a status line is printed; otherwise a closing message is printed and
        the loop stops. Returns None.
        """
        for _ in itertools.count():
            yearly_results = map(methodcaller("go_fishing"), self.villages)
            # Split the (fish_taken, cheat) pairs into two parallel tuples.
            fishs, cheats = zip(*yearly_results)
            total_fished = sum(fishs)
            self.cheaters = sum(cheats)
            if self.year > 1000:
                print("Wow! Your villages lasted 1000 years!")
                break
            if self.fish < total_fished:
                print("The lake was overfished in {} years.".format(self.year))
                break
            self.fish = (self.fish - total_fished) * 1.15
            # Update eagerly with an explicit loop: a bare map() is lazy in
            # Python 3 and would never run, and update() needs this
            # simulation passed in so it can read self.cheaters.
            for village in self.villages:
                village.update(self)
            print("Year {:<5}   Fish: {}".format(self.year,
                                                 int(self.fish)))
            self.year += 1

In [None]:
# Seed the random module with a fixed string so repeated runs draw the same
# sequence of random numbers.
random.seed("Wolohan")
# Build the simulation and run it; progress is printed as it goes.
Lake = LakeSimulation()
Lake.simulate()

[Ready for more? Go to chapter 5!](./Ch05_notebook.ipynb)