In [7]:
%%writefile ipynb_map_reduce_cc1_top10.py
"""The classic MapReduce job: count the frequency of words.
"""
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

WORD_RE = re.compile(r"[\w']+")
ALPHABET_RE = re.compile(r"[a-zA-Z]")

class MRCC1TOP10(MRJob):
    
    def steps(self):
        return [
            MRStep(mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words),
            MRStep(reducer=self.reducer_find_max_word)
        ]
    
    def mapper_get_words(self, _, line):
        the_line = WORD_RE.findall(line)
        for key, value in enumerate(the_line):
            if (value == "my" and (key + 1 < len(the_line))):
                yield (value + "," + the_line[key+1].lower(), 1)

    def combiner_count_words(self, value, counts):
        yield (value, sum(counts))

    def reducer_count_words(self, value, counts):
        #yield (value, sum(counts))
        yield None, (sum(counts), value)
    
    def reducer_find_max_word(self, _, word_count_pairs):
        # each item of word_count_pairs is (count, word),
        # so yielding one results in key=counts, value=word
        yield max(word_count_pairs)
        

if __name__ == '__main__':
    MRCC1TOP10.run()


Overwriting ipynb_map_reduce_cc1_top10.py


In [8]:
!python3 ipynb_map_reduce_cc1_top10.py -r local ./data/shortjokes.csv

No configs found; falling back on auto-configuration
No configs specified for local runner
Creating temp directory /var/folders/3g/kpbx6md97b74_076lhbnbvt00000gn/T/ipynb_map_reduce_cc1_top10.jalal.20190413.174011.094463
Running step 1 of 2...
Running step 2 of 2...
job output is in /var/folders/3g/kpbx6md97b74_076lhbnbvt00000gn/T/ipynb_map_reduce_cc1_top10.jalal.20190413.174011.094463/output
Streaming final output from /var/folders/3g/kpbx6md97b74_076lhbnbvt00000gn/T/ipynb_map_reduce_cc1_top10.jalal.20190413.174011.094463/output...
1311	"my,wife"
Removing temp directory /var/folders/3g/kpbx6md97b74_076lhbnbvt00000gn/T/ipynb_map_reduce_cc1_top10.jalal.20190413.174011.094463...
