#DATASCI W261: Machine Learning at Scale

#This is a word count example that shows how to use counters in mrjob

# Write some words to a file

In [5]:
# Create a two-line sample input: the first echo writes WordCount.txt (>),
# the second appends another line to it (>>).
!echo foo foo quux labs foo bar quux > WordCount.txt
!echo fodd fodo quuxa labs foo bar quux >> WordCount.txt

# MrJob class for Counter

In [6]:
%%writefile mr_wc_counter.py
from mrjob.job import MRJob
from mrjob.step import MRJobStep
import re
 
# Matches runs of word characters and apostrophes, so "don't" stays one token.
WORD_RE = re.compile(r"[\w']+")
 
class MRWordFreqCount(MRJob):
    """Word-frequency job that also counts how often each phase is called.

    The mapper does not emit a pair per word.  Instead it tallies words in
    a dictionary created by ``init_get_words`` and emits the tallies from
    ``final_get_words``.  The combiner and the reducer then sum the partial
    counts for each word.  Every phase increments its own counter in the
    ``'group'`` counter group.
    """

    def init_get_words(self):
        """Create the empty word -> count dictionary used by the mapper."""
        self.words = {}

    def get_words(self, _, line):
        """Tally the words found on ``line``; nothing is emitted here.

        The input key is ignored.  Words are extracted with ``WORD_RE``
        and lower-cased before being counted.
        """
        self.increment_counter('group', 'Num_mapper_calls', 1)
        for word in WORD_RE.findall(line):
            word = word.lower()
            self.words[word] = self.words.get(word, 0) + 1

    def final_get_words(self):
        """Emit one ``(word, count)`` pair per distinct word tallied."""
        self.increment_counter('group', 'Num_mapper_final_calls', 1)
        # items() instead of the Python-2-only iteritems(), so this method
        # works on both Python 2 and Python 3.
        for word, val in self.words.items():
            yield word, val

    def sum_words_combiner(self, word, counts):
        """Sum the partial counts for ``word`` and emit ``(word, total)``."""
        self.increment_counter('group', 'Num_combiner_calls', 1)
        yield word, sum(counts)

    def sum_words(self, word, counts):
        """Sum the counts for ``word`` and emit ``(word, total)``."""
        self.increment_counter('group', 'Num_reducer_calls', 1)
        yield word, sum(counts)

    def steps(self):
        """Return the single step wiring the methods above together."""
        # NOTE(review): newer mrjob releases appear to build steps with
        # mrjob.step.MRStep rather than self.mr() -- confirm against the
        # installed mrjob version before changing this call.
        return [self.mr(mapper_init=self.init_get_words,
                        mapper=self.get_words,
                        mapper_final=self.final_get_words,
                        combiner=self.sum_words_combiner,
                        reducer=self.sum_words)]

if __name__ == '__main__':
    # Run the job only when this file is executed as a script, not when
    # the class is imported by a separate driver.
    MRWordFreqCount.run()

Overwriting mr_wc_counter.py


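The code above is straightforward. The mapper accumulates word counts in an in-memory dictionary and emits (word, count) key-value pairs once, in mapper_final; the combiner then sums the counts locally. Finally, the reducer adds them up to produce the total for each word.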

# Run the code through python driver

####  Reminder: You cannot use the programmatic runner functionality in the same file as your job class. That is because the file with the job class is sent to Hadoop to be run. Therefore, the job file cannot attempt to start the Hadoop job, or you would be recursively creating Hadoop jobs!

Use make_runner() to run an MRJob
1. Separate the driver from the MapReduce job.
2. Now we can run it from within a Python notebook.
3. In Python, typically one class is in each file. Each mrjob job is a separate class and should be in a separate file.

In [7]:
from mr_wc_counter import MRWordFreqCount

# Build the job on the sample input and execute it through a runner.
mr_job = MRWordFreqCount(args=['WordCount.txt'])
with mr_job.make_runner() as runner:
    runner.run()
    # Show the counters collected during the run; the job's word-count
    # output is not read here.  print(...) with a single argument prints
    # the same text on Python 2 and Python 3.
    print(runner.counters())

[{'group': {'Num_combiner_calls': 7, 'Num_mapper_calls': 2, 'Num_reducer_calls': 7, 'Num_mapper_final_calls': 1}}]
