In [3]:
%%writefile ipynb_map_reduce_cc1_top10.py
"""The classic MapReduce job: count the frequency of words.
"""
import heapq
import re

from mrjob.job import MRJob
from mrjob.step import MRStep

# Matches a run of word characters and/or apostrophes, i.e. one "word" token.
WORD_RE = re.compile(r"[\w']+")
# Matches a single ASCII letter.
# NOTE(review): not referenced anywhere in the visible part of this file —
# confirm whether it is still needed.
ALPHABET_RE = re.compile(r"[a-zA-Z]")

class MRCC1TOP10(MRJob):
    """Two-step job reporting the most frequent words that follow "my".

    Step 1 counts every ``"my,<next word>"`` pair found in the input lines.
    Step 2 gathers all pair totals under a single key and keeps the
    ``TOP_N`` largest.
    """

    # Word whose followers are counted.
    TARGET_WORD = "my"
    # Number of most frequent pairs reported by the final step.
    TOP_N = 10

    def steps(self):
        """Return the job's two steps: count the pairs, then rank them."""
        return [
            MRStep(mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words),
            MRStep(reducer=self.reducer_find_top10_word)
        ]

    def mapper_get_words(self, _, line):
        """Yield ``("my,<follower>", 1)`` for each word directly after "my".

        The input key is ignored; ``line`` is one line of input text. The
        follower is lower-cased before it is put in the key. A target word
        that is the last token of the line has no follower and yields nothing.
        """
        words = WORD_RE.findall(line)
        # Walk adjacent (word, next word) pairs; zip stops before the last
        # token, so no explicit bounds check is needed.
        for word, follower in zip(words, words[1:]):
            # NOTE(review): the comparison is case-sensitive, so a capitalised
            # "My" is skipped even though the follower is lower-cased —
            # confirm this is intended.
            if word == self.TARGET_WORD:
                yield (word + "," + follower.lower(), 1)

    def combiner_count_words(self, value, counts):
        """Pre-aggregate the counts seen so far for one pair key."""
        yield (value, sum(counts))

    def reducer_count_words(self, value, counts):
        """Emit ``(total, pair)`` under the single key ``None``.

        Using one shared key sends every pair total to the same reducer call
        in the next step, which is what allows a global ranking there.
        """
        yield None, (sum(counts), value)

    def reducer_find_top10_word(self, _, word_count_pairs):
        """Yield one record holding the ``TOP_N`` most frequent pairs.

        Each item of ``word_count_pairs`` is a (count, pair) sequence, so
        ordering the items ranks them by count first and by pair text on
        ties. The result is in descending order.
        """
        # heapq.nlargest(n, it) returns the same list as
        # sorted(it, reverse=True)[:n] but never holds more than n items,
        # instead of materialising and sorting every pair.
        top_pairs = heapq.nlargest(self.TOP_N, word_count_pairs)
        yield ("top %d" % self.TOP_N, top_pairs)

# Launch the job only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    MRCC1TOP10.run()


Overwriting ipynb_map_reduce_cc1_top10.py


In [1]:
!python3 ipynb_map_reduce_cc1_top10.py -r local ./data/shortjokes.csv --output-dir=ipynb_map_reduce_cc1_top10_out --no-output

No configs found; falling back on auto-configuration
No configs specified for local runner
Creating temp directory /var/folders/3g/kpbx6md97b74_076lhbnbvt00000gn/T/ipynb_map_reduce_cc1_top10.jalal.20190413.182934.838004
Running step 1 of 2...
Running step 2 of 2...
job output is in ipynb_map_reduce_cc1_top10_out
Removing temp directory /var/folders/3g/kpbx6md97b74_076lhbnbvt00000gn/T/ipynb_map_reduce_cc1_top10.jalal.20190413.182934.838004...


In [5]:
%%writefile ~/.mrjob.conf

# http://mrjob.readthedocs.io/en/stable/guides/emr-opts.html

runners:
  emr:
    # AWS credentials are deliberately NOT stored in this file. When
    # aws_access_key_id / aws_secret_access_key are unset, mrjob falls back
    # to boto3's standard credential lookup: the AWS_ACCESS_KEY_ID and
    # AWS_SECRET_ACCESS_KEY environment variables, ~/.aws/credentials, etc.
    # Any access key that has ever appeared in a notebook or config file
    # should be treated as compromised and rotated in the IAM console.
    ec2_key_pair: cc1top10
    # Home-relative path so the config is not tied to one user's machine.
    ec2_key_pair_file: ~/Projects/BBK/CC/cc1top10.pem
    region: eu-west-2 # http://docs.aws.amazon.com/general/latest/gr/rande.html
    master_instance_type: m5.xlarge # https://aws.amazon.com/emr/pricing/
    instance_type: m5.xlarge
    num_core_instances: 1
    ssh_tunnel: true


Overwriting /Users/jalal/.mrjob.conf


In [6]:
!python3 ipynb_map_reduce_cc1_top10.py -r emr s3://cc1top10/shortjokes.csv --output-dir=s3://cc1top10/output --no-output

Using configs in /Users/jalal/.mrjob.conf
Using s3://mrjob-04f631f3f6930b5a/tmp/ as our temp dir on S3
Creating temp directory /var/folders/3g/kpbx6md97b74_076lhbnbvt00000gn/T/ipynb_map_reduce_cc1_top10.jalal.20190413.201959.521062
writing master bootstrap script to /var/folders/3g/kpbx6md97b74_076lhbnbvt00000gn/T/ipynb_map_reduce_cc1_top10.jalal.20190413.201959.521062/b.sh
Copying local files to s3://mrjob-04f631f3f6930b5a/tmp/ipynb_map_reduce_cc1_top10.jalal.20190413.201959.521062/files/...
Created new cluster j-3T1I7LLNQGSWB
Added EMR tags to cluster j-3T1I7LLNQGSWB: __mrjob_label=ipynb_map_reduce_cc1_top10, __mrjob_owner=jalal, __mrjob_version=0.6.7
Waiting for Step 1 of 2 (s-49EFU26BGJU4) to complete...
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STAR