In [15]:
%%file word_count.py
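# Adapted from http://mrjob.readthedocs.org/en/latest/guides/quickstart.html#writing-your-first-job
# (the class name is kept from that example, but this job works on bigrams rather than single words)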

from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class MRWordFrequencyCount(MRJob):
    """Two-step MapReduce job over bigrams (pairs of adjacent words on a line).

    Step 1 counts how often each upper-cased bigram occurs in the input.
    Step 2 groups those counts by the bigram's first word and converts them
    to relative frequencies, i.e. an estimate of P(word2 | word1).

    Bigrams are formed within a single input line only; a pair of words that
    straddles a line break is never emitted.
    """

    # A word is a run of ASCII letters, optionally containing one internal
    # apostrophe (e.g. "don't"). Everything else acts as a separator.
    WORD_RE = re.compile(r"[a-zA-Z]+'[a-zA-Z]+|[a-zA-Z]+")

    def steps(self):
        """Return the MRSteps that make up the job, in execution order."""
        return [
            # Step 1: Count bigrams
            MRStep(mapper=self.mapper_counts,
                   reducer=self.reducer_counts),
            # Step 2: Calculate probabilities
            MRStep(reducer=self.reducer_prob)
        ]

    # Step 1a: Split out all bigrams
    def mapper_counts(self, _, line):
        """Emit ``((word1, word2), 1)`` for every pair of adjacent words.

        Args:
            _: input key; ignored.
            line: one line of input text.

        Yields:
            ``((word1, word2), 1)`` with both words upper-cased, so that
            counting is case-insensitive.
        """
        words = [word.upper() for word in self.WORD_RE.findall(line)]
        # zip() stops at the shorter sequence, so a line with fewer than two
        # words produces no output at all.
        for bigram in zip(words, words[1:]):
            yield bigram, 1

    # Step 1b: Reduce bigram counts
    def reducer_counts(self, words, count):
        """Total the occurrences of one bigram and re-key it by its first word.

        Args:
            words: the ``(word1, word2)`` bigram key.
            count: iterable of partial counts for that bigram.

        Yields:
            ``word1, (total_count, word2)`` so that the next step receives
            every bigram starting with ``word1`` in a single group.
        """
        word1, word2 = words
        yield word1, (sum(count), word2)

    # Step 2: Calculate probabilities for each bigram that starts with word1
    def reducer_prob(self, word1, bigrams_gen):
        """Convert the bigram counts for one first word into probabilities.

        Args:
            word1: the first word shared by every bigram in this group.
            bigrams_gen: iterable of ``(count, word2)`` pairs from step 1.

        Yields:
            ``word1, [(count, word2, probability), ...]`` where
            ``probability = count / sum of all counts for word1``. Entries
            are sorted by descending count (ties broken alphabetically), so
            the most likely followers of ``word1`` come first. To get the top
            followers of a specific word, look up its upper-cased form
            (e.g. ``"MY"``) in the output.
        """
        # The values arrive as a one-shot generator. It has to be materialised
        # because (a) two passes are needed (total, then ratios) and (b) a
        # generator object itself cannot be encoded by the standard JSON
        # encoder, which is why yielding it directly is not safe.
        bigrams = [(count, word2) for count, word2 in bigrams_gen]
        total = sum(count for count, _ in bigrams)

        # Most frequent follower first; word2 as tie-breaker keeps the output
        # deterministic regardless of the order values were shuffled in.
        bigrams.sort(key=lambda pair: (-pair[0], pair[1]))

        # float() keeps this a true division even under Python 2.
        yield word1, [(count, word2, float(count) / total)
                      for count, word2 in bigrams]

# Script entry point: start the job only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    MRWordFrequencyCount.run()


Overwriting word_count.py


In [16]:
# Launch word_count.py with the EMR runner (-r emr) on the S3 input prefix.
# --output-dir points the job's results at S3; --no-output presumably stops
# the results from being streamed back into this cell (confirm in mrjob docs).
!python word_count.py -r emr s3://jws-mapreduce/Input/ \
--output-dir=s3://jws-mapreduce/result \
--no-output

Using configs in /Users/johnstocktoniv/.mrjob.conf
Using s3://mrjob-22bfba3f9ba9c6ea/tmp/ as our temp dir on S3
Creating temp directory /var/folders/5s/hq_d6y217wb9tt3p5v4_25cw0000gn/T/word_count.johnstocktoniv.20220311.100709.361835
uploading working dir files to s3://mrjob-22bfba3f9ba9c6ea/tmp/word_count.johnstocktoniv.20220311.100709.361835/files/wd...
Copying other local files to s3://mrjob-22bfba3f9ba9c6ea/tmp/word_count.johnstocktoniv.20220311.100709.361835/files/
Created new cluster j-W9QAGRIZT73H
Added EMR tags to cluster j-W9QAGRIZT73H: __mrjob_label=word_count, __mrjob_owner=johnstocktoniv, __mrjob_version=0.7.4
Waiting for Step 1 of 2 (s-2GEAYZCZFCTPL) to complete...
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING)
  PENDING (cluster is STARTING: Configuring cluster softw