# Week 7: MapReduce

## Exercise 1: WordCount

In [86]:
%%file wordCount.py

from mrjob.job import MRJob
import re

# Matches one "word": a maximal run of word characters (letters, digits,
# underscore) and apostrophes, so contractions such as "I'm" stay in one piece.
WORD_RE = re.compile(r"[\w']+")

class MRWordCount(MRJob):
    """Count case-insensitive word occurrences in the input text.

    Words are whatever ``WORD_RE`` matches; each one is lower-cased before
    counting, so "Hello" and "hello" are tallied under the same key.
    """

    def mapper(self, key, line):
        """Emit ``(word, 1)`` for every word found in ``line``.

        ``key`` is not used.
        """
        for word in WORD_RE.findall(line):
            yield word.lower(), 1

    def combiner(self, key, values):
        """Pre-sum the counts of ``key`` produced by a single mapper.

        Addition is associative and commutative, so partial sums computed
        here leave the reducer's total unchanged while reducing the number
        of pairs that have to be passed on to the reducers.
        """
        yield key, sum(values)

    def reducer(self, key, values):
        """Emit ``(word, total)`` where ``total`` is the sum of all counts for the word."""
        yield key, sum(values)
        
if __name__ == '__main__':
    # Start the job only when this file is executed as a script, not on import.
    MRWordCount.run()

Overwriting wordCount.py


In [28]:
%%file wordCountPeergrade.py
from mrjob.job import MRJob

class MRWordFrequencyCount(MRJob):
    """Count how often each whitespace-separated token occurs in the input.

    Tokens are used verbatim: counting is case-sensitive, and punctuation
    attached to a word stays part of the token (e.g. "data!" is its own key).
    """

    def mapper(self, _, line):
        """Emit ``(token, 1)`` for every whitespace-separated token in ``line``."""
        for word in line.split():
            yield word, 1

    def combiner(self, key, values):
        """Pre-sum the counts of ``key`` produced by a single mapper."""
        # optimization: sum the words we've seen so far, so fewer pairs have
        # to be handed to the reducers; the final totals are unaffected.
        yield key, sum(values)

    def reducer(self, key, values):
        """Emit ``(token, total)`` where ``total`` is the sum of all counts for the token."""
        yield key, sum(values)

if __name__ == '__main__':
    # Start the job only when this file is executed as a script, not on import.
    MRWordFrequencyCount.run()

Overwriting wordCountPeergrade.py


ex1test.txt: 
```
Hello I'm a testfile
Hello I'm a testfile
TestFile
hey
computational tools
for for
big data!
mapreduce mapreduce2 mapreduce2
```

In [29]:
!python3 wordCountPeergrade.py ex1test.txt -q

"Hello"	2
"I'm"	2
"TestFile"	1
"a"	2
"big"	1
"computational"	1
"data!"	1
"for"	2
"hey"	1
"mapreduce"	1
"mapreduce2"	2
"testfile"	2
"tools"	1


## Exercise 2: Euler Graphs

In [14]:
%%file eulerTest.py

from mrjob.job import MRJob
from mrjob.protocol import RawValueProtocol
from mrjob.step import MRStep


class MREulerTest(MRJob):
    """Report whether a graph given as an edge list has an Euler tour.

    Each input line is expected to describe one edge as two
    whitespace-separated vertex labels. Step 1 computes the degree of every
    vertex; step 2 collects all degrees in a single reducer and reports
    whether any of them is odd.

    Only the "every vertex has even degree" condition is checked. The graph
    is assumed to be connected; this job does not verify that.
    """

    # Write the verdict as a bare line of text instead of an encoded
    # key/value pair.
    OUTPUT_PROTOCOL = RawValueProtocol

    def steps(self):
        """Chain the degree-counting step and the final decision step."""
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer_vertex_degree),
            MRStep(reducer=self.reducer_result)
        ]

    def mapper(self, key, line):
        """Emit ``(vertex, 1)`` for both endpoints of the edge on ``line``.

        ``key`` is not used. Lines with fewer than two tokens (for example
        blank lines) are skipped instead of raising ``IndexError``.
        """
        endpoints = line.split()
        if len(endpoints) < 2:
            return
        yield endpoints[0], 1   # vertex 1
        yield endpoints[1], 1   # vertex 2

    def reducer_vertex_degree(self, key, values):
        """Emit ``(None, (vertex, degree))`` for the vertex ``key``.

        The shared ``None`` key routes every vertex's degree to the same
        reducer call in the next step.
        """
        yield None, (key, sum(values))

    def reducer_result(self, _, degrees):
        """Emit the verdict for the whole graph.

        ``degrees`` is an iterable of ``(vertex, degree)`` pairs. The verdict
        is yielded rather than printed so that it is emitted as the job's
        output through ``OUTPUT_PROTOCOL``.
        """
        # any() stops at the first odd degree, like an early exit from a loop.
        has_odd_vertex = any(degree % 2 == 1 for _vertex, degree in degrees)
        if has_odd_vertex:
            yield None, "Graph does not contain an euler tour"
        else:
            yield None, "Graph contains an euler tour"
        
        
if __name__ == '__main__':
    # Start the job only when this file is executed as a script, not on import.
    MREulerTest.run()

Overwriting eulerTest.py


In [89]:
!python3 eulerTest.py graph1.txt -q
!python3 eulerTest.py graph2.txt -q
!python3 eulerTest.py graph3.txt -q
!python3 eulerTest.py graph4.txt -q
!python3 eulerTest.py graph5.txt -q

Graph contains an euler tour
Graph does not contain an euler tour
Graph contains an euler tour
Graph contains an euler tour
Graph does not contain an euler tour


## Exercise 3: Make your own

### Students:
```
{ "studentID" : "s112233", "name" : "Flemming", "gender" : "M", "GPA" : 4.7}
{ "studentID" : "s445522", "name" : "Emma", "gender" : "F", "GPA" : 11}
{ "studentID" : "s783943", "name" : "Ann", "gender" : "F", "GPA" : 7.8}
{ "studentID" : "s645323", "name" : "Carl", "gender" : "M", "GPA" : 6}
```

In [90]:
%%file student.py
from mrjob.job import MRJob
from mrjob.protocol import JSONValueProtocol
import numpy as np

class MRStudent(MRJob):
    """Compute the mean GPA per gender from student records."""

    # The mapper receives each input record already decoded, and indexes it
    # by the "gender" and "GPA" keys.
    INPUT_PROTOCOL = JSONValueProtocol

    def mapper(self, _, student):
        """Emit ``(gender, GPA)`` for one student record."""
        gender_key = student["gender"]
        grade = student["GPA"]
        yield gender_key, grade

    def reducer(self, gender, GPA):
        """Emit ``(gender, mean GPA)`` over all grades received for ``gender``."""
        # Materialise the incoming values so numpy can average them.
        grades = list(GPA)
        yield gender, np.mean(grades)
        
if __name__ == '__main__':
    # Start the job only when this file is executed as a script, not on import.
    MRStudent.run()

Overwriting student.py


In [91]:
!python3 student.py student*.json -q

"F"	9.4
"M"	5.35
