In [70]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
%%writefile homework52.py
## Author: Michael Kennedy
## Description: Homework 5.2, MrJob join (inner/left/right) of page visits against page metadata
from mrjob.job import MRJob
from mrjob.job import MRStep


class urlJoiner(MRJob):
    """Join page-visit records against the page metadata table.

    Input lines are comma-separated visit records with five fields
    (visit, pageID, number, customer, visitorID).  Page metadata is read from
    the side file ``anonymous-msweb.data`` (expected in the task's working
    directory), keeping only its ``A`` records.

    ``--join`` selects the join type, with the visit log as the left table:

    * ``inner`` - only visits whose page ID is in the metadata table.
    * ``left``  - every visit; unknown pages get the URL ``"Unknown URL"``.
    * ``right`` - matched visits, plus one row per metadata page that no
      visit referenced, with the visitor ``"Unknown visitor ID"``.
    """

    # Side file holding the page attribute ("A") records.
    METADATA_FILE = "anonymous-msweb.data"

    def configure_options(self):
        """Register ``--join`` so its value is passed through to every task."""
        super(urlJoiner, self).configure_options()
        # Default to an inner join: without a default, omitting --join left
        # self.join as None and the job silently produced no output.
        self.add_passthrough_option(
            '--join', default='inner',
            help="Join type: inner, left or right (default: inner)")

    def load(self, filename):
        """Read the page metadata file into a dict.

        Only comma-separated lines with exactly five fields whose first field
        is ``'A'`` are kept.

        Returns:
            dict mapping page ID -> (pagename, url), all as raw strings.
        """
        table = {}
        with open(filename, 'r') as rf:
            for line in rf:
                splits = line.strip().split(',')
                if len(splits) != 5:
                    continue
                log_type, key, _, pagename, url = splits
                if log_type == 'A':
                    table[key] = (pagename, url)
        return table

    def mapper_init(self):
        """Load the metadata table once per mapper task."""
        self.metatable = self.load(self.METADATA_FILE)
        self.join = self.options.join

    def mapper(self, _, line):
        """Emit ``pageID, (url, visitorID)`` for each visit kept by the join."""
        fields = line.strip().split(',')
        if len(fields) != 5:
            # Blank or malformed record: count it and move on instead of
            # failing the whole task on the tuple unpacking.
            self.increment_counter('urlJoiner', 'malformed_visit_lines', 1)
            return
        pageID, visitorID = fields[1], fields[4]

        if pageID in self.metatable:
            if self.join in ("inner", "left", "right"):
                yield pageID, (self.metatable[pageID][1], visitorID)
        elif self.join == "left":
            # Left join keeps visits to pages missing from the metadata.
            yield pageID, ("Unknown URL", visitorID)

    def reducer_init(self):
        """For a right join, prepare a visited flag for every metadata page."""
        self.join = self.options.join
        if self.join == "right":
            self.metatable = self.load(self.METADATA_FILE)
            self.visited = dict.fromkeys(self.metatable, False)

    def reducer(self, pageID, data):
        """Pass joined rows through, remembering which pages were visited."""
        if self.join == "right":
            self.visited[pageID] = True
        for x in data:
            yield pageID, x

    def reducer_final(self):
        """For a right join, emit the metadata pages that had no visits.

        NOTE(review): the visited flags are per reducer task, so this looks
        correct only when the job runs with a single reducer - confirm before
        running with more.
        """
        if self.join == "right":
            for key, seen in self.visited.items():
                if not seen:
                    yield key, (self.metatable[key][1], "Unknown visitor ID")
        

# Run the job when this file is executed as a script (not when imported).
if __name__ == '__main__':
    urlJoiner.run()
    

Overwriting homework52.py


In [25]:
from homework52 import urlJoiner

def runJoin(joinType):
    """Run ``urlJoiner`` on ``clean-msweb.data`` with the given join type.

    ``anonymous-msweb.data`` is shipped to the tasks with ``--file``.

    Returns:
        list of parsed output records, one per output line of the job.
    """
    job_args = ['clean-msweb.data', '--file', 'anonymous-msweb.data', '--join', joinType]
    mr_job = urlJoiner(args=job_args)

    with mr_job.make_runner() as runner:
        runner.run()
        # Collect the parsed output in memory while the runner is still open.
        return [mr_job.parse_output_line(raw) for raw in runner.stream_output()]
            
# Run the join once per type, in the order inner, left, right.
outInner, outLeft, outRight = map(runJoin, ('inner', 'left', 'right'))



In [26]:
# Report how many rows each join type produced.
rows_by_join = [('inner', outInner), ('left', outLeft), ('right', outRight)]

print("Rows resulting from join type:\n")
for joinType, out in rows_by_join:
    print("{:7s}{:>4,d}".format(joinType, len(out)))

Rows resulting from join type:

inner  98,654
left   98,654
right  98,663


In [7]:
%%writefile homework53.py
## Author: Michael Kennedy
## Description: Homework 5.3, simple MrJob longest 5gram
from mrjob.job import MRJob
from mrjob.job import MRStep


class longFinder(MRJob):
    """Find the longest n-gram (by character length) in tab-separated input.

    The n-gram is the first tab-separated field of each line.  Every mapper
    task keeps only its own longest n-gram and emits it once from
    ``mapper_final``; the reducer keeps the longest of the candidates it
    receives and emits it from ``reducer_final``.

    NOTE(review): the running maximum is per reducer task, so a single global
    answer presumably needs the job to run with one reducer - confirm.
    """

    def mapper_init(self):
        """Start each mapper task with an empty (length, ngram) maximum."""
        self.longest = (0, "")

    def mapper(self, _, line):
        """Update the task-local maximum; nothing is emitted per line."""
        ngram = line.strip().split('\t')[0]
        if len(ngram) > self.longest[0]:
            self.longest = (len(ngram), ngram)

    def mapper_final(self):
        """Emit this task's longest n-gram as ``length, ngram``."""
        yield self.longest[0], self.longest[1]

    def reducer_init(self):
        """Start each reducer task with an empty (length, ngram) maximum."""
        self.longest = (0, "")

    def reducer(self, local_max, payload):
        """Keep the longest candidate seen so far.

        ``payload`` is an iterator over the n-grams emitted with length
        ``local_max``; it cannot be indexed (``payload[0]`` raises TypeError),
        so it is iterated instead.  The strict ``>`` keeps the first n-gram
        seen for a given length.
        """
        for ngram in payload:
            if local_max > self.longest[0]:
                self.longest = (local_max, ngram)

    def reducer_final(self):
        """Emit the overall longest n-gram seen by this reducer task."""
        yield self.longest[0], self.longest[1]
        

# Run the job when this file is executed as a script (not when imported).
if __name__ == '__main__':
    longFinder.run()
    

Overwriting homework53.py


In [4]:
from homework53 import longFinder

def runEDA():
    """Run ``longFinder`` on the Hadoop runner over the HDFS n-gram directory.

    Returns:
        list of parsed output records, one per output line of the job.
    """
    hadoop_args = ['-r', 'hadoop', 'hdfs:///user/hadoop/ngrams/']
    mr_job = longFinder(args=hadoop_args)

    with mr_job.make_runner() as runner:
        runner.run()
        # Collect the parsed output in memory while the runner is still open.
        return [mr_job.parse_output_line(raw) for raw in runner.stream_output()]
            
runEDA()

ERROR:mrjob.fs.hadoop:STDERR: 16/02/17 02:54:50 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable



[]

In [54]:
%%writefile homework53wc.py
## Author: Michael Kennedy
## Description: Homework 5.3, MrJob word count over the 5-grams, followed by a sort on the counts
from mrjob.job import MRJob
from mrjob.job import MRStep


class wordCount(MRJob):
    """Count word occurrences in n-gram data, then sort by count.

    Input lines are tab-separated with the n-gram in the first field and its
    occurrence count in the second.  Step 1 sums the count of every word;
    step 2 swaps key and value so the count becomes the key, and emits the
    words of the first few count groups it receives.
    """

    def mapper(self, _, line):
        """Emit ``word, count`` for every space-separated word of the n-gram."""
        fields = line.strip().split('\t')
        if len(fields) < 2:
            # Blank or malformed line: count it and skip rather than failing
            # the task with an IndexError.
            self.increment_counter('wordCount', 'malformed_lines', 1)
            return
        count = int(fields[1])  # parsed once per line, not once per word
        for word in fields[0].split(" "):
            yield word, count

    def combiner(self, word, wc):
        """Pre-aggregate counts on the map side."""
        yield word, sum(wc)

    def reducer(self, word, wc):
        """Emit the total count of each word."""
        yield word, sum(wc)

    def sort_mapper(self, key, value):
        """Swap to ``count, word`` so the shuffle orders records by count."""
        yield value, key

    def sort_reducer_init(self):
        """Reset the number of count groups emitted by this reducer task."""
        self.count = 0

    def sort_reducer(self, key, values):
        """Emit ``word, count`` for the first count groups only.

        NOTE(review): the ``> 10`` test lets 11 groups through (counter values
        0 through 10), and it limits distinct counts rather than words -
        confirm whether a top-10 was intended.
        """
        if self.count > 10:
            return
        self.count += 1
        for value in values:
            yield value, key

    def steps(self):
        """Return the aggregation step followed by the sort step."""
        # Hadoop settings for the sort step: a key comparator with the
        # options '-k1,1nr' and a single reducer.
        sort_conf = {
            'mapreduce.job.output.key.comparator.class': 'org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator',
            'mapreduce.partition.keycomparator.options': '-k1,1nr',
            'mapreduce.job.maps': '2',
            'mapreduce.job.reduces': '1',
            'stream.num.map.output.key.fields': '2',
            'mapreduce.map.output.key.field.separator': ' ',
            'stream.map.output.field.separator': ' ',
        }

        return [
            # Data aggregation
            MRStep(mapper=self.mapper,
                   combiner=self.combiner,
                   reducer=self.reducer),
            # Secondary sort
            MRStep(mapper=self.sort_mapper,
                   reducer_init=self.sort_reducer_init,
                   reducer=self.sort_reducer,
                   jobconf=sort_conf)]

# Run the job when this file is executed as a script (not when imported).
if __name__ == '__main__':
    wordCount.run()
    

Overwriting homework53wc.py


In [None]:
from homework53wc import wordCount

def runEDA():
    """Run ``wordCount`` on the Hadoop cluster over the HDFS n-gram directory.

    The job output is also written to ``hdfs:///user/hadoop/wc/``.

    Returns:
        list of parsed output records, one per output line of the job.
    """
    hadoop_args = ['-r', 'hadoop', 'hdfs:///user/hadoop/ngrams/', '--output-dir', 'hdfs:///user/hadoop/wc/']
    mr_job = wordCount(args=hadoop_args)

    with mr_job.make_runner() as runner:
        runner.run()
        # Collect the parsed output in memory while the runner is still open.
        return [mr_job.parse_output_line(raw) for raw in runner.stream_output()]
            
sortedWords = runEDA()



In [180]:
!python homework53wc.py --no-output filtered-5Grams/googlebooks-eng-all-5gram-20090715-0-filtered.txt

no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
creating tmp directory /tmp/homework53wc.hadoop.20160219.043919.739398

PLEASE NOTE: Starting in mrjob v0.5.0, protocols will be strict by default. It's recommended you run your job with --strict-protocols or set up mrjob.conf as described at https://pythonhosted.org/mrjob/whats-new.html#ready-for-strict-protocols

writing to /tmp/homework53wc.hadoop.20160219.043919.739398/step-0-mapper_part-00000
Counters from step 1:
  (no counters found)
writing to /tmp/homework53wc.hadoop.20160219.043919.739398/step-0-mapper-sorted
> sort /tmp/homework53wc.hadoop.20160219.043919.739398/step-0-mapper_part-00000
writing to /tmp/homework53wc.hadoop.20160219.043919.739398/step-0-reducer_part-00000
Counters from step 1:
  (no counters found)
Detected hadoop configuration property names that do not match hadoop version 0.20:
The have been translated as follows
 mapreduce.job.output.key.comparator.cla

In [63]:
%%writefile homework53density.py
## Author: Michael Kennedy
## Description: Homework 5.3, MrJob per-word density (total count divided by total page count)
from mrjob.job import MRJob
from mrjob.job import MRStep


class densityFinder(MRJob):
    """Compute, for every word, its total count divided by its total pages.

    Input lines are tab-separated: n-gram, occurrence count, page count.
    Every word of an n-gram inherits that n-gram's count and page count.

    NOTE(review): ``sort_mapper`` / ``sort_reducer`` are defined but no
    ``steps()`` wires them in, so only mapper/combiner/reducer appear to run -
    confirm whether a sort step was intended.
    """

    def mapper(self, _, line):
        """Emit ``word, (count, pages)`` for every word of the n-gram."""
        fields = line.strip().split('\t')
        if len(fields) < 3:
            # Blank or malformed line: count it and skip rather than failing
            # the task on the unpacking.
            self.increment_counter('densityFinder', 'malformed_lines', 1)
            return
        # Convert to int here: the raw fields are strings and cannot be
        # summed downstream.
        count, pages = int(fields[1]), int(fields[2])
        for word in fields[0].split(" "):
            yield word, (count, pages)

    def _sum_pairs(self, data):
        """Sum an iterable of (count, pages) pairs component-wise."""
        count = 0
        pages = 0
        for each_count, each_pages in data:
            count += each_count
            pages += each_pages  # second component (the original added the count here)
        return count, pages

    def combiner(self, word, data):
        """Pre-aggregate (count, pages) on the map side."""
        yield word, self._sum_pairs(data)

    def reducer(self, word, data):
        """Emit ``word, count / pages`` as a float."""
        count, pages = self._sum_pairs(data)
        if not pages:
            # No pages recorded: a density is undefined, so skip the word.
            self.increment_counter('densityFinder', 'zero_page_words', 1)
            return
        yield word, count / float(pages)

    def sort_mapper(self, key, value):
        """Swap to ``density, word`` so records can be ordered by density."""
        yield float(value), key

    def sort_reducer_init(self):
        """Reset the per-task group counter (currently unused by the reducer)."""
        self.count = 0

    def sort_reducer(self, key, values):
        """Swap back to ``word, density`` for every word of the group."""
        # Iterate the group: the previous body yielded an undefined name.
        for value in values:
            yield value, key
    
# Run the job when this file is executed as a script (not when imported).
if __name__ == '__main__':
    densityFinder.run()
    

Overwriting homework53density.py


In [64]:
from homework53density import densityFinder

def runEDA():
    """Run ``densityFinder`` on the Hadoop runner over the HDFS n-gram directory.

    Returns:
        list of parsed output records, one per output line of the job.
    """
    hadoop_args = ['-r', 'hadoop', 'hdfs:///user/hadoop/ngrams/']
    mr_job = densityFinder(args=hadoop_args)

    with mr_job.make_runner() as runner:
        runner.run()
        # Collect the parsed output in memory while the runner is still open.
        return [mr_job.parse_output_line(raw) for raw in runner.stream_output()]
            
density = runEDA()



KeyboardInterrupt: 

In [None]:
%%writefile homework53dist.py
## Author: Michael Kennedy
## Description: Homework 5.3, MrJob distribution of 5-gram count values, followed by a sort
from mrjob.job import MRJob
from mrjob.job import MRStep


class wc_dist(MRJob):
    """Tally how often each n-gram count value occurs, then sort the tallies.

    Step 1 emits the line's count value once per word of the n-gram and sums
    the ones; step 2 swaps key and value and emits the first groups it sees.
    """

    def mapper(self, _, line):
        """Emit ``count, 1`` once for every word of the n-gram."""
        columns = line.strip().split('\t')
        occurrences = columns[1]
        tokens = columns[0].split(" ")
        for _token in tokens:
            yield int(occurrences), 1

    def combiner(self, word, wc):
        """Pre-aggregate the ones on the map side."""
        total = sum(wc)
        yield word, total

    def reducer(self, word, wc):
        """Emit the total tally for each count value."""
        total = sum(wc)
        yield word, total

    def sort_mapper(self, key, value):
        """Swap key and value so the tally becomes the key."""
        swapped = (value, key)
        yield swapped

    def sort_reducer_init(self):
        """Reset the number of groups emitted by this reducer task."""
        self.count = 0

    def sort_reducer(self, key, values):
        """Swap back and emit, but only while the group counter is at most 10."""
        if self.count <= 10:
            self.count += 1
            for item in values:
                yield item, key

    def steps(self):
        """Return the aggregation step followed by the sort step."""
        # Hadoop settings applied to the sort step only.
        sort_conf = dict([
            ('mapreduce.job.output.key.comparator.class', 'org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator'),
            ('mapreduce.partition.keycomparator.options', '-k1,1nr'),
            ('mapreduce.job.maps', '2'),
            ('mapreduce.job.reduces', '1'),
            ('stream.num.map.output.key.fields', '2'),
            ('mapreduce.map.output.key.field.separator', ' '),
            ('stream.map.output.field.separator', ' '),
        ])

        aggregate_step = MRStep(mapper=self.mapper,
                                combiner=self.combiner,
                                reducer=self.reducer)
        sort_step = MRStep(mapper=self.sort_mapper,
                           reducer_init=self.sort_reducer_init,
                           reducer=self.sort_reducer,
                           jobconf=sort_conf)
        return [aggregate_step, sort_step]

# Run the job defined in this file when it is executed as a script.
# (This previously called wordCount.run(), a name this file never defines.)
if __name__ == '__main__':
    wc_dist.run()
    

In [None]:
from homework53dist import wc_dist

def runEDA():
    """Run ``wc_dist`` on the Hadoop runner over the HDFS n-gram directory.

    Returns:
        list of parsed output records, one per output line of the job.
    """
    hadoop_args = ['-r', 'hadoop', 'hdfs:///user/hadoop/ngrams/']
    mr_job = wc_dist(args=hadoop_args)

    with mr_job.make_runner() as runner:
        runner.run()
        # Collect the parsed output in memory while the runner is still open.
        return [mr_job.parse_output_line(raw) for raw in runner.stream_output()]
            
runEDA()

In [172]:
%%writefile homework54.py
## Author: Michael Kennedy
## Description: Homework 5.4, stripe&similarity metric
from mrjob.job import MRJob
from mrjob.job import MRStep
import math

class similarity(MRJob):
    """Build co-occurrence stripes for a small vocabulary and score word pairs.

    Step 1 builds, for every vocabulary word, a stripe (dict) counting the
    other vocabulary words found in the same n-gram.  Step 2 pairs up the
    entries of each stripe and reduces every word pair to a single score
    computed with the metric chosen by ``--metric`` (``euclidean`` or
    ``cosine``; anything else falls back to euclidean).
    """

    def configure_options(self):
        """Register ``--metric`` so its value is passed through to every task."""
        super(similarity, self).configure_options()
        self.add_passthrough_option('--metric')

    def pull_vocab(self):
        """Return the fixed vocabulary as a tuple of words."""
        return "Saturday", "morning", "evening", "afternoon"

    def map_index_init(self):
        """Load the vocabulary once per mapper task."""
        self.vocab = self.pull_vocab()

    def map_index(self, _, line):
        """Emit ``word, {other_word: 1}`` for each ordered vocabulary pair.

        Only the n-gram (first tab-separated field) is used.
        NOTE(review): every co-occurrence weighs 1 regardless of the n-gram's
        count column - confirm that this is intended.
        """
        words = line.strip().split('\t')[0].split(" ")
        for firstword in words:
            if firstword not in self.vocab:
                continue
            for secondword in words:
                if secondword != firstword and secondword in self.vocab:
                    yield firstword, {secondword: 1}

    def _merge_stripes(self, stripes):
        """Sum an iterable of stripes (dicts of counts) into one dict."""
        merged = {}
        for stripe in stripes:
            for k, v in stripe.items():
                merged[k] = merged.get(k, 0) + v
        return merged

    def combine_index(self, key, data):
        """Pre-merge the stripes of a word on the map side."""
        yield key, self._merge_stripes(data)

    def reduce_index(self, key, data):
        """Emit the fully merged stripe of a word."""
        yield key, self._merge_stripes(data)

    def map_combinations(self, word, stripe):
        """Emit ``(k1, k2), (v1, v2)`` for each pair of stripe entries with k1 < k2."""
        for k1, v1 in stripe.items():
            for k2, v2 in stripe.items():
                if k1 < k2:
                    yield (k1, k2), (v1, v2)

    def calculate_euclidean(self, vector_list):
        """Return the Euclidean distance over an iterable of (x, y) components."""
        return math.sqrt(sum([(x - y) ** 2 for x, y in vector_list]))

    def calculate_cosine(self, vector_list):
        """Return the cosine similarity over an iterable of (x, y) components.

        Only the components passed in take part in the magnitudes.  Returns
        0.0 when either vector has zero magnitude.
        """
        # Materialise first: the reducer passes a one-shot iterator and the
        # components are traversed three times below.
        pairs = list(vector_list)
        dot_product = sum(x * y for x, y in pairs)
        magnitude_a = math.sqrt(sum(x ** 2 for x, _ in pairs))
        magnitude_b = math.sqrt(sum(y ** 2 for _, y in pairs))
        denominator = magnitude_a * magnitude_b
        if not denominator:
            return 0.0
        # Divide by the product of the magnitudes (not dot / a * b).
        return dot_product / denominator

    def reducer_init(self):
        """Bind ``calculate_similarity`` to the metric chosen with ``--metric``."""
        if self.options.metric == 'cosine':
            self.calculate_similarity = self.calculate_cosine
        else:
            # 'euclidean', a missing option or any other value: euclidean.
            self.calculate_similarity = self.calculate_euclidean

    def similarity_reducer(self, pair, coincidence_vector):
        """Emit ``"k1,k2", score`` for one word pair."""
        score = self.calculate_similarity(coincidence_vector)
        yield ",".join(pair), score

    def steps(self):
        """Return the stripe-building step followed by the scoring step."""
        return [MRStep(mapper_init=self.map_index_init,
                       mapper=self.map_index,
                       combiner=self.combine_index,
                       reducer=self.reduce_index),
                MRStep(mapper=self.map_combinations,
                       reducer_init=self.reducer_init,
                       reducer=self.similarity_reducer)]
        
    
# Run the job when this file is executed as a script (not when imported).
if __name__ == '__main__':
    similarity.run()
    

Overwriting homework54.py


In [174]:
!hadoop/bin/hdfs dfs -rm -f -r hdfs:///user/hadoop/stripes
#!python homework54.py -r hadoop hdfs:///user/hadoop/ngrams/ --output-dir hdfs:///user/hadoop/stripes
!python homework54.py -r hadoop hdfs:///user/hadoop/ngrams/ --no-output
#!python homework54.py filtered-5Grams/googlebooks-eng-all-5gram-20090715-0-filtered.txt

16/02/18 16:57:14 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
creating tmp directory /tmp/homework54.hadoop.20160218.225715.986346
writing wrapper script to /tmp/homework54.hadoop.20160218.225715.986346/setup-wrapper.sh
Using Hadoop version 2.7.2
Copying local files into hdfs:///user/hadoop/tmp/mrjob/homework54.hadoop.20160218.225715.986346/files/

PLEASE NOTE: Starting in mrjob v0.5.0, protocols will be strict by default. It's recommended you run your job with --strict-protocols or set up mrjob.conf as described at https://pythonhosted.org/mrjob/whats-new.html#ready-for-strict-protocols

HADOOP: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
HADOOP: packageJobJar: [/tmp/hadoop-unjar4894620266088176756/] [] /tmp/streamjob480649071154284010.jar

In [145]:
!python homework54.py -r hadoop hdfs:///user/hadoop/ngrams/ --output-dir hdfs:///user/hadoop/stripes

Traceback (most recent call last):
  File "homework54.py", line 80, in <module>
    stripe_former.run()
  File "/usr/lib/python2.7/site-packages/mrjob/job.py", line 460, in run
    mr_job = cls(args=_READ_ARGS_FROM_SYS_ARGV)
  File "/usr/lib/python2.7/site-packages/mrjob/job.py", line 99, in __init__
    super(MRJob, self).__init__(self.mr_job_script(), args)
  File "/usr/lib/python2.7/site-packages/mrjob/launch.py", line 97, in __init__
    self.configure_options()
  File "homework54.py", line 43, in configure_options
    super(similarity, self).configure_options()
NameError: global name 'similarity' is not defined


In [132]:
from homework54 import stripe_former

def runStripes():
    """Run ``stripe_former`` on the Hadoop runner, writing stripes to HDFS.

    Reads ``hdfs:///user/hadoop/ngrams/`` and writes the job output to
    ``hdfs:///user/hadoop/stripes``.  Nothing is collected or returned.

    NOTE(review): ``stripe_former`` must be importable from ``homework54``;
    the version of that file written earlier in this notebook defines a class
    named ``similarity`` - confirm which one this cell should run.
    """
    hadoop_args = ['-r', 'hadoop', 'hdfs:///user/hadoop/ngrams/', '--output-dir', 'hdfs:///user/hadoop/stripes']
    mr_job = stripe_former(args=hadoop_args)

    with mr_job.make_runner() as runner:
        runner.run()
            
runStripes()

ERROR:mrjob.hadoop:Job failed with return code 1280: ['/home/hadoop/hadoop/bin/hadoop', 'jar', '/home/hadoop/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.2.jar', '-files', 'hdfs:///user/hadoop/tmp/mrjob/homework54.hadoop.20160217.233057.493100/files/homework54.py#homework54.py,hdfs:///user/hadoop/tmp/mrjob/homework54.hadoop.20160217.233057.493100/files/setup-wrapper.sh#setup-wrapper.sh', '-archives', 'hdfs:///user/hadoop/tmp/mrjob/homework54.hadoop.20160217.233057.493100/files/mrjob.tar.gz#mrjob.tar.gz', '-input', 'hdfs:///user/hadoop/ngrams/', '-input', 'hdfs:///user/hadoop/stripes', '-output', 'hdfs:///user/hadoop/utput-dir', '-mapper', 'sh -ex setup-wrapper.sh python homework54.py --step-num=0 --mapper', '-combiner', 'sh -ex setup-wrapper.sh python homework54.py --step-num=0 --combiner', '-reducer', 'sh -ex setup-wrapper.sh python homework54.py --step-num=0 --reducer']


CalledProcessError: Command '['/home/hadoop/hadoop/bin/hadoop', 'jar', '/home/hadoop/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.2.jar', '-files', 'hdfs:///user/hadoop/tmp/mrjob/homework54.hadoop.20160217.233057.493100/files/homework54.py#homework54.py,hdfs:///user/hadoop/tmp/mrjob/homework54.hadoop.20160217.233057.493100/files/setup-wrapper.sh#setup-wrapper.sh', '-archives', 'hdfs:///user/hadoop/tmp/mrjob/homework54.hadoop.20160217.233057.493100/files/mrjob.tar.gz#mrjob.tar.gz', '-input', 'hdfs:///user/hadoop/ngrams/', '-input', 'hdfs:///user/hadoop/stripes', '-output', 'hdfs:///user/hadoop/utput-dir', '-mapper', 'sh -ex setup-wrapper.sh python homework54.py --step-num=0 --mapper', '-combiner', 'sh -ex setup-wrapper.sh python homework54.py --step-num=0 --combiner', '-reducer', 'sh -ex setup-wrapper.sh python homework54.py --step-num=0 --reducer']' returned non-zero exit status 1280

In [123]:
%%writefile homework54similarity.py
## Author: Michael Kennedy
## Description: Homework 5.4, stripe&similarity metric
from mrjob.job import MRJob
from mrjob.job import MRStep
import math


class similarity(MRJob):
    """Score each word's stripe against the stripes of its co-occurring words.

    Input lines are ``word<TAB>stripe``, where the word is a quoted string
    literal and the stripe is a dict literal of co-occurrence counts.  The
    metric is chosen with ``--metric`` (``euclidean`` or ``cosine``; anything
    else falls back to euclidean).  Output is ``word, (other_word, score)``.
    """

    def configure_options(self):
        """Register ``--metric`` so its value is passed through to every task."""
        super(similarity, self).configure_options()
        self.add_passthrough_option('--metric')

    def mapper(self, _, line):
        """Send a word's stripe to itself and to every word in the stripe.

        The reducer for a word therefore receives its own stripe (tagged with
        its own name) plus the stripes of the words it co-occurs with.
        """
        # Local import so the block stays self-contained.  literal_eval only
        # parses literals, unlike eval, which would execute arbitrary
        # expressions found in the input.
        import ast

        fields = line.strip().split('\t')
        word = ast.literal_eval(fields[0])
        stripe = ast.literal_eval(fields[1])
        # Drop a self-reference if present; plain `del` raised KeyError
        # whenever the word was not in its own stripe.
        stripe.pop(word, None)
        yield word, (word, stripe)
        for other_word in stripe:
            # Route this stripe to the other word's reducer (it was being
            # sent back to `word` itself under the other word's name).
            yield other_word, (word, stripe)

    def calculate_euclidean(self, a, b):
        """Return the Euclidean distance between two stripes.

        Keys missing from one stripe count as 0, so the distance runs over
        the union of the keys (the intersection made ``.get(k, 0)`` moot).
        """
        keys = set(a) | set(b)
        return math.sqrt(sum((a.get(k, 0) - b.get(k, 0)) ** 2 for k in keys))

    def calculate_cosine(self, a, b):
        """Return the cosine similarity between two stripes.

        Returns 0.0 when either stripe has zero magnitude.
        """
        magnitude_a = math.sqrt(sum(x ** 2 for x in a.values()))
        magnitude_b = math.sqrt(sum(x ** 2 for x in b.values()))
        dot_product = sum(value * b[item] for item, value in a.items() if item in b)
        denominator = magnitude_a * magnitude_b
        if not denominator:
            return 0.0
        # Divide by the product of the magnitudes (not dot / a * b).
        return dot_product / denominator

    def reducer_init(self):
        """Bind ``calculate_similarity`` to the metric chosen with ``--metric``."""
        if self.options.metric == 'cosine':
            self.calculate_similarity = self.calculate_cosine
        else:
            # 'euclidean', a missing option or any other value: euclidean.
            self.calculate_similarity = self.calculate_euclidean

    def reducer(self, word, stripes):
        """Emit ``word, (other_word, score)`` for every stripe sent to ``word``."""
        reference_stripe = None
        compare_stripes = []
        for stripe in stripes:
            if stripe[0] == word:  # Separate the reference stripe
                reference_stripe = stripe
            else:
                compare_stripes.append(stripe)

        if reference_stripe is None:
            # The word only appears inside other stripes and has no stripe of
            # its own, so there is nothing to compare against.
            return

        reference_data = reference_stripe[1]
        for second_stripe_name, second_stripe_data in compare_stripes:
            # Compare against the reference stripe, not the stripe with itself.
            score = self.calculate_similarity(reference_data, second_stripe_data)
            yield word, (second_stripe_name, score)
                
#     def sort_mapper(self, key, value):
#         yield value, key
    
#     def sort_reducer_init(self):
#         self.count = 0
    
#     def sort_reducer(self, key, values):
#         if self.count > 10:
#             pass
#         else:
#             self.count += 1
#             for value in values:
#                 yield value, key
                

# Run the job when this file is executed as a script (not when imported).
if __name__ == '__main__':
    similarity.run()
    

Overwriting homework54similarity.py


In [124]:
from homework54similarity import similarity

def runStripes():
    """Run ``similarity`` with the euclidean metric on the Hadoop runner.

    Reads ``hdfs:///user/hadoop/utput-dir/part-00000``.  Nothing is collected
    or returned.
    """
    hadoop_args = ['-r', 'hadoop', 'hdfs:///user/hadoop/utput-dir/part-00000', '--metric', 'euclidean']
    mr_job = similarity(args=hadoop_args)

    with mr_job.make_runner() as runner:
        runner.run()
            
runStripes()



f
