## Creating Log Files

In [5]:
%%writefile customers.dat
1|Alice Bob|31|CA
2|Sam Sneed|51|NV
3|Jon Sneed|37|CA
4|Arnold Wesise|17|NY
5|Henry Bob|25|NV
6|Yo Yo Ma|37|NY
7|Jon York|41|WA
8|Alex Ball|26|WA
9|Jim Davis|19|CA

Overwriting customers.dat


In [6]:
%%writefile orders.dat
1|Apple
3|Garlic
2|Milk
1|Iphone
4|Ipad
5|Book
7|Potato
8|Tomato
9|Orange
5|shoes

Overwriting orders.dat


In [14]:
%%writefile test.py
from mrjob.job import MRJob
from mrjob.step import MRJobStep
from mrjob.compat import get_jobconf_value
from collections import defaultdict
import csv

def csv_readline(line):
    """Given a sting CSV line, return a list of strings."""
    for row in csv.reader([line]):
        return row

class leftjoin(MRJob):
    
    def steps(self):
        return [MRJobStep(mapper_init = self.mapper_init,
                         mapper = self.mapper, mapper_final = self.mapper_final)]
    
    def mapper_init():
        self.lefttable = {}        
        with open('customers.dat','r') as f:
            for line in f: 
                cell = line.split("|")
                if cell[1] not in urls.keys():
                    self.lefttable[cell[1]] = [cell[4],[]]

    def mapper(self, _, line):
        cell = line.split("|")
        key = cell[0]
        self.lefttable[key][1].append()

    def mapper_final(self):
        yield urls

if __name__ == '__main__':
    leftjoin.run()

Writing hashinnerjoin.py


In [15]:
from test import leftjoin
mr_job = leftjoin(args=['customers.dat','orders.dat'])
with mr_job.make_runner() as runner: 
    runner.run()
    count = 0
    # stream_output: get access of the output 
    for line in runner.stream_output():
        key,value =  mr_job.parse_output_line(line)
        print value
        count = count + 1
print "\n"
print "There are %s records" %count

IndentationError: expected an indented block (hashinnerjoin.py, line 17)

## ReducerSideInnerJoin

In [3]:
%%writefile reducersideinnerjoin.py
from mrjob.job import MRJob
from mrjob.step import MRJobStep
from mrjob.compat import get_jobconf_value
 
class innerjoin(MRJob):
    def mapper(self, _, line):
        x = line.split("|")
        if len(x) == 4:
            yield x[0], ("lefttable", x[1], x[2], x[3])
        else:
            yield x[0], ("righttable", x[1])

    def reducer(self, key, values):
        customers = list()
        orders = list()
        for val in values:
            if val[0] == u'lefttable':
                customers.append(val)
            else:
                orders.append(val)
        for o in orders:
            for c in customers:
                yield None, [key] + c[1:] + o[1:]

if __name__ == '__main__':
    innerjoin.run()

Overwriting reducersideinnerjoin.py


In [4]:
from reducersideinnerjoin import innerjoin
mr_job = innerjoin(args=['customers.dat','orders.dat'])
with mr_job.make_runner() as runner: 
    runner.run()
    count = 0
    # stream_output: get access of the output 
    for line in runner.stream_output():
        key,value =  mr_job.parse_output_line(line)
        print value
        count = count + 1
print "\n"
print "There are %s records" %count



['1', 'Alice Bob', '31', 'CA', 'Apple']
['1', 'Alice Bob', '31', 'CA', 'Iphone']
['2', 'Sam Sneed', '51', 'NV', 'Milk']
['3', 'Jon Sneed', '37', 'CA', 'Garlic']
['4', 'Arnold Wesise', '17', 'NY', 'Ipad']
['5', 'Henry Bob', '25', 'NV', 'Book']
['5', 'Henry Bob', '25', 'NV', 'shoes']
['7', 'Jon York', '41', 'WA', 'Potato']
['8', 'Alex Ball', '26', 'WA', 'Tomato']
['9', 'Jim Davis', '19', 'CA', 'Orange']


There are 10 records


## ReducerSideLeftJoin

In [7]:
%%writefile reducersideleftjoin.py
from mrjob.job import MRJob
from mrjob.step import MRJobStep
from mrjob.compat import get_jobconf_value
 
class leftjoin(MRJob):
    def mapper(self, _, line):
        x = line.split("|")
        if len(x) == 4:
            yield x[0], ("lefttable", x[1], x[2], x[3])
        else:
            yield x[0], ("righttable", x[1])

    def reducer(self, key, values):
        customers = list()
        orders = list()
        for val in values:
            if val[0]== u'lefttable':
                customers.append(val)
            else:
                orders.append(val)
        for c in customers:
            if len(orders)==0:
                yield None, [key] + c[1:] + [None] 
            for o in orders:
                yield None, [key] + c[1:] + o[1:]

if __name__ == '__main__':
    leftjoin.run()

Writing reducersideleftjoin.py


In [8]:
from reducersideleftjoin import leftjoin
mr_job = leftjoin(args=['customers.dat','orders.dat'])
with mr_job.make_runner() as runner: 
    runner.run()
    count = 0
    # stream_output: get access of the output 
    for line in runner.stream_output():
        key,value =  mr_job.parse_output_line(line)
        print value
        count = count + 1
print "\n"
print "There are %s records" %count



['1', 'Alice Bob', '31', 'CA', 'Apple']
['1', 'Alice Bob', '31', 'CA', 'Iphone']
['2', 'Sam Sneed', '51', 'NV', 'Milk']
['3', 'Jon Sneed', '37', 'CA', 'Garlic']
['4', 'Arnold Wesise', '17', 'NY', 'Ipad']
['5', 'Henry Bob', '25', 'NV', 'Book']
['5', 'Henry Bob', '25', 'NV', 'shoes']
['6', 'Yo Yo Ma', '37', 'NY', None]
['7', 'Jon York', '41', 'WA', 'Potato']
['8', 'Alex Ball', '26', 'WA', 'Tomato']
['9', 'Jim Davis', '19', 'CA', 'Orange']


There are 11 records


## ReducerSideRightJoin

In [9]:
%%writefile reducersiderightjoin.py
from mrjob.job import MRJob
from mrjob.step import MRJobStep
from mrjob.compat import get_jobconf_value
 
class rightjoin(MRJob):
    def mapper(self, _, line):
        x = line.split("|")
        if len(x) == 4:
            yield x[0], ("lefttable", x[1], x[2], x[3])
        else:
            yield x[0], ("righttable", x[1])

    def reducer(self, key, values):
        customers = list()
        orders = list()
        for val in values:
            if val[0]== u'lefttable':
                customers.append(val)
            else:
                orders.append(val)
        for o in orders:
            if len(customers)==0:
                yield None, [key] + [None, None, None] + o[1:]
            for c in customers:
                yield None, [key] + c[1:] + o[1:]

if __name__ == '__main__':
    rightjoin.run()

Writing reducersiderightjoin.py


In [10]:
from reducersiderightjoin import rightjoin
mr_job = rightjoin(args=['customers.dat','orders.dat'])
with mr_job.make_runner() as runner: 
    runner.run()
    count = 0
    # stream_output: get access of the output 
    for line in runner.stream_output():
        key,value =  mr_job.parse_output_line(line)
        print value
        count = count + 1
print "\n"
print "There are %s records" %count



['1', 'Alice Bob', '31', 'CA', 'Apple']
['1', 'Alice Bob', '31', 'CA', 'Iphone']
['2', 'Sam Sneed', '51', 'NV', 'Milk']
['3', 'Jon Sneed', '37', 'CA', 'Garlic']
['4', 'Arnold Wesise', '17', 'NY', 'Ipad']
['5', 'Henry Bob', '25', 'NV', 'Book']
['5', 'Henry Bob', '25', 'NV', 'shoes']
['7', 'Jon York', '41', 'WA', 'Potato']
['8', 'Alex Ball', '26', 'WA', 'Tomato']
['9', 'Jim Davis', '19', 'CA', 'Orange']


There are 10 records


In [11]:
%%writefile convert.py
#!/usr/bin/python
## log_preprocess_42.py
## Author: Angela Gunn & Jing Xu
## Description: Proprocesses log data on a single node

import sys
import os

if len(sys.argv) < 2:
    print "No input file is passed, Aborting!!!"
    sys.exit(1)

input_file = sys.argv[1]
output_file = input_file + '.pp'

try:
    os.remove(output_file)
except OSError:
    pass

last_visitor = None #set last visitor value to append to output file
with open(input_file, 'r') as f1: #open input file to read
    with open(output_file, 'a') as f2: #open ouput file to write
        for line in f1:
            line = line.strip()
            tokens = line.split(",")
            if len(tokens) == 3 and tokens[0] == 'C': #check for Visitor ID
                last_visitor = tokens[2]  #set last visitor to new Visitor ID
            if len(tokens) == 3 and tokens[0] == 'V': #check for Page ID
                out_line = 'V,{0},C,{1}\n'.format(tokens[1],last_visitor)
                f2.write(out_line)

Writing convert.py


In [12]:
!chmod a+x convert.py

In [13]:
!python convert.py anonymous-msweb.data

In [None]:
%%writefile top_visitor_44.py
## top_visitor_44.py
## Author: Angela Gunn & Jing Xu
## Description: Find most frequent visitor for each page from the log

from mrjob.job import MRJob
from mrjob.step import MRStep
import csv

def csv_readline(line):
    """Given a sting CSV line, return a list of strings."""
    for row in csv.reader([line]):
        return row

class TopVisitor(MRJob):
    
    top_page_visitor = {}
    
    def steps(self):
        return [MRStep(mapper = self.mapper,
                    combiner = self.combiner,
                    reducer = self.reducer),
                MRStep(reducer = self.reducer_frequent_visitor)]

    def mapper(self, line_no, line):
        #Extracts the Vroot that was visited
        line = line.strip(' ')
        cell = csv_readline(line)
        yield (cell[1],cell[3]),1 

    def combiner(self, key, visit_counts):
        #combines the visits
        total = sum(visit_counts)
        yield key, total
        
    def reducer(self, key, visit_counts): #Sumarizes the visit counts by adding them together.
        #combines the visits, and adds the key to top_page_visitor dictionary if qualified
        total = sum(visit_counts)
        page = key[0]
        visitor = key[1][1:]
        top_count = int(self.top_page_visitor.get(page,(visitor,0))[1]) #assign top_count value
        if top_count < total:
            self.top_page_visitor[page] = (visitor,total)        
        yield page, total    
    #end def reducer        
    
    def reducer_frequent_visitor(self, page, visit_counts):
        with open('url.txt','r') as f:
            for line in f:
                cell = csv_readline(line)
                if cell[1] == page:
                    key = "{0:>20}|{1:>5}|{2:>5}".format(cell[4],page,self.top_page_visitor[page][0]) #yield top page visitor
                    break
        yield key, self.top_page_visitor[page][1]

if __name__ == '__main__':
    TopVisitor.run()

In [None]:
!chmod a+x top_visitors_44.py

In [3]:
%load_ext autoreload
%autoreload 2

In [20]:
!cat bigram_test_Jaccard_54.out | sort -k3nr  > file_test.out
!head file_test.out
!head -1000 file_test.out > top1k_test.out
!head top1k_test.out
!wc -l top1k_test.out

["activities", "descriptions"]	1.0
["activities", "events"]	1.0
["activities", "facts"]	1.0
["activities", "increased"]	1.0
["activities", "methods"]	1.0
["activities", "principles"]	1.0
["agreement", "developed"]	1.0
["agreement", "relations"]	1.0
["alone", "cost"]	1.0
["alteration", "clue"]	1.0
["activities", "descriptions"]	1.0
["activities", "events"]	1.0
["activities", "facts"]	1.0
["activities", "increased"]	1.0
["activities", "methods"]	1.0
["activities", "principles"]	1.0
["agreement", "developed"]	1.0
["agreement", "relations"]	1.0
["alone", "cost"]	1.0
["alteration", "clue"]	1.0
    1000 top1k_test.out


In [18]:
import nltk
from nltk.corpus import wordnet as wn
import sys
import ast
#print all the synset element of an element
def synonyms(string):
    syndict = {}
    for i,j in enumerate(wn.synsets(string)):
        syns = j.lemma_names()
        for syn in syns:
            syndict.setdefault(syn,1)
    return syndict.keys()

print synonyms('ask')

[u'call_for', u'inquire', u'require', u'necessitate', u'involve', u'need', u'enquire', u'expect', u'demand', u'ask', u'postulate', u'take']


In [21]:
import nltk
from nltk.corpus import wordnet as wn
import sys
import ast
#print all the synset element of an element
def synonyms(string):
    syndict = {}
    for i,j in enumerate(wn.synsets(string)):
        syns = j.lemma_names()
        for syn in syns:
            syndict.setdefault(syn,1)
    return syndict.keys()

total_count = 0
correct_count = 0
cnt_fn = 0

# Load synomyn file
dict_syn = {}
line_cnt = 0
with open('top1k_test.out', 'r') as f:
    for line in f:
        line_cnt += 1
        if line_cnt%100 == 0: print line_cnt
        t = line.strip().split('\t')
        w1 = t[0].lower()
        w2 = t[1].lower()
        if w1 in dict_syn.keys():
            dict_syn[w1].append(w2)
        else:
            dict_syn[w1] = [w2]
        if w2 in dict_syn.keys():
            dict_syn[w2].append(w1)
        else:
            dict_syn[w2] = [w1]

print "Length of Synonym Dict: {0}".format(len(dict_syn))
            
# Check if any of the top 1000 matches the synonym list
with open('top1k_test.out', 'r') as f:
    for line in f:
        cnt_t += 1
        t = line.strip().split('\t')
        t[1] = t[1].replace("\\","")[1:-1]
        pair = ast.literal_eval(t[1])
        syn0 = synonyms(pair[0].lower())
        syn1 = synonyms(pair[1].lower())

        # Precision
        if pair[1].lower() in syn0 or pair[0].lower() in syn1:
            print "MATCH: {0}".format(pair)
            cnt_m += 1
        
        # Recall
        if pair[0].lower() in dict_syn.keys() or pair[1].lower() in dict_syn.keys():
            cnt_fn += 1
        
            
print "\nTotal Count: {0}, TP: {1}, FP: {2}, FN: {3}".format(cnt_t, cnt_m, cnt_t - cnt_m, cnt_fn)
"""
p = round(float(cnt_m) / cnt_t, 3)
r = round(float(cnt_m) / (cnt_m + cnt_fn), 3)
f1 = round(2 * p * r / (p + r), 3)

print "\n### PRECISION: {0}".format(p)
print "\n### RECALL: {0}".format(r)
print "\n### F1 Score: {0}".format(f1)"""

100
200
300
400
500
600
700
800
900
1000
Length of Synonym Dict: 1007


SyntaxError: unexpected EOF while parsing (<unknown>, line 1)

In [40]:
import nltk
from nltk.corpus import wordnet as wn
import sys
import ast
#print all the synset element of an element
def synonyms(string):
    syndict = {}
    for i,j in enumerate(wn.synsets(string)):
        syns = j.lemma_names()
        for syn in syns:
            syndict.setdefault(syn,1)
    return syndict.keys()

cnt_t = 0
cnt_m = 0
cnt_fn = 0

# Load synomyn file
dict_syn = {}
line_cnt = 0
with open('top1k.out', 'r') as f:
    for line in f:
        line_cnt += 1
        if line_cnt%100 == 0: print line_cnt
        t = line.strip().split('\t')
        w1 = t[0].lower()
        w2 = t[1].lower()
        if w1 in dict_syn.keys():
            dict_syn[w1].append(w2)
        else:
            dict_syn[w1] = [w2]
        if w2 in dict_syn.keys():
            dict_syn[w2].append(w1)
        else:
            dict_syn[w2] = [w1]

print "Length of Synonym Dict: {0}".format(len(dict_syn))

import nltk
from nltk.corpus import wordnet as wn
import sys
import ast

# Check if any of the top 1000 matches the synonym list
with open('top1k.out', 'r') as f:
    for line in f:
        cnt_t += 1
        t = line.strip().split('\t')
        pair = ast.literal_eval(t[0])
        syn0 = synonyms(pair[0].lower().strip(' '))
        syn1 = synonyms(pair[1].lower().strip(' '))
        # Precision
        if pair[1].lower() in syn0 or pair[0].lower() in syn1:
            print "MATCH: {0}".format(pair)
            cnt_m += 1
        
        # Recall
        if pair[0].lower() in dict_syn.keys() or pair[1].lower() in dict_syn.keys():
            cnt_fn += 1
        
            
print "\nTotal Count: {0}, TP: {1}, FP: {2}, FN: {3}".format(cnt_t, cnt_m, cnt_t - cnt_m, cnt_fn)

p = round(float(cnt_m) / cnt_t, 3)
r = round(float(cnt_m) / (cnt_m + cnt_fn), 3)
f1 = round(2 * p * r / (p + r), 3)

print "\n### PRECISION: {0}".format(p)
print "\n### RECALL: {0}".format(r)
print "\n### F1 Score: {0}".format(f1)

100
200
300
400
500
600
700
800
900
1000
Length of Synonym Dict: 1007
MATCH: ['principles', 'rule']
MATCH: ['added', 'supply']

Total Count: 1000, TP: 2, FP: 998, FN: 0

### PRECISION: 0.002

### RECALL: 1.0

### F1 Score: 0.004
