# A MapReduce framework for Python
The following code is a simple MapReduce framework that runs entirely in Python. This framework does not use any parallel nodes, so it is only for practice with MapReduce with small data.

In [1]:
import itertools
def map_reduce(i,mapper,reducer):
  intermediate = []
  for (key,value) in i.items():
    intermediate.extend(mapper(key,value))
  groups = {}
  for key, group in itertools.groupby(sorted(intermediate), 
                                      lambda x: x[0]):
    groups[key] = list([y for x, y in group])
  return [reducer(intermediate_key,groups[intermediate_key])
          for intermediate_key in groups]

## Example: Count word frequencies
For example, the following code would use MapReduce to count word frequencies in a list of tweets.

In [2]:
def mapper(key, value):
    return [(w, 1) for w in value.split()]

In [3]:
def reducer(key, values):
    return key, sum(values)

In [4]:
import zipfile
from pathlib import Path
if not Path('../assignments/10000 tweets-NEW.json').exists():
    print("Unzipping tweets")
    with zipfile.ZipFile('../assignments/cleaned-tweets.zip') as myzip:
        myzip.extractall()

In [5]:
import json
def get_tweets(ntweets):
    with open('../assignments/10000 tweets-NEW.json', encoding='iso8859-1') as jfile:
        result = dict()
        i = 0
        for line in jfile:
            if i >= ntweets:
                break
            try:
                next_tweet = json.loads(line)
                tweet_id = next_tweet['id']
                tweet_body = next_tweet['body']
            except:
                continue
            result[tweet_id] = tweet_body
            i += 1
    return result

In [6]:
tweets_5 = get_tweets(5)

In [7]:
len(tweets_5)

5

In [8]:
map_reduce(tweets_5, mapper, reducer)

[('"We', 1),
 ('#Convergence2016', 1),
 ('#RedCentreNT!', 1),
 ('#vicweather', 1),
 ('&amp;', 2),
 ('0', 1),
 ('0.0', 1),
 ('1014.3', 1),
 ('11:00', 1),
 ('19.8Â°C,', 1),
 ('3pm', 1),
 ('4pm...', 1),
 ('67pct,', 1),
 ('@', 1),
 ('AM', 1),
 ('AU', 1),
 ('Australia', 1),
 ('BBQ,', 1),
 ('Beverages', 1),
 ('Bikini', 1),
 ('CONGRATULATIONS', 1),
 ('Central', 1),
 ('FANTASTIC', 1),
 ('Free', 1),
 ("Friday's", 1),
 ('Girls', 1),
 ('Lunch', 1),
 ('Meat', 1),
 ('NNW', 1),
 ('RH', 1),
 ('Raffle', 1),
 ('Raffles', 1),
 ('Rain', 1),
 ('Register', 1),
 ('Seafood', 1),
 ('So', 1),
 ('Steady.', 1),
 ('Suzie', 1),
 ('Temp', 1),
 ('The...', 1),
 ('Time', 1),
 ('Today', 1),
 ('VIC,', 1),
 ('Visit', 1),
 ('Walker', 1),
 ('Wantirna,', 1),
 ('Winds', 1),
 ('You...', 1),
 ('about', 1),
 ('and', 3),
 ('around', 1),
 ('beaches', 1),
 ('beautiful', 1),
 ('both', 1),
 ('change', 1),
 ('commitment', 1),
 ('digital', 1),
 ('do', 1),
 ('for', 1),
 ('from', 2),
 ('hard', 1),
 ('have', 1),
 ('hear@ChelleMelbourne',

## Exercise 1: Find the number of tweets per twitter
Write code that finds the number of tweets per twitter.

In [9]:
def mapper(key, value):
    # Write your code here
    return []

In [10]:
def reducer(key, values):
    # write your code here
    return None

In [11]:
import json
def get_tweets(ntweets):
    with open('../assignments/10000 tweets-NEW.json', encoding='iso8859-1') as jfile:
        result = dict()
        i = 0
        for line in jfile:
            if i >= ntweets:
                break
            try:
                next_tweet = json.loads(line)
                tweet_twitter = next_tweet['actor']['id']
                tweet_body = next_tweet['body']
            except:
                continue
            result[tweet_twitter] = tweet_body
            i += 1
    return result

In [12]:
tweets_500 = get_tweets(500)

In [13]:
map_reduce(tweets_500, mapper, reducer)

[('id:twitter.com:1003723232', 1),
 ('id:twitter.com:1040901512', 1),
 ('id:twitter.com:1047530119', 1),
 ('id:twitter.com:1057079388', 1),
 ('id:twitter.com:10576322', 1),
 ('id:twitter.com:1075639382', 1),
 ('id:twitter.com:107774296', 1),
 ('id:twitter.com:1081805533', 1),
 ('id:twitter.com:108577207', 1),
 ('id:twitter.com:1094259638', 1),
 ('id:twitter.com:1097917428', 1),
 ('id:twitter.com:109983796', 1),
 ('id:twitter.com:1105928430', 1),
 ('id:twitter.com:1117988982', 1),
 ('id:twitter.com:112008354', 1),
 ('id:twitter.com:1120104104', 1),
 ('id:twitter.com:1170173209', 1),
 ('id:twitter.com:117344322', 1),
 ('id:twitter.com:117602839', 1),
 ('id:twitter.com:1178994756', 1),
 ('id:twitter.com:1210672568', 1),
 ('id:twitter.com:12541222', 1),
 ('id:twitter.com:1268982752', 1),
 ('id:twitter.com:1271991079', 1),
 ('id:twitter.com:128767993', 1),
 ('id:twitter.com:132387877', 1),
 ('id:twitter.com:133222533', 1),
 ('id:twitter.com:1341343152', 1),
 ('id:twitter.com:1356109537', 1)

## Exercise 2: Compute tf.idf
Write code that computes tf.idf
$$tf.idf(w,d) = tf(w,d) \times idf(w)$$ where
$$idf(w) = \log\frac{N}{|\{d \in D : w \in d\}|}$$

To compute $tf(w,d)$:

In [14]:
def mapper_tf(docname, contents):
    # Write your code here
    return []

def reducer_tf(key, values):
    # Write your code here
    return None

In [15]:
import json
def get_tweets(ntweets):
    with open('../assignments/10000 tweets-NEW.json', encoding='iso8859-1') as jfile:
        result = dict()
        i = 0
        for line in jfile:
            if i >= ntweets:
                break
            try:
                next_tweet = json.loads(line)
                tweet_id = next_tweet['id']
                tweet_body = next_tweet['body']
            except:
                continue
            result[tweet_id] = tweet_body
            i += 1
    return result

In [16]:
tweets_500 = get_tweets(500)

In [17]:
tf = map_reduce(tweets_500, mapper_tf, reducer_tf)

In [18]:
tf[:10]

[(('!', 'tag:search.twitter.com,2005:715690356755435520'), 1),
 (('"April', 'tag:search.twitter.com,2005:715692082564714496'), 1),
 (('"Basic', 'tag:search.twitter.com,2005:715691268689371136'), 1),
 (('"Blade', 'tag:search.twitter.com,2005:715692660279738369'), 1),
 (('"Clean', 'tag:search.twitter.com,2005:715691356673298432'), 1),
 (('"Demons"', 'tag:search.twitter.com,2005:715691780901986305'), 1),
 (('"Eat', 'tag:search.twitter.com,2005:715691248498020352'), 1),
 (('"I', 'tag:search.twitter.com,2005:715692267923570688'), 1),
 (('"I', 'tag:search.twitter.com,2005:715692385263550464'), 1),
 (('"I\'ll', 'tag:search.twitter.com,2005:715692479706710016'), 1)]

To compute $idf(w)$:

In [19]:
from math import log
def mapper_idf(docname, contents):
    # Write your code here
    return []

def reducer_idf(word, values, N):
    # Write your code here
    return None

In [20]:
N = len(tweets_500)
idf = map_reduce(tweets_500, mapper_idf, 
                 lambda x, y: reducer_idf(x, y, N))

In [21]:
idf[:10]

[('!', 6.214608098422191),
 ('"April', 6.214608098422191),
 ('"Basic', 6.214608098422191),
 ('"Blade', 6.214608098422191),
 ('"Clean', 6.214608098422191),
 ('"Demons"', 6.214608098422191),
 ('"Eat', 6.214608098422191),
 ('"I', 5.521460917862246),
 ('"I\'ll', 6.214608098422191),
 ('"It\'s', 6.214608098422191)]

## Example: Minhashing
The following exercise is a little more involved and uses Mapreduce to perform minhashing. Remember the minhashing algorithm:

1. Initialise $SIG(i,c) \leftarrow \infty, i=1 \ldots n$ for all c.
2. Scan row $r=1 \ldots M$ of the characteristic matrix. This row represents a (possibly hashed) k-shingle.
3. Compute the hash values $h_1(r), h_2(r), \ldots h_n(r)$.
4. For each column $c$ (representing a document or set of k-shingles) do the following: 
    1. If $c$ has 0 in row $r$, the set does not contain the k-shingle; do nothing.
    2. If $c$ has 1 in row r, $SIG(i,c) \leftarrow \min(SIG(i,c), h_i(r))$, for every hash $i$.
    
Below is the code that we used in the lecture notebook of week 7.

In [23]:
import nltk

def k_hash_shingle(text, k, hash_function, target_range):
    return set(hash_function(item, target_range) 
               for item in nltk.ngrams(text, k))

def permute_hash(item, hash_index, target_range):
    return (hash(item) * (hash_index+1) + 1) % target_range

import numpy as np
def minhash(kshingles,
            permute_hash_function, signature_rows,
            target_range,
            verbose=False):
    sig = np.ones((signature_rows, len(kshingles))) * np.inf
    for r in range(target_range):
        for c, ks in enumerate(kshingles):
            if r not in ks:
                continue
            for i in range(signature_rows):
                sig[i, c] = min(sig[i,c], 
                                permute_hash_function(r, i, target_range))
        if verbose:
            print("After scanning row %i" % r)
            print("Signature matrix:")
            print(sig)
    return sig

And below we use the code to build the minhash matrix for the first 500 tweets.

In [24]:
import json
def get_tweets(ntweets):
    with open('../assignments/10000 tweets-NEW.json', encoding='iso8859-1') as jfile:
        result = dict()
        i = 0
        for line in jfile:
            if i >= ntweets:
                break
            try:
                next_tweet = json.loads(line)
                tweet_id = next_tweet['id']
                tweet_body = next_tweet['body']
            except:
                continue
            result[tweet_id] = tweet_body
            i += 1
    return result

In [25]:
tweets_500 = get_tweets(500)

In [26]:
def my_hash(item, target_range):
    return hash(item) % target_range

In [27]:
k = 9
target_range = 1000000
kshingles = [k_hash_shingle(tweets_500[t_id], k, my_hash, target_range) 
             for t_id in tweets_500]

In [28]:
sig = minhash(kshingles, permute_hash, 100, target_range)

But let's try the code using the sample from the notebook of week 7. In this sample, we only use two hashes and four sets of hashed kshingles.

In [29]:
def simple_permute_hash(x, index, target_range):
    if index == 0:
        return (x + 1) % target_range
    else:
        return (3*x + 1) % target_range
    
kshingles = [{0, 3}, {2}, {1, 3, 4}, {0, 2, 3}]
minhash(kshingles, simple_permute_hash, 2, 5)

array([[ 1.,  3.,  0.,  1.],
       [ 0.,  2.,  0.,  0.]])

### Exercise 3: Map by individual sets of k-shingles.
Let's implement a MapReduce where the map function processes one set of k-shingles.

In [36]:
def mapper(key, ks, permute_hash_function, nhashes, target_range):
    # Write your code here
    return []

def reducer(key, values):
    # Write your code here
    return None

In [38]:
mr_data = dict()
for i, ks in enumerate(kshingles):
    mr_data[i] = ks
result = map_reduce(mr_data, 
                    lambda key, value: mapper(key, value, simple_permute_hash, 2, 5),
                    reducer)
for key, value in result:
    print("Set %i" % key)
    print(value)

Set 0
[[ 1.]
 [ 0.]]
Set 1
[[ 3.]
 [ 2.]]
Set 2
[[ 0.]
 [ 0.]]
Set 3
[[ 1.]
 [ 0.]]


### Exercise 4: Using a combiner
The following variant uses a **combiner** so that the mapper will process several sets.

In [39]:
def mapper(key, kshingles, permute_hash_function, nhashes, target_range):
    # Write your code here
    return []

def reducer(key, values):
    # Write your code here
    return None

In [41]:
mr_data = dict()
mr_data[0] = kshingles[0:2]
mr_data[1] = kshingles[2:4]
result = map_reduce(mr_data, 
                    lambda key, value: mapper(key, value, simple_permute_hash, 2, 5),
                    reducer)
for key, value in result:
    print("Group set %i" % key)
    print(value)

Group set 0
[array([[ 1.,  3.],
       [ 0.,  2.]])]
Group set 1
[array([[ 0.,  1.],
       [ 0.,  0.]])]


## Example: A-priori
Let's implement the two passes of the A-priori algorithm using MapReduce. Recall the two passes of the A-Priori algorithm:

1. Pass 1: count single items
    1.1. For each basket:
        1.1.1. For each item in basket:
            1.1.1.1. Increment item's count. 
            
2. Pass 2: count item pairs
    2.1. For each basket:
        2.1.1 For each pair (i,j) of items in the basket:
            2.1.1.1 If both i and j are frequent:
                2.1.1.1.1 Increment count for (i,j)

### Exercise 5: Pass 1
This is very much the same as word counting at the beginning of this notebook but now the reducer will return a value only if the sum is larger than the support.

In [47]:
def mapper(key, basket):
    # Write your code here
    return []

def reducer(key, counts, support):
    # Write your code here
    return None

baskets = {1: ('red', 'white', 'green'),
           2: ('white', 'orange'),
           3: ('white', 'blue'),
           4: ('red', 'white', 'orange'),
           5: ('red', 'blue'),
           6: ('white', 'blue'),
           7: ('white', 'orange'),
           8: ('red', 'white', 'blue', 'green'),
           9: ('red', 'white', 'blue'),
           10: ('yellow', )}

pass1 = map_reduce(baskets, mapper, lambda k, v: reducer(k, v, 3))
pass1 = dict(pass1)
pass1

{'blue': True,
 'green': False,
 'orange': True,
 'red': True,
 'white': True,
 'yellow': False}

### Exercise 6: Pass 2
Very similar to pass 1, but the mapper will only count the pair if each item is frequent.

In [49]:
import itertools

def mapper(key, basket):
    # Write your code here
    return []

def reducer(key, counts, support):
    # Write your code here
    return None

pass2 = map_reduce(baskets, mapper, lambda k, v: reducer(k, v, 3))
pass2 = dict(pass2)
pass2

{('red', 'blue'): True,
 ('red', 'orange'): False,
 ('red', 'white'): True,
 ('white', 'blue'): True,
 ('white', 'orange'): True}