
## Characteristics of a Big Data problem:
### 1. Volume - too much data
### 2. Velocity - rapidly generating data
### 3. Veracity - filtering a mass of data

# Testing the Applicability of the MapReduce paradigm
## Parallelism followed by recursion

# MapReduce with tongue twisters

## 1. Sum of a sequence is the sum of the sums of its parts
## 2. Product of a sequence is the product of the products of its parts

### Replace sum / product with any *associative* operation (it must also be *commutative* if the parts may be combined in any order)

# Summing a large set of numbers

In [3]:
import numpy as np
from functools import reduce
import re

In [2]:
# 1000 random integers between 1 and 10 inclusive (randint's upper bound, 11, is exclusive).
x = np.random.randint(1, 11, size=(1000,))

In [None]:
print(x)

In [5]:
# Cut the 1000 numbers into 10 equal sub-arrays of 100 elements each.
# np.split with an integer requires the length to divide evenly.
parts = np.split(x, 10)

In [13]:
# Map step: sum each sub-array independently.
# In Python 3, map() is lazy and can be consumed only once.
mapped = map(np.sum, parts)

In [14]:
# Reduce step: fold the per-part sums into a single grand total.
reduce(lambda running_total, part_sum: running_total + part_sum, mapped)

5533

In [15]:
x.sum()

5533

# Word Count Example

In [4]:
import pandas as pd

In [5]:
# Load the SMS dataset; the file is decoded as Latin-1 instead of pandas' default UTF-8.
df = pd.read_csv('../spam.csv', encoding='latin-1')

In [19]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Preprocessing

In [6]:
# pop() removes the 'v2' column (the raw message text) from df and returns it as a Series.
text = df.pop('v2')

In [22]:
text.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

In [7]:
def remove_punctuation_lowercase(x):
    """Normalize a message to lowercase words separated by single spaces.

    Every maximal run of ``\\w`` characters (letters, digits, underscore) in
    ``x`` is kept as one token; everything between the runs -- punctuation
    and whitespace -- is discarded. Each token is then lowercased and the
    tokens are joined with a single space.

    Args:
        x: The raw message string.

    Returns:
        The cleaned string; an empty string if ``x`` has no word characters.
    """
    tokens = re.findall(r'\w+', x, re.IGNORECASE)
    return ' '.join(map(str.lower, tokens))

In [8]:
clean_text = text.apply(remove_punctuation_lowercase)

In [25]:
clean_text.head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
Name: v2, dtype: object

# Counting the words in each SMS is the map operation
# Merging those per-SMS counts across the complete dataset is the reduce operation

In [9]:
from collections import Counter

In [10]:
df['text'] = clean_text

In [28]:
df.head()

Unnamed: 0,v1,Unnamed: 2,Unnamed: 3,Unnamed: 4,text
0,ham,,,,go until jurong point crazy available only in ...
1,ham,,,,ok lar joking wif u oni
2,spam,,,,free entry in 2 a wkly comp to win fa cup fina...
3,ham,,,,u dun say so early hor u c already then say
4,ham,,,,nah i don t think he goes to usf he lives arou...


In [11]:
# Drop every column whose name starts with 'Unnamed'.
# The names are collected first and removed in one in-place call, so df is
# never modified while it is being iterated over.
unnamed_cols = [c for c in df.columns if c.startswith('Unnamed')]
df.drop(columns=unnamed_cols, inplace=True)

In [30]:
df.head()

Unnamed: 0,v1,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives arou...


In [12]:
# Tokenize each cleaned message on whitespace, keeping ham and spam apart.
# Each result is a list of word lists, one inner list per SMS.
hamData = [sms.split() for sms in df.loc[df['v1'] == 'ham', 'text']]
spamData = [sms.split() for sms in df.loc[df['v1'] == 'spam', 'text']]

In [13]:
# Map: turn every tokenized ham SMS into its own word-count Counter.
hamWordCountMap = map(Counter, hamData)
# Reduce: add the per-SMS Counters into one corpus-wide Counter.
# The empty Counter() initializer makes an empty hamData yield an empty
# Counter instead of raising TypeError.
hamWordCountReduce = reduce(
    lambda total, sms_counts: total + sms_counts,
    hamWordCountMap,
    Counter(),
)

In [14]:
hamWordCountReduce.most_common(10)

[('i', 2940),
 ('you', 1943),
 ('to', 1554),
 ('the', 1122),
 ('a', 1056),
 ('u', 1018),
 ('and', 857),
 ('in', 818),
 ('me', 772),
 ('my', 750)]

In [15]:
# Map: turn every tokenized spam SMS into its own word-count Counter.
spamWordCountMap = map(Counter, spamData)
# Reduce: add the per-SMS Counters into one corpus-wide Counter.
# The empty Counter() initializer makes an empty spamData yield an empty
# Counter instead of raising TypeError.
spamWordCountReduce = reduce(
    lambda total, sms_counts: total + sms_counts,
    spamWordCountMap,
    Counter(),
)

In [16]:
spamWordCountReduce.most_common(10)

[('to', 688),
 ('a', 377),
 ('call', 355),
 ('å', 299),
 ('you', 297),
 ('your', 264),
 ('free', 224),
 ('2', 206),
 ('the', 206),
 ('for', 203)]

# Notice anything?

# Remove the stopwords!

In [17]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [18]:
def stopword_remover(x):
    """Return the words of ``x`` that are not English stop words.

    Args:
        x: Iterable of word strings (one tokenized message).

    Returns:
        A new list holding, in their original order, the words that are not
        members of ``ENGLISH_STOP_WORDS``.
    """
    return [token for token in x if token not in ENGLISH_STOP_WORDS]

In [19]:
# Strip the stop words from every tokenized SMS, ham and spam separately.
hamNonStopWords = list(map(stopword_remover, hamData))
spamNonStopWords = list(map(stopword_remover, spamData))

In [20]:
# Map: one word-count Counter per stop-word-free ham SMS.
hamWordCountMap = map(Counter, hamNonStopWords)
# Reduce: add the per-SMS Counters together. The Counter() initializer keeps
# the reduction from raising TypeError when there are no ham messages.
hamWordCountReduce = reduce(
    lambda total, sms_counts: total + sms_counts,
    hamWordCountMap,
    Counter(),
)
hamWordCountReduce.most_common(10)

[('u', 1018),
 ('s', 478),
 ('m', 405),
 ('t', 375),
 ('2', 322),
 ('gt', 318),
 ('lt', 316),
 ('just', 293),
 ('ok', 287),
 ('ll', 265)]

In [21]:
# Map: one word-count Counter per stop-word-free spam SMS.
spamWordCountMap = map(Counter, spamNonStopWords)
# Reduce: add the per-SMS Counters together. The Counter() initializer keeps
# the reduction from raising TypeError when there are no spam messages.
spamWordCountReduce = reduce(
    lambda total, sms_counts: total + sms_counts,
    spamWordCountMap,
    Counter(),
)
spamWordCountReduce.most_common(10)

[('å', 299),
 ('free', 224),
 ('2', 206),
 ('u', 174),
 ('txt', 163),
 ('ur', 144),
 ('4', 137),
 ('mobile', 127),
 ('text', 125),
 ('stop', 121)]

# Exercise: Eliminate all words shorter than 3 characters and re-compute the most common words in both categories

In [1]:
def remove_small_words(x, n_chars=3):
    """Return the words of ``x`` that are at least ``n_chars`` characters long.

    Args:
        x: Iterable of word strings (one tokenized message).
        n_chars: Minimum length a word must have to be kept; words with
            fewer characters are eliminated.

    Returns:
        A new list with the kept words in their original order.
    """
    # ">=" rather than ">": only words *shorter than* n_chars are removed, so
    # a word of exactly n_chars characters is kept.
    return [word for word in x if len(word) >= n_chars]

In [2]:
# enter code here