
## Characteristics of a Big Data problem:
### 1. Volume - too much data
### 2. Velocity - rapidly generating data
### 3. Veracity - filtering reliable information out of a mass of noisy data

# Testing the Applicability of the MapReduce paradigm
## Parallelism (map) followed by recursive combination of the partial results (reduce)

# MapReduce with tongue twisters

## 1. Sum of a sequence is the sum of the sums of its parts
## 2. Product of a sequence is the product of the products of its parts

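### Replace sum / product with any *cumulative* operation, i.e. one that is associative, so the result does not depend on how the sequence is split into parts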

# Summing a large set of numbers

In [None]:
import numpy as np
from functools import reduce
import re

In [None]:
# Draw 1000 random integers from 1 to 10 inclusive (the upper bound, 11, is exclusive).
x = np.random.randint(low=1, high=11, size=1000)

In [None]:
# Display the sampled values.
print(x)

In [None]:
# Cut the array into 10 equally sized chunks. np.split raises ValueError
# when the array length is not evenly divisible by the number of sections.
parts = np.split(x, 10)

In [None]:
# Map step: compute the sum of every chunk independently.
# The partial sums are materialised into a list because a bare `map` object is
# a single-use iterator: once the reduce step has consumed it, running that
# step again would fail on an empty sequence.
mapped = list(map(np.sum, parts))

In [None]:
# Reduce step: fold the per-chunk sums into a single grand total.
reduce(lambda total, chunk_sum: total + chunk_sum, mapped)

In [None]:
# Cross-check: summing the whole array directly should give the same total
# as the map/reduce computation.
x.sum()

# Word Count Example

In [None]:
import pandas as pd

In [None]:
# Load the SMS spam dataset. The file is decoded as latin-1
# (presumably because it is not valid UTF-8 — confirm).
df = pd.read_csv('../files/spam.csv', encoding='latin-1')

In [None]:
# Preview the first rows of the raw table.
df.head()

## Preprocessing

In [None]:
# Take the 'v2' column out of df (pop removes it from the frame and returns
# it); it is used as the raw message text from here on.
text = df.pop('v2')

In [None]:
# Preview the first raw messages.
text.head()

In [None]:
def remove_punctuation_lowercase(x):
    """Return *x* lower-cased with everything but word characters removed.

    The string is broken into runs of word characters (letters, digits and
    underscores, as matched by ``\\w``); each run is lower-cased and the runs
    are glued back together with single spaces.

    Args:
        x: The text to normalise.

    Returns:
        The normalised string; empty when *x* contains no word characters.
    """
    tokens = re.findall(r'\w+', x, re.IGNORECASE)
    lowered = map(str.lower, tokens)
    return ' '.join(lowered)

In [None]:
# Normalise every message: strip the punctuation and lower-case the words.
clean_text = text.apply(remove_punctuation_lowercase)

In [None]:
# Preview the first cleaned messages.
clean_text.head()

# Counting the words of each SMS is the map operation
# Combining those counts across the complete dataset is the reduce operation

In [None]:
from collections import Counter

In [None]:
# Attach the cleaned messages to the frame as a new 'text' column.
df['text'] = clean_text

In [None]:
# Preview the frame with the new 'text' column.
df.head()

In [None]:
# Drop every column whose name starts with 'Unnamed' (presumably the empty
# trailing columns that pandas auto-names when reading the CSV — confirm).
# The names are collected first and removed in a single in-place call, so the
# frame is never modified while its own columns are being iterated.
df.drop(columns=[c for c in df.columns if c.startswith('Unnamed')], inplace=True)

In [None]:
# Preview the frame after the 'Unnamed' columns were removed.
df.head()

In [None]:
# Tokenise the cleaned text on whitespace, separately for each label, giving
# one list of words per message.
hamData = df.loc[df['v1'] == 'ham', 'text'].apply(str.split).tolist()
spamData = df.loc[df['v1'] == 'spam', 'text'].apply(str.split).tolist()

In [None]:
# Map: turn every ham message into its own word -> count table.
hamWordCountMap = map(Counter, hamData)
# Reduce: add the per-message tables together. The empty Counter() seed makes
# the fold return an empty table, instead of raising TypeError, when there are
# no ham messages at all.
hamWordCountReduce = reduce(lambda total, counts: total + counts, hamWordCountMap, Counter())

In [None]:
# Ten most frequent words across the ham messages.
hamWordCountReduce.most_common(10)

In [None]:
# Map: turn every spam message into its own word -> count table.
spamWordCountMap = map(Counter, spamData)
# Reduce: add the per-message tables together. The empty Counter() seed makes
# the fold return an empty table, instead of raising TypeError, when there are
# no spam messages at all.
spamWordCountReduce = reduce(lambda total, counts: total + counts, spamWordCountMap, Counter())

In [None]:
# Ten most frequent words across the spam messages.
spamWordCountReduce.most_common(10)

# Notice anything?

# Remove the stopwords!

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [None]:
def stopword_remover(x):
    """Filter the English stop words out of a sequence of words.

    Args:
        x: Iterable of words (e.g. one tokenised SMS).

    Returns:
        A new list holding the words of *x* that are not members of
        ``ENGLISH_STOP_WORDS``, in their original order.
    """
    return [token for token in x if token not in ENGLISH_STOP_WORDS]

In [None]:
# Strip the stop words from every tokenised message in each category.
hamNonStopWords = list(map(stopword_remover, hamData))
spamNonStopWords = list(map(stopword_remover, spamData))

In [None]:
# Map: one word -> count table per ham message, stop words already removed.
hamWordCountMap = map(Counter, hamNonStopWords)
# Reduce: add the per-message tables together. The empty Counter() seed makes
# the fold return an empty table, instead of raising TypeError, when there are
# no ham messages at all.
hamWordCountReduce = reduce(lambda total, counts: total + counts, hamWordCountMap, Counter())
# Ten most frequent non-stop words across the ham messages.
hamWordCountReduce.most_common(10)

In [None]:
# Map: one word -> count table per spam message, stop words already removed.
spamWordCountMap = map(Counter, spamNonStopWords)
# Reduce: add the per-message tables together. The empty Counter() seed makes
# the fold return an empty table, instead of raising TypeError, when there are
# no spam messages at all.
spamWordCountReduce = reduce(lambda total, counts: total + counts, spamWordCountMap, Counter())
# Ten most frequent non-stop words across the spam messages.
spamWordCountReduce.most_common(10)

# Exercise: Eliminate all words shorter than 3 characters and re-compute the most common words in both categories

In [None]:
def remove_small_words(x, n_chars=3):
    """Return the words of *x* that are at least *n_chars* characters long.

    Args:
        x: Iterable of words (e.g. one tokenised SMS).
        n_chars: Minimum length a word must have to be kept; words with
            fewer characters than this are dropped.

    Returns:
        A new list with the surviving words in their original order.
    """
    # `>=` rather than `>`: the exercise removes words with *fewer than*
    # n_chars characters, so a word of exactly n_chars characters is kept.
    return [word for word in x if len(word) >= n_chars]

In [None]:
# enter code here