In [1]:
from collections import defaultdict

In [3]:
# Toy corpus for the demo: each string stands in for one input document.
documents = [
    "Hello world",
    "Hello MapReduce",
    "MapReduce makes big data processing simple",
    "The world of big data",
]

### Map Function
* Convert documents into key-value pairs (word, 1)
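* This function takes a document, splits it on whitespace into words, and emits one key-value pair per word, where the key is the lowercased word (so that "Hello" and "hello" are counted together) and the value is 1. Punctuation is not stripped, so "data" and "data." would be treated as different words.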

In [5]:
def map_function(doc):
    """Map phase: emit one ``(word, 1)`` pair per token of ``doc``.

    The document is split on whitespace and each token is lowercased so
    that differently-cased spellings share the same key.

    Args:
        doc: The document text (a string).

    Returns:
        A list of ``(lowercased_word, 1)`` tuples, in order of appearance.
    """
    pairs = []
    for token in doc.split():
        pairs.append((token.lower(), 1))
    return pairs

### Shuffle and Sort
* Group all key-value pairs by key (word)
* The shuffle_sort function simulates grouping all the key-value pairs by their key (the word).
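* This phase ensures that all occurrences of the same word are grouped together. Note that this simulation does not actually sort the keys: groups appear in the order in which each word is first seen.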

In [7]:
def shuffle_sort(mapped_data):
    """Shuffle phase: collect the values emitted for each key.

    Args:
        mapped_data: An iterable of ``(key, value)`` pairs, such as the
            output of the map phase.

    Returns:
        A ``defaultdict(list)`` mapping each key to the list of its values,
        in the order the pairs were encountered. Keys are not sorted.
    """
    buckets = defaultdict(list)
    for pair in mapped_data:
        key, value = pair
        buckets[key].append(value)
    return buckets

### Reduce Function
* Sum the values for each word
* The reduce_function aggregates the list of values for each word and sums them up to get the total count.


In [9]:
def reduce_function(grouped_data):
    """Reduce phase: sum the grouped values of every key.

    Args:
        grouped_data: A mapping from key to a list of numeric values.

    Returns:
        A plain ``dict`` mapping each key to the sum of its values, with
        keys in the same order as ``grouped_data``.
    """
    totals = {}
    for key, values in grouped_data.items():
        totals[key] = sum(values)
    return totals


### MapReduce Simulation
* The map_reduce function coordinates the Map, Shuffle/Sort, and Reduce phases.
* It processes a list of documents and outputs the word count for each word.

In [39]:
def map_reduce(documents, verbose=True):
    """Run the Map, Shuffle/Sort and Reduce phases over ``documents``.

    Args:
        documents: An iterable of document strings.
        verbose: When true (the default, matching the original behaviour),
            print the intermediate result of each phase. Pass ``False`` to
            run silently, e.g. on inputs too large to print usefully.

    Returns:
        A dict mapping each lowercased word to its total number of
        occurrences across all documents.
    """
    # Step 1: Map Phase - flatten the per-document pairs into one list.
    mapped_data = []
    for doc in documents:
        mapped_data.extend(map_function(doc))

    if verbose:
        print("Mapped data:")
        print(mapped_data)
        print()

    # Step 2: Shuffle/Sort Phase - group the emitted 1s by word.
    grouped_data = shuffle_sort(mapped_data)
    if verbose:
        print("Grouped data:")
        print(grouped_data)
        print()

    # Step 3: Reduce Phase - collapse each group into a single count.
    reduced_data = reduce_function(grouped_data)
    if verbose:
        print("Reduced data:")
        print(reduced_data)
        print()

    return reduced_data


In [31]:
# Run the MapReduce word count on the sample documents. map_reduce prints the
# intermediate output of each phase and returns the final word -> count dict.
result = map_reduce(documents)

Mapped data:
[('hello', 1), ('world', 1), ('hello', 1), ('mapreduce', 1), ('mapreduce', 1), ('makes', 1), ('big', 1), ('data', 1), ('processing', 1), ('simple', 1), ('the', 1), ('world', 1), ('of', 1), ('big', 1), ('data', 1)]

Grouped data:
defaultdict(<class 'list'>, {'hello': [1, 1], 'world': [1, 1], 'mapreduce': [1, 1], 'makes': [1], 'big': [1, 1], 'data': [1, 1], 'processing': [1], 'simple': [1], 'the': [1], 'of': [1]})

Reduced data:
{'hello': 2, 'world': 2, 'mapreduce': 2, 'makes': 1, 'big': 2, 'data': 2, 'processing': 1, 'simple': 1, 'the': 1, 'of': 1}



In [15]:
# Show the final counts, one "word: count" line per distinct word.
print("Word Count using MapReduce:")
for entry in result.items():
    word, count = entry
    print(f"{word}: {count}")

Word Count using MapReduce:
hello: 2
world: 2
mapreduce: 2
makes: 1
big: 2
data: 2
processing: 1
simple: 1
the: 1
of: 1
