In [1]:
import pandas as pd
from collections import defaultdict

In [3]:
# Toy sales ledger: position i of each list describes one transaction
data = dict(
    Product=['Apple', 'Banana', 'Apple', 'Banana', 'Orange', 'Banana', 'Apple', 'Orange'],
    Sales=[100, 150, 200, 250, 300, 100, 150, 200],
)

# One DataFrame row per transaction, columns taken from the dict keys
df = pd.DataFrame(data)

In [7]:
# Display the DataFrame: as the cell's last expression, `df` is rendered as the cell output
df

Unnamed: 0,Product,Sales
0,Apple,100
1,Banana,150
2,Apple,200
3,Banana,250
4,Orange,300
5,Banana,100
6,Apple,150
7,Orange,200


### Map Function
* Convert each row into a key-value pair (Product, Sales)
* This function takes a row from the DataFrame and extracts a key-value pair where the key is the Product and the value is the Sales.


In [21]:
def map_function(row):
    """Emit the (key, value) pair for a single sales record.

    Args:
        row: Any object that supports ``row['Product']`` and ``row['Sales']``
            lookups, such as a DataFrame row or a plain dict.

    Returns:
        tuple: ``(product, sales)`` as read from the record.
    """
    return row['Product'], row['Sales']

### Shuffle and Sort 
* Group all key-value pairs by key (Product)
* The shuffle_sort function simulates the grouping of key-value pairs by their key (the product name).
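* It stores each product’s sales values in a list. No explicit sort is performed: products keep the order in which they first appear in the mapped data.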


In [11]:
def shuffle_sort(mapped_data):
    """Bucket the mapper's (key, value) pairs by key.

    Args:
        mapped_data: Iterable of 2-item pairs ``(product, sales)``.

    Returns:
        collections.defaultdict: Maps each key to the list of its values, in
        the order they were encountered. No sorting is applied; keys keep
        first-seen order.
    """
    buckets = defaultdict(list)
    for pair in mapped_data:
        key, value = pair
        buckets[key].append(value)
    return buckets

### Reduce Function
* Sum the sales for each product
* The reduce_function aggregates the sales for each product by summing all values in the list for each product.

In [26]:
def reduce_function(grouped_data):
    """Collapse each key's list of values into a single total.

    Args:
        grouped_data: Mapping of product -> iterable of sales values.

    Returns:
        dict: Maps each product to the ``sum`` of its values, preserving the
        key order of ``grouped_data``.
    """
    totals = {}
    for key, values in grouped_data.items():
        totals[key] = sum(values)
    return totals

### MapReduce Simulation
* Simulate the MapReduce process on a DataFrame
* The map_reduce function coordinates the Map, Shuffle/Sort, and Reduce phases.
* It processes all the rows in the DataFrame and outputs the aggregated sales data.


In [15]:
def map_reduce(df):
    """Run the simulated Map -> Shuffle/Sort -> Reduce pipeline on a DataFrame.

    Args:
        df: DataFrame whose rows are passed one at a time to ``map_function``.

    Returns:
        The output of ``reduce_function`` applied to the grouped pairs.
    """
    # Map phase: one (key, value) pair per row, in row order
    pairs = [map_function(record) for _, record in df.iterrows()]

    # Shuffle/Sort phase groups the pairs by key; Reduce phase collapses each group
    return reduce_function(shuffle_sort(pairs))

In [17]:
# Running the MapReduce aggregation on the sales data;
# `result` holds whatever map_reduce returns for the sales DataFrame
result = map_reduce(df)

In [19]:
# Display the result: a header line (preceded by a blank line),
# then one "<product>: <total>" line per entry in `result`
print("\nAggregated Sales Data using MapReduce:")
for product, total_sales in result.items():
    print(f"{product}: {total_sales}")



Aggregated Sales Data using MapReduce:
Apple: 450
Banana: 500
Orange: 500
