# 🐘 Praktikum Minggu 2: Simulasi Hadoop MapReduce dengan Python
**Tujuan**: Memahami paradigma MapReduce dengan mengimplementasikannya menggunakan Python murni di Google Colab.

## 1. Implementasi MapReduce: Word Count

In [None]:
from collections import defaultdict
from functools import reduce as functools_reduce
import re

# ==== IMPLEMENTASI MAPREDUCE ====

def mapper_word_count(document):
    """MAP: split a document into ``(word, 1)`` pairs.

    The text is lower-cased first; every run of ASCII letters delimited by
    word boundaries is then emitted as one pair, in order of appearance.
    Tokens that mix letters with digits or underscores are not matched by
    the pattern and therefore produce no pair.

    Args:
        document: Text of a single input document.

    Returns:
        A list of ``(word, 1)`` tuples, one per matched word.
    """
    pairs = []
    for match in re.finditer(r'\b[a-zA-Z]+\b', document.lower()):
        pairs.append((match.group(0), 1))
    return pairs

def shuffler(mapped_results):
    """SHUFFLE & SORT: group mapped values under their key.

    Args:
        mapped_results: Iterable of ``(key, value)`` pairs emitted by the
            mappers.

    Returns:
        A plain ``dict`` mapping each key to the list of its values. Keys
        appear in first-seen order and values keep their input order; no
        explicit sorting is performed.
    """
    buckets = {}
    for pair_key, pair_value in mapped_results:
        buckets.setdefault(pair_key, []).append(pair_value)
    return buckets

def reducer_word_count(key, values):
    """REDUCE: add up all values collected for one word.

    Args:
        key: The word being reduced.
        values: Iterable of counts gathered for that word.

    Returns:
        A ``(key, total)`` tuple; ``total`` is 0 when ``values`` is empty.
    """
    total = sum(values)
    return key, total

# Input data: each string stands in for one document of a distributed corpus
documents = [
    "Big data is the future of analytics and data science",
    "Machine learning and deep learning are part of data science",
    "Big data analytics uses Hadoop Spark and cloud computing",
    "Data science requires statistics machine learning and big data skills"
]

# List the documents with a 1-based number so they line up with the
# "Mapper N" labels printed in the map phase.
numbered_docs = [f'Doc {i}: {doc}' for i, doc in enumerate(documents, start=1)]
print('=== INPUT DOCUMENTS ===')
print('\n'.join(numbered_docs))

In [None]:
# MAP PHASE: every document goes through its own (simulated) mapper
print('=== MAP PHASE ===')
mapper_outputs = [mapper_word_count(doc) for doc in documents]
for i, mapped in enumerate(mapper_outputs, 1):
    print(f'Mapper {i}: {mapped[:5]}...')  # preview only the first 5 pairs

# Concatenate the per-mapper outputs into one stream for the shuffle step
all_mapped = [pair for mapped in mapper_outputs for pair in mapped]

print(f'\nTotal key-value pairs: {len(all_mapped)}')

In [None]:
# SHUFFLE & SORT PHASE: group every emitted value under its word
print('=== SHUFFLE & SORT PHASE ===')
shuffled = shuffler(all_mapped)
print('Kata unik ditemukan:', len(shuffled))
print('\nContoh hasil shuffle (5 kata pertama):')
# Preview the first five groups; no loop counter is needed, so iterate the
# sliced items directly instead of wrapping them in enumerate().
for k, v in list(shuffled.items())[:5]:
    print(f'  {k}: {v}')

In [None]:
import pandas as pd

# REDUCE PHASE: collapse each word's list of values into a single count
print('=== REDUCE PHASE ===')
results = [reducer_word_count(*group) for group in shuffled.items()]

# Rank by frequency, highest first (the sort is stable, so tied words keep
# the order they had after the shuffle)
results_sorted = list(results)
results_sorted.sort(key=lambda pair: pair[1], reverse=True)

df_wc = pd.DataFrame(results_sorted, columns=['Kata', 'Frekuensi'])
print('Top 15 kata terbanyak:')
top_rows = df_wc.head(15)
print(top_rows.to_string(index=False))

In [None]:
import matplotlib.pyplot as plt

# Word-count visualisation: bar chart of the 15 most frequent words
top15 = df_wc.head(15)
kata, frekuensi = top15['Kata'], top15['Frekuensi']

plt.figure(figsize=(12, 5))
bars = plt.bar(kata, frekuensi, color='steelblue', alpha=0.8)
plt.title('Hasil MapReduce: Top 15 Kata Terbanyak', fontsize=13, fontweight='bold')
plt.xlabel('Kata')
plt.ylabel('Frekuensi')
plt.xticks(rotation=45, ha='right')

# Write each count just above its bar, centred horizontally
for bar, val in zip(bars, frekuensi):
    x_mid = bar.get_x() + bar.get_width() / 2
    y_top = bar.get_height() + 0.05
    plt.text(x_mid, y_top, str(val), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## 2. MapReduce: Analisis Penjualan

In [None]:
import random
from datetime import datetime, timedelta

# Build a reproducible synthetic sales dataset (fixed seed => same data
# on every run)
random.seed(42)
products = ['Laptop', 'Smartphone', 'Tablet', 'Headphone', 'Monitor']
regions  = ['Jakarta', 'Surabaya', 'Bandung', 'Medan', 'Bali']
price_options = [5_000_000, 8_000_000, 3_500_000, 500_000, 4_000_000]
base_date = datetime(2024, 1, 1)

sales_data = []
for _txn in range(1000):
    # The five draws must stay in exactly this order, otherwise the seeded
    # random stream would yield different records.
    # Offsets 0..364 from 2024-01-01 give dates up to 2024-12-30.
    day_offset = random.randint(0, 364)
    produk = random.choice(products)
    region = random.choice(regions)
    qty = random.randint(1, 10)
    harga = random.choice(price_options)

    tanggal = (base_date + timedelta(days=day_offset)).strftime('%Y-%m-%d')
    sales_data.append({
        'tanggal': tanggal,
        'produk': produk,
        'region': region,
        'qty': qty,
        'harga': harga,
    })

# The derived 'total' column exists only on the DataFrame, not on the raw
# dicts in sales_data.
df_sales = pd.DataFrame(sales_data)
df_sales['total'] = df_sales['qty'].mul(df_sales['harga'])
print(f'Dataset penjualan: {len(df_sales)} transaksi')
print(df_sales.head())

In [None]:
# MapReduce: Total penjualan per produk per region

def mapper_sales(record):
    """MAP: emit ``((produk, region), total)`` for one sales record.

    The raw transaction dicts in ``sales_data`` carry only ``qty`` and
    ``harga``; the ``total`` column is added to the DataFrame alone. To
    accept both kinds of record, ``total`` is used when present and
    otherwise derived as ``qty * harga``.

    Args:
        record: Mapping with ``'produk'`` and ``'region'`` plus either
            ``'total'`` or both ``'qty'`` and ``'harga'``.

    Returns:
        A ``(key, value)`` tuple where ``key`` is ``(produk, region)`` and
        ``value`` is the transaction's sales total.

    Raises:
        KeyError: If a required field is missing from ``record``.
    """
    key = (record['produk'], record['region'])
    try:
        value = record['total']
    except KeyError:
        # No precomputed total on this record: derive it.
        value = record['qty'] * record['harga']
    return (key, value)

def reducer_sales(key, values):
    """REDUCE: add up the sales totals collected for one key.

    Args:
        key: The grouping key the values were shuffled under.
        values: Iterable of sales totals for that key.

    Returns:
        A ``(key, grand_total)`` tuple; ``grand_total`` is 0 when
        ``values`` is empty.
    """
    grand_total = sum(values)
    return key, grand_total

# MAP: one ((produk, region), total) pair per transaction.
# Iterate over the DataFrame rows instead of the raw `sales_data` dicts:
# only `df_sales` carries the derived 'total' column the mapper reads, so
# mapping over `sales_data` raised KeyError: 'total'.
mapped_sales = [mapper_sales(r) for r in df_sales.to_dict(orient='records')]

# SHUFFLE: reuse the shared shuffler so both pipelines group the same way
shuffled_sales = shuffler(mapped_sales)

# REDUCE
results_sales = [reducer_sales(k, v) for k, v in shuffled_sales.items()]

# Format the result as a Produk x Region table
df_result = pd.DataFrame([(k[0], k[1], v) for k, v in results_sales],
                          columns=['Produk', 'Region', 'Total_Penjualan'])
df_pivot = df_result.pivot(index='Produk', columns='Region', values='Total_Penjualan').fillna(0)
# Format each cell in millions (value / 1e6). Column-wise Series.map is
# used because DataFrame.applymap is deprecated since pandas 2.1.
df_pivot = df_pivot.apply(lambda col: col.map(lambda x: f'Rp {x/1e6:.1f}M'))

print('=== Hasil MapReduce: Total Penjualan per Produk per Region ===')
print(df_pivot)

## 3. Tugas Praktikum Minggu 2
1. Implementasikan MapReduce untuk menghitung **jumlah transaksi per kategori produk per bulan** dari dataset penjualan di atas (gunakan kolom `produk` sebagai kategori dan ambil bulan dari kolom `tanggal`).
2. Implementasikan MapReduce untuk menemukan **produk dengan penjualan tertinggi di setiap region**.
3. Buat visualisasi dari hasil analisis nomor 1 dan 2.

**Hint**: Modifikasi fungsi `mapper_sales` dan `reducer_sales` sesuai kebutuhan.