# E-commerce Data Transformation and Enrichment

This notebook handles the transformation and enrichment of raw e-commerce data tables.

## Contents
1. Data Loading
2. Customer Data Enrichment
3. Product Analysis
4. Order Enrichment and Aggregations
5. SQL-Based Profit Analysis

In [None]:
# Import required libraries and functions
import os
import sys
sys.path.append("../")  # Add project root to path

from src.processing import (
    init_spark,
    get_customer_metrics,
    analyze_product_performance,
    enrich_orders,
    get_sales_aggregations
)
from src.config import SparkConfig, BusinessConfig

# Create the Spark session through the project's init_spark helper.
spark = init_spark("EcommerceTransformation")

# Read the three preprocessed parquet tables (same order as before:
# products, customers, orders).
products_df, customers_df, orders_df = (
    spark.read.parquet(table_path)
    for table_path in (
        "../data/processed/products",
        "../data/processed/customers",
        "../data/processed/orders",
    )
)

# Print the first three rows of each table under its own heading.
print("Sample data preview:")
for heading, table in (
    ("\nOrders:", orders_df),
    ("\nCustomers:", customers_df),
    ("\nProducts:", products_df),
):
    print(heading)
    table.show(3)

## Customer Data Enrichment

Enrich customer data with order history and segment information.

In [None]:
# Derive per-customer metrics from the orders and customers tables.
enriched_customers = get_customer_metrics(orders_df, customers_df)

# Preview the first few enriched rows.
print("Customer Metrics Overview:")
enriched_customers.show(5)

# Row counts per value of each categorical column, one report per column.
for heading, column_name in (
    ("\nCustomer Segments Distribution:", "Customer Segment"),
    ("\nActivity Status Distribution:", "Activity Status"),
):
    print(heading)
    enriched_customers.groupBy(column_name).count().show()

# Persist the enriched customers, replacing any previous output.
customer_writer = enriched_customers.write.mode("overwrite")
customer_writer.parquet("../data/processed/enriched_customers")

## Product Analysis

Analyze product performance and compute category-level aggregations.

In [None]:
# Spark's column functions are needed for the category roll-up below. They are
# imported under the `F` namespace so that `sum` and `round` cannot be confused
# with Python's builtins of the same name (the builtin `sum` does not produce a
# Spark aggregate expression).
from pyspark.sql import functions as F

# Calculate product performance metrics using our processing function
product_metrics = analyze_product_performance(orders_df, products_df)

print("Product Performance Metrics:")
product_metrics.show(5)

print("\nProducts by Performance Flag:")
product_metrics.groupBy("Performance Flag").count().show()

# Calculate category-level aggregations: one row per category with the number
# of products, total sales and average profit margin (both rounded to two
# decimals), sorted so the best-selling category comes first.
category_performance = (
    product_metrics.groupBy("Category")
    .agg(
        F.count("Product ID").alias("Number of Products"),
        F.round(F.sum("Total Sales"), 2).alias("Category Sales"),
        F.round(F.avg("Profit Margin"), 2).alias("Average Margin"),
    )
    .orderBy("Category Sales", ascending=False)
)

print("\nCategory Performance Overview:")
category_performance.show()

# Save enriched product data, replacing any previous output
product_metrics.write.mode("overwrite").parquet("../data/processed/enriched_products")

## Order Enrichment and Aggregations

Combine order data with customer and product information, then calculate various aggregations.

In [None]:
# Join customer and product information onto the orders.
enriched_orders = enrich_orders(orders_df, customers_df, products_df)

print("Enriched Orders Preview:")
enriched_orders.show(5)

# The aggregation helper returns two DataFrames: monthly sales and
# per-customer purchase patterns.
monthly_sales, customer_patterns = get_sales_aggregations(enriched_orders)

# Show the first monthly rows in year / month / category order.
print("\nMonthly Sales by Category:")
monthly_sales_sorted = monthly_sales.orderBy("Order Year", "Order Month", "Category")
monthly_sales_sorted.show(5)

# Show the customers with the highest total spend.
print("\nCustomer Purchase Patterns:")
top_spenders = customer_patterns.orderBy("Total Spend", ascending=False)
top_spenders.show(5)

# Persist the enriched orders and both aggregations, replacing prior output.
for frame, destination in (
    (enriched_orders, "../data/processed/enriched_orders"),
    (monthly_sales, "../data/processed/monthly_sales"),
    (customer_patterns, "../data/processed/customer_patterns"),
):
    frame.write.mode("overwrite").parquet(destination)

## SQL-Based Profit Analysis

Analyze profit patterns using SQL queries by year, by category, and by customer.

In [None]:
# Run the SQL-based profit queries; the result is indexed by report name.
from src.processing import get_sql_profit_analysis

profit_analysis = get_sql_profit_analysis(spark, enriched_orders)

# Display each report in turn: (heading, report key, positional args for show()).
# An empty args tuple keeps show()'s default row limit.
for heading, report_key, show_args in (
    ("Profit Analysis by Year:", "yearly_profit", ()),
    ("\nProfit Analysis by Year and Category:", "category_profit", ()),
    ("\nTop Customers by Profit:", "customer_profit", (10,)),
    ("\nCustomer Profit Trends by Year:", "customer_yearly_profit", (10,)),
):
    print(heading)
    profit_analysis[report_key].show(*show_args)

# Persist every report as parquet, replacing any previous output.
for report_key, destination in (
    ("yearly_profit", "../data/processed/yearly_profit"),
    ("category_profit", "../data/processed/category_profit"),
    ("customer_profit", "../data/processed/customer_profit"),
    ("customer_yearly_profit", "../data/processed/customer_yearly_profit"),
):
    profit_analysis[report_key].write.mode("overwrite").parquet(destination)