# Query Rewriting

This notebook demonstrates several query rewriting techniques for performance optimization:
* Rewriting correlated subqueries as JOINs
* Optimizing complex joins
* Improving aggregations
* Alternative query patterns (IN vs. JOIN, staged CTEs for complex queries)

## 1. Rewriting Subqueries to JOINs

In [None]:
-- Original: per-customer order statistics via correlated scalar subqueries.
-- Both subqueries probe `orders` for each USA customer row: one for the
-- count of completed orders, one for the amount spent on them.
EXPLAIN ANALYZE
SELECT
    cust.customer_id,
    cust.first_name,
    cust.last_name,
    (
        SELECT COUNT(*)
        FROM orders AS ord
        WHERE ord.status = 'Completed'
          AND ord.customer_id = cust.customer_id
    ) AS completed_orders,
    (
        -- SUM over zero rows is NULL; COALESCE reports it as 0 instead.
        SELECT COALESCE(SUM(ord.total_amount), 0)
        FROM orders AS ord
        WHERE ord.status = 'Completed'
          AND ord.customer_id = cust.customer_id
    ) AS total_spent
FROM customers AS cust
WHERE cust.country = 'USA';

-- Rewritten: a single LEFT JOIN + GROUP BY replaces both correlated subqueries.
-- The status predicate is part of the ON clause (not WHERE) so customers with
-- no completed orders survive the join and report 0 / 0.
EXPLAIN ANALYZE
SELECT
    cust.customer_id,
    cust.first_name,
    cust.last_name,
    -- COUNT(column) skips the NULLs produced for unmatched customers.
    COUNT(ord.order_id) AS completed_orders,
    COALESCE(SUM(ord.total_amount), 0) AS total_spent
FROM customers AS cust
LEFT JOIN orders AS ord
    ON ord.customer_id = cust.customer_id
    AND ord.status = 'Completed'
WHERE cust.country = 'USA'
GROUP BY
    cust.customer_id,
    cust.first_name,
    cust.last_name;

## 2. Optimizing Complex Joins

In [None]:
-- Original: monthly revenue per country and category, aggregated directly
-- over the full customers -> orders -> order_items -> products join.
EXPLAIN ANALYZE
SELECT
    cust.country,
    prod.category,
    DATE_TRUNC('month', ord.order_date) AS month,
    COUNT(DISTINCT cust.customer_id) AS num_customers,
    COUNT(DISTINCT ord.order_id) AS num_orders,
    SUM(item.quantity * item.unit_price) AS total_revenue
FROM customers AS cust
INNER JOIN orders AS ord
    ON ord.customer_id = cust.customer_id
INNER JOIN order_items AS item
    ON item.order_id = ord.order_id
INNER JOIN products AS prod
    ON prod.product_id = item.product_id
WHERE ord.status = 'Completed'
  AND ord.order_date >= '2022-01-01'
GROUP BY
    cust.country,
    prod.category,
    DATE_TRUNC('month', ord.order_date);

-- Rewritten with intermediate aggregations.
-- Stage 1 (order_revenue) filters orders early and collapses line items to
-- one row per (order, category); stage 2 joins that much smaller set to
-- customers and rolls it up per country / category / month.
EXPLAIN ANALYZE
WITH order_revenue AS (
    SELECT
        o.order_id,
        o.customer_id,
        o.order_date,
        p.category,
        SUM(oi.quantity * oi.unit_price) AS revenue
    FROM orders AS o
    INNER JOIN order_items AS oi
        ON o.order_id = oi.order_id
    INNER JOIN products AS p
        ON oi.product_id = p.product_id
    WHERE o.order_date >= '2022-01-01'
      AND o.status = 'Completed'
    GROUP BY o.order_id, o.customer_id, o.order_date, p.category
)
-- The CTE is aliased `rev`: `or` is a reserved keyword and cannot be used as
-- an unquoted table alias.
SELECT
    c.country,
    rev.category,
    DATE_TRUNC('month', rev.order_date) AS month,
    COUNT(DISTINCT c.customer_id) AS num_customers,
    COUNT(DISTINCT rev.order_id) AS num_orders,
    SUM(rev.revenue) AS total_revenue
FROM customers AS c
INNER JOIN order_revenue AS rev
    ON c.customer_id = rev.customer_id
GROUP BY c.country, rev.category, DATE_TRUNC('month', rev.order_date);

## 3. Improving Aggregations

In [None]:
-- Original: per-category metrics computed in one pass over the joined
-- line items of completed orders.
EXPLAIN ANALYZE
SELECT
    prod.category,
    COUNT(DISTINCT ord.customer_id) AS num_customers,
    COUNT(DISTINCT ord.order_id) AS num_orders,
    SUM(item.quantity) AS total_items,
    SUM(item.quantity * item.unit_price) AS total_revenue,
    -- NOTE(review): despite the name, this averages over joined order_items
    -- rows (line values), not over whole orders.
    AVG(item.quantity * item.unit_price) AS avg_order_value
FROM products AS prod
INNER JOIN order_items AS item
    ON item.product_id = prod.product_id
INNER JOIN orders AS ord
    ON ord.order_id = item.order_id
WHERE ord.status = 'Completed'
GROUP BY prod.category;

-- Rewritten with staged aggregation.
-- Stage 1 (order_metrics) collapses line items to one row per
-- (order, category); stage 2 rolls those rows up per category.
-- A window-function version (COUNT(DISTINCT ...) OVER (PARTITION BY category)
-- plus SELECT DISTINCT) is not usable here: PostgreSQL does not implement
-- DISTINCT inside window aggregates, so the distinct counts come from a
-- plain GROUP BY instead.
EXPLAIN ANALYZE
WITH order_metrics AS (
    SELECT
        o.order_id,
        o.customer_id,
        p.category,
        SUM(oi.quantity) AS items,
        SUM(oi.quantity * oi.unit_price) AS revenue,
        -- Number of non-NULL line values in this group; carried forward so
        -- stage 2 can reproduce the original per-line-item average.
        COUNT(oi.quantity * oi.unit_price) AS priced_lines
    FROM orders AS o
    INNER JOIN order_items AS oi
        ON o.order_id = oi.order_id
    INNER JOIN products AS p
        ON oi.product_id = p.product_id
    WHERE o.status = 'Completed'
    GROUP BY o.order_id, o.customer_id, p.category
)
SELECT
    category,
    COUNT(DISTINCT customer_id) AS num_customers,
    COUNT(DISTINCT order_id) AS num_orders,
    SUM(items) AS total_items,
    SUM(revenue) AS total_revenue,
    -- AVG(revenue) here would average per (order, category) row and so
    -- disagree with the original query, which averages per line item.
    -- Dividing the summed revenue by the summed line count keeps the
    -- two queries equivalent.
    SUM(revenue) / NULLIF(SUM(priced_lines), 0) AS avg_order_value
FROM order_metrics
GROUP BY category;

## 4. Optimizing IN Clauses

In [None]:
-- Original: products whose ordered quantity since 2022-01-01 exceeds 100
-- units, selected with an IN (subquery) membership test.
EXPLAIN ANALYZE
SELECT *
FROM products
WHERE product_id IN (
    SELECT item.product_id
    FROM order_items AS item
    INNER JOIN orders AS ord
        ON ord.order_id = item.order_id
    WHERE ord.order_date >= '2022-01-01'
    GROUP BY item.product_id
    HAVING SUM(item.quantity) > 100
);

-- Rewritten using JOIN.
-- The derived table is grouped by product_id, so it holds at most one row
-- per product and the join cannot multiply product rows. No DISTINCT is
-- needed on the outer query: it would only add a sort/hash step over every
-- column of products, and could collapse rows the IN version returns.
EXPLAIN ANALYZE
SELECT p.*
FROM products AS p
INNER JOIN (
    SELECT oi.product_id
    FROM order_items AS oi
    INNER JOIN orders AS o
        ON oi.order_id = o.order_id
    WHERE o.order_date >= '2022-01-01'
    GROUP BY oi.product_id
    HAVING SUM(oi.quantity) > 100
) AS high_volume
    ON p.product_id = high_volume.product_id;

## 5. Complex Query Optimization

In [None]:
-- Original: sales metrics per country / segment / category for completed
-- orders since 2022-01-01, aggregated directly over the four-table join.
-- Only groups with at least 10 distinct orders are returned.
EXPLAIN ANALYZE
SELECT
    cust.country,
    cust.segment,
    prod.category,
    COUNT(DISTINCT ord.order_id) AS num_orders,
    COUNT(DISTINCT ord.customer_id) AS num_customers,
    SUM(item.quantity) AS total_items,
    SUM(item.quantity * item.unit_price) AS total_revenue,
    -- Revenue per distinct order; the HAVING clause guarantees a divisor >= 10.
    SUM(item.quantity * item.unit_price) / COUNT(DISTINCT ord.order_id) AS avg_order_value,
    SUM(item.quantity * (item.unit_price * item.discount)) AS total_discounts
FROM customers AS cust
INNER JOIN orders AS ord
    ON ord.customer_id = cust.customer_id
INNER JOIN order_items AS item
    ON item.order_id = ord.order_id
INNER JOIN products AS prod
    ON prod.product_id = item.product_id
WHERE ord.status = 'Completed'
  AND ord.order_date >= '2022-01-01'
GROUP BY
    cust.country,
    cust.segment,
    prod.category
HAVING COUNT(DISTINCT ord.order_id) >= 10;

-- Rewritten as a pipeline of CTEs with staged aggregation.
EXPLAIN ANALYZE
WITH order_details AS (
    -- Stage 1: filter orders early and collapse line items to one row per
    -- (order, category).
    SELECT
        ord.order_id,
        ord.customer_id,
        prod.category,
        SUM(item.quantity) AS items,
        SUM(item.quantity * item.unit_price) AS revenue,
        SUM(item.quantity * (item.unit_price * item.discount)) AS discounts
    FROM orders AS ord
    INNER JOIN order_items AS item
        ON item.order_id = ord.order_id
    INNER JOIN products AS prod
        ON prod.product_id = item.product_id
    WHERE ord.status = 'Completed'
      AND ord.order_date >= '2022-01-01'
    GROUP BY
        ord.order_id,
        ord.customer_id,
        prod.category
),
customer_category_metrics AS (
    -- Stage 2: attach customer attributes and roll up per
    -- country / segment / category.
    SELECT
        cust.country,
        cust.segment,
        det.category,
        COUNT(DISTINCT det.order_id) AS num_orders,
        COUNT(DISTINCT det.customer_id) AS num_customers,
        SUM(det.items) AS total_items,
        SUM(det.revenue) AS total_revenue,
        SUM(det.discounts) AS total_discounts
    FROM customers AS cust
    INNER JOIN order_details AS det
        ON det.customer_id = cust.customer_id
    GROUP BY
        cust.country,
        cust.segment,
        det.category
)
-- Stage 3: derive the per-order average and keep only groups with at least
-- 10 orders (the counterpart of the original HAVING clause).
SELECT
    country,
    segment,
    category,
    num_orders,
    num_customers,
    total_items,
    total_revenue,
    total_revenue / NULLIF(num_orders, 0) AS avg_order_value,
    total_discounts
FROM customer_category_metrics
WHERE num_orders >= 10;

## Best Practices for Query Rewriting

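1. **General Guidelines**
   - Break complex queries down into smaller, named steps
   - Make sure filter and join columns are backed by appropriate indexes
   - Apply filters as early as possible
   - Consider data volume: compare `EXPLAIN ANALYZE` output before and after a rewrite on realistically sized data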

2. **Join Optimization**
   - Choose proper join types
   - Consider join order
   - Use indexes effectively
   - Pre-filter large tables

3. **Aggregation Strategies**
   - Stage complex aggregations
   - Use window functions when appropriate
   - Consider materialized views
   - Pre-aggregate common calculations

4. **Query Structure**
   - Maintain readability
   - Use CTEs for clarity
   - Consider long-term maintainability
   - Document complex logic