# üîÑ B√ÄI 3: ETL PATTERNS & BEST PRACTICES

## M·ª•c ti√™u:
- Extract patterns (Database, CSV, API)
- Transform patterns (Cleaning, Enrichment, Aggregation)
- Load patterns (Database, Files)
- Error handling & logging
- Pipeline orchestration

In [1]:
# Setup
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
import sys
sys.path.append('/home/jovyan/week-03-04-python-etl/scripts')

from db_connector import DatabaseConnector
from data_cleaner import DataCleaner
from etl_pipeline import ETLPipeline

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

print("‚úÖ Setup complete!")

‚úÖ Setup complete!


---
## üìä PART 1: Extract Patterns

### Pattern 1: Extract from Database

In [2]:
# TODO: Create extract function with error handling
def extract_customers(db, date_from=None):
    """
    Extract customers from database
    
    Args:
        db: DatabaseConnector instance
        date_from: Optional date filter
    
    Returns:
        DataFrame with customers
    """
    try:
        logger.info("Starting customer extraction...")
        
        # YOUR CODE HERE
        query = """
            SELECT * FROM analytics.customers
            WHERE 1=1
        """
        
        if date_from:
            query += f" AND created_at >= '{date_from}'"
        
        df = db.read_sql(query)
        
        logger.info(f"Extracted {len(df)} customers")
        return df
        
    except Exception as e:
        logger.error(f"Extraction failed: {e}")
        raise

# Test
db = DatabaseConnector()
customers = extract_customers(db)
print(f"Extracted {len(customers)} customers")
customers.head()

2025-12-17 17:40:41,237 - db_connector - INFO - Database connector initialized for data_engineer@postgres
2025-12-17 17:40:41,238 - __main__ - INFO - Starting customer extraction...
  df = pd.read_sql_query(query, conn, params=params)
2025-12-17 17:40:41,418 - db_connector - INFO - Query executed, DataFrame shape: (1000, 8)
2025-12-17 17:40:41,420 - __main__ - INFO - Extracted 1000 customers


Extracted 1000 customers


Unnamed: 0,customer_id,customer_name,email,country,signup_date,customer_segment,created_at,updated_at
0,1,Timothy Vincent,maria99@example.org,Azerbaijan,2023-06-18,Standard,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901
1,2,Edward Williamson,mcdonaldlisa@example.com,Anguilla,2025-05-09,Basic,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901
2,3,Jessica Reed,sdavis@example.net,Thailand,2025-03-11,Basic,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901
3,4,Carrie Davis,taylorleslie@example.org,Cote d'Ivoire,2023-11-01,Standard,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901
4,5,Rita Fuller,gonzalezsamantha@example.org,Greece,2023-07-26,Premium,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901


### Pattern 2: Incremental Extract

In [3]:
# TODO: Implement incremental extraction
def extract_orders_incremental(db, last_extracted_date):
    """
    Extract only new/updated orders since last extraction
    
    Args:
        db: DatabaseConnector
        last_extracted_date: Last extraction timestamp
    
    Returns:
        DataFrame with new orders
    """
    # YOUR CODE HERE
    query = """
        SELECT * FROM analytics.orders
        WHERE updated_at > %s
        ORDER BY updated_at
    """
    
    df = db.read_sql(query, (last_extracted_date,))
    logger.info(f"Extracted {len(df)} new/updated orders since {last_extracted_date}")
    
    return df

# Test
last_date = datetime.now() - timedelta(days=7)
new_orders = extract_orders_incremental(db, last_date)
print(f"New orders: {len(new_orders)}")

  df = pd.read_sql_query(query, conn, params=params)
2025-12-17 17:40:51,889 - db_connector - INFO - Query executed, DataFrame shape: (10000, 7)
2025-12-17 17:40:51,890 - __main__ - INFO - Extracted 10000 new/updated orders since 2025-12-10 17:40:51.826839


New orders: 10000


### Pattern 3: Extract from CSV with Validation

In [6]:
# TODO: Create CSV extraction with validation
def extract_from_csv(file_path, expected_columns):
    """
    Extract data from CSV with validation
    
    Args:
        file_path: Path to CSV file
        expected_columns: List of expected column names
    
    Returns:
        DataFrame
    """
    try:
        logger.info(f"Reading CSV: {file_path}")
        
        # YOUR CODE HERE
        df = pd.read_csv(file_path)
        
        # Validate columns
        missing_cols = set(expected_columns) - set(df.columns)
        if missing_cols:
            raise ValueError(f"Missing columns: {missing_cols}")
        
        logger.info(f"Successfully loaded {len(df)} rows")
        return df
        
    except Exception as e:
        logger.error(f"CSV extraction failed: {e}")
        raise

# Create sample CSV for testing
sample_data = pd.DataFrame({
    'product_id': [1, 2, 3],
    'product_name': ['Product A', 'Product B', 'Product C'],
    'price': [100, 200, 300]
})
sample_data.to_csv('/home/jovyan/work/week-03-04-python-etl/data/raw/sample_products.csv', index=False)

# Test
products = extract_from_csv(
    '/home/jovyan/work/week-03-04-python-etl/data/raw/sample_products.csv',
    ['product_id', 'product_name', 'price']
)
products

---
## üîß PART 2: Transform Patterns

### Pattern 1: Data Enrichment

In [7]:
# TODO: Create enrichment transformation
def enrich_customer_data(customers_df, orders_df):
    """
    Enrich customers with order statistics
    
    Args:
        customers_df: Customer DataFrame
        orders_df: Orders DataFrame
    
    Returns:
        Enriched DataFrame
    """
    logger.info("Enriching customer data...")
    
    # YOUR CODE HERE
    # Calculate order statistics per customer
    order_stats = orders_df.groupby('customer_id').agg({
        'order_id': 'count',
        'total_amount': ['sum', 'mean', 'max'],
        'order_date': ['min', 'max']
    }).reset_index()
    
    order_stats.columns = [
        'customer_id', 'total_orders', 'total_revenue',
        'avg_order_value', 'max_order_value',
        'first_order_date', 'last_order_date'
    ]
    
    # Merge with customers
    enriched = customers_df.merge(order_stats, on='customer_id', how='left')
    
    # Fill nulls for customers without orders
    enriched['total_orders'] = enriched['total_orders'].fillna(0)
    enriched['total_revenue'] = enriched['total_revenue'].fillna(0)
    
    logger.info(f"Enriched {len(enriched)} customers")
    return enriched

# Test
orders = db.read_sql("SELECT * FROM analytics.orders LIMIT 1000")
enriched_customers = enrich_customer_data(customers, orders)
enriched_customers.head()

2025-12-17 17:41:32,818 - db_connector - INFO - Query executed, DataFrame shape: (1000, 7)
2025-12-17 17:41:32,820 - __main__ - INFO - Enriching customer data...
2025-12-17 17:41:32,906 - __main__ - INFO - Enriched 1000 customers


Unnamed: 0,customer_id,customer_name,email,country,signup_date,customer_segment,created_at,updated_at,total_orders,total_revenue,avg_order_value,max_order_value,first_order_date,last_order_date
0,1,Timothy Vincent,maria99@example.org,Azerbaijan,2023-06-18,Standard,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901,0.0,0.0,,,,
1,2,Edward Williamson,mcdonaldlisa@example.com,Anguilla,2025-05-09,Basic,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901,1.0,4694.03,4694.03,4694.03,2024-02-02,2024-02-02
2,3,Jessica Reed,sdavis@example.net,Thailand,2025-03-11,Basic,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901,3.0,21812.05,7270.683333,10240.26,2024-07-08,2025-11-12
3,4,Carrie Davis,taylorleslie@example.org,Cote d'Ivoire,2023-11-01,Standard,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901,1.0,3342.87,3342.87,3342.87,2024-04-14,2024-04-14
4,5,Rita Fuller,gonzalezsamantha@example.org,Greece,2023-07-26,Premium,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901,0.0,0.0,,,,


### Pattern 2: Data Aggregation

In [8]:
# TODO: Create aggregation transformation
def create_daily_summary(orders_df):
    """
    Create daily order summary
    
    Args:
        orders_df: Orders DataFrame
    
    Returns:
        Daily summary DataFrame
    """
    logger.info("Creating daily summary...")
    
    # YOUR CODE HERE
    orders_df['order_date'] = pd.to_datetime(orders_df['order_date'])
    
    daily_summary = orders_df.groupby(orders_df['order_date'].dt.date).agg({
        'order_id': 'count',
        'total_amount': ['sum', 'mean'],
        'customer_id': 'nunique'
    }).reset_index()
    
    daily_summary.columns = [
        'date', 'total_orders', 'total_revenue',
        'avg_order_value', 'unique_customers'
    ]
    
    logger.info(f"Created summary for {len(daily_summary)} days")
    return daily_summary

# Test
daily_summary = create_daily_summary(orders)
daily_summary.head(10)

2025-12-17 17:46:00,092 - __main__ - INFO - Creating daily summary...
2025-12-17 17:46:00,116 - __main__ - INFO - Created summary for 546 days


Unnamed: 0,date,total_orders,total_revenue,avg_order_value,unique_customers
0,2023-12-15,1,8405.16,8405.16,1
1,2023-12-16,1,2687.41,2687.41,1
2,2023-12-17,5,25711.65,5142.33,5
3,2023-12-18,1,9261.83,9261.83,1
4,2023-12-19,2,7797.27,3898.635,2
5,2023-12-20,2,6213.13,3106.565,2
6,2023-12-21,1,3577.18,3577.18,1
7,2023-12-22,1,517.66,517.66,1
8,2023-12-24,2,8047.95,4023.975,2
9,2023-12-25,1,6594.96,6594.96,1


### Pattern 3: Data Cleaning Pipeline

In [9]:
# TODO: Create reusable cleaning pipeline
def clean_customer_data(df):
    """
    Standard customer data cleaning
    """
    logger.info("Cleaning customer data...")
    
    cleaner = DataCleaner(df)
    
    cleaned = (
        cleaner
        .remove_duplicates(subset=['customer_id'])
        .handle_missing_values({
            'customer_name': 'Unknown',
            'email': 'no-email@unknown.com',
            'country': 'Unknown'
        })
        .standardize_text(['customer_name', 'country'])
        .get_cleaned_data()
    )
    
    logger.info(f"Cleaning complete: {len(df)} ‚Üí {len(cleaned)} rows")
    return cleaned

# Test
cleaned_customers = clean_customer_data(customers)
cleaned_customers.head()

2025-12-17 17:46:07,154 - __main__ - INFO - Cleaning customer data...
2025-12-17 17:46:07,157 - data_cleaner - INFO - DataCleaner initialized with shape (1000, 8)
2025-12-17 17:46:07,159 - data_cleaner - INFO - remove_duplicates: Removed 0 duplicates (0.00%)
2025-12-17 17:46:07,162 - data_cleaner - INFO - handle_missing: customer_name: 0 ‚Üí 0 missing values
2025-12-17 17:46:07,164 - data_cleaner - INFO - handle_missing: email: 0 ‚Üí 0 missing values
2025-12-17 17:46:07,166 - data_cleaner - INFO - handle_missing: country: 0 ‚Üí 0 missing values
2025-12-17 17:46:07,171 - data_cleaner - INFO - standardize_text: Standardized customer_name
2025-12-17 17:46:07,178 - data_cleaner - INFO - standardize_text: Standardized country
2025-12-17 17:46:07,179 - data_cleaner - INFO - Cleaning complete: (1000, 8) ‚Üí (1000, 8)
2025-12-17 17:46:07,180 - __main__ - INFO - Cleaning complete: 1000 ‚Üí 1000 rows


Unnamed: 0,customer_id,customer_name,email,country,signup_date,customer_segment,created_at,updated_at
0,1,timothy vincent,maria99@example.org,azerbaijan,2023-06-18,Standard,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901
1,2,edward williamson,mcdonaldlisa@example.com,anguilla,2025-05-09,Basic,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901
2,3,jessica reed,sdavis@example.net,thailand,2025-03-11,Basic,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901
3,4,carrie davis,taylorleslie@example.org,cote d'ivoire,2023-11-01,Standard,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901
4,5,rita fuller,gonzalezsamantha@example.org,greece,2023-07-26,Premium,2025-12-14 10:05:15.115901,2025-12-14 10:05:15.115901


---
## üíæ PART 3: Load Patterns

### Pattern 1: Full Load (Replace)

In [10]:
# TODO: Implement full load pattern
def load_full_replace(df, table_name, db):
    """
    Full load - replace entire table
    """
    logger.info(f"Loading {len(df)} rows to {table_name} (REPLACE)")
    
    try:
        # YOUR CODE HERE
        rows = db.write_dataframe(
            df,
            table_name,
            schema='analytics',
            if_exists='replace'
        )
        
        logger.info(f"Successfully loaded {rows} rows")
        return rows
        
    except Exception as e:
        logger.error(f"Load failed: {e}")
        raise

# Test (don't actually run to preserve data)
# load_full_replace(daily_summary, 'daily_order_summary', db)

### Pattern 2: Incremental Load (Append)

In [11]:
# TODO: Implement incremental load
def load_incremental(df, table_name, db):
    """
    Incremental load - append new records
    """
    logger.info(f"Loading {len(df)} rows to {table_name} (APPEND)")
    
    try:
        # YOUR CODE HERE
        rows = db.write_dataframe(
            df,
            table_name,
            schema='analytics',
            if_exists='append'
        )
        
        logger.info(f"Successfully appended {rows} rows")
        return rows
        
    except Exception as e:
        logger.error(f"Load failed: {e}")
        raise

### Pattern 3: Upsert (Update or Insert)

In [12]:
# TODO: Implement upsert pattern
def load_upsert(df, table_name, key_columns, db):
    """
    Upsert - update existing records or insert new ones
    
    Args:
        df: DataFrame to load
        table_name: Target table
        key_columns: Columns to match for updates
        db: DatabaseConnector
    """
    logger.info(f"Upserting {len(df)} rows to {table_name}")
    
    # YOUR CODE HERE
    # Strategy: Load to temp table, then merge
    temp_table = f"{table_name}_temp"
    
    # Load to temp table
    db.write_dataframe(df, temp_table, if_exists='replace')
    
    # Build upsert query
    key_condition = " AND ".join([f"t.{col} = s.{col}" for col in key_columns])
    
    upsert_query = f"""
        -- Delete existing records
        DELETE FROM analytics.{table_name} t
        USING analytics.{temp_table} s
        WHERE {key_condition};
        
        -- Insert all records from temp
        INSERT INTO analytics.{table_name}
        SELECT * FROM analytics.{temp_table};
        
        -- Drop temp table
        DROP TABLE analytics.{temp_table};
    """
    
    db.execute_query(upsert_query, fetch=False)
    logger.info(f"Upsert complete")

# Test (commented out)
# load_upsert(enriched_customers, 'customers_enriched', ['customer_id'], db)

---
## üéØ EXERCISE: Build Complete ETL Pipeline

In [15]:
def customer_enrichment_pipeline():
    """Using ETLPipeline framework"""
    
    pipeline = ETLPipeline('customer_enrichment')
    
    try:
        # EXTRACT - D√πng pipeline.extract()
        customers = pipeline.extract(
            source='database',
            query="SELECT * FROM analytics.customers"
        )
        
        orders = pipeline.extract(
            source='database',
            query="SELECT * FROM analytics.orders"
        )
        
        # TRANSFORM - D√πng pipeline.transform()
        def calc_stats(df):
            return df.groupby('customer_id').agg({
                'order_id': 'count',
                'total_amount': ['sum', 'mean']
            }).reset_index()
        
        def merge_data(df):
            order_stats = calc_stats(orders)
            return df.merge(order_stats, on='customer_id', how='left')
        
        enriched = pipeline.transform(
            customers,
            transformations=[merge_data]
        )
        
        # LOAD - D√πng pipeline.load()
        pipeline.load(
            enriched,
            destination='database',
            table='customers_enriched'
        )
        
        # Get summary
        summary = pipeline.get_summary()
        print(summary)
        
        return True
        
    except Exception as e:
        pipeline.log_step('ERROR', 'FAILED', str(e))
        return False

---
## üéØ CHALLENGE: Build Your Own Pipeline

In [18]:
def product_performance_pipeline():
    """
    Product Performance ETL Pipeline - FIXED VERSION
    """
    import logging
    from datetime import datetime, timedelta
    
    logger = logging.getLogger(__name__)
    
    try:
        # Initialize database connection
        db = DatabaseConnector()
        
        # ==========================================
        # EXTRACT PHASE
        # ==========================================
        logger.info("=" * 60)
        logger.info("EXTRACT PHASE")
        logger.info("=" * 60)
        
        # Calculate date range
        end_date = datetime.now()
        start_date = end_date - timedelta(days=30)
        
        logger.info(f"Extracting orders from {start_date.date()} to {end_date.date()}")
        
        # ‚úÖ FIX: Calculate subtotal instead of selecting it
        orders_query = """
            SELECT 
                o.order_id,
                o.customer_id,
                o.order_date,
                o.total_amount,
                oi.product_id,
                oi.quantity,
                oi.unit_price,
                (oi.quantity * oi.unit_price) as subtotal,  -- ‚úÖ T√çNH SUBTOTAL
                p.product_name,
                p.category
            FROM analytics.orders o
            JOIN analytics.order_items oi ON o.order_id = oi.order_id
            JOIN analytics.products p ON oi.product_id = p.product_id
            WHERE o.order_date >= %s
              AND o.order_date <= %s
        """
        
        orders_df = db.read_sql(orders_query, (start_date, end_date))
        logger.info(f"‚úÖ Extracted {len(orders_df)} order items")
        
        if len(orders_df) == 0:
            logger.warning("‚ö†Ô∏è No orders found in the last 30 days")
            return None
        
        # ==========================================
        # TRANSFORM PHASE
        # ==========================================
        logger.info("=" * 60)
        logger.info("TRANSFORM PHASE")
        logger.info("=" * 60)
        
        # 1. Calculate product performance metrics
        logger.info("Calculating product metrics...")
        
        product_metrics = orders_df.groupby(['product_id', 'product_name', 'category']).agg({
            'order_id': 'nunique',           # Number of unique orders
            'quantity': 'sum',                # Total quantity sold
            'subtotal': 'sum',                # Total revenue
            'unit_price': 'mean',             # Average price
            'order_date': ['min', 'max']      # First and last sale date
        }).reset_index()
        
        # Flatten column names
        product_metrics.columns = [
            'product_id', 'product_name', 'category',
            'number_of_orders', 'total_quantity_sold', 'total_revenue',
            'avg_unit_price', 'first_sale_date', 'last_sale_date'
        ]
        
        logger.info(f"‚úÖ Calculated metrics for {len(product_metrics)} products")
        
        # 2. Calculate average order value per product
        product_metrics['avg_order_value'] = (
            product_metrics['total_revenue'] / product_metrics['number_of_orders']
        ).round(2)
        
        # 3. Calculate revenue trend (daily average)
        product_metrics['days_in_period'] = (
            pd.to_datetime(product_metrics['last_sale_date']) - 
            pd.to_datetime(product_metrics['first_sale_date'])
        ).dt.days + 1
        
        product_metrics['daily_avg_revenue'] = (
            product_metrics['total_revenue'] / product_metrics['days_in_period']
        ).round(2)
        
        # 4. Add performance ranking
        product_metrics['revenue_rank'] = product_metrics['total_revenue'].rank(
            ascending=False, method='dense'
        ).astype(int)
        
        product_metrics['quantity_rank'] = product_metrics['total_quantity_sold'].rank(
            ascending=False, method='dense'
        ).astype(int)
        
        # 5. Add analysis period
        product_metrics['analysis_start_date'] = start_date.date()
        product_metrics['analysis_end_date'] = end_date.date()
        product_metrics['created_at'] = datetime.now()
        
        # 6. Round numeric columns
        numeric_columns = ['total_revenue', 'avg_unit_price', 'avg_order_value', 'daily_avg_revenue']
        product_metrics[numeric_columns] = product_metrics[numeric_columns].round(2)
        
        logger.info("‚úÖ Transformation complete")
        
        # Display summary
        logger.info("\nüìä PERFORMANCE SUMMARY:")
        logger.info(f"  Total Products: {len(product_metrics)}")
        logger.info(f"  Total Revenue: ${product_metrics['total_revenue'].sum():,.2f}")
        logger.info(f"  Total Quantity Sold: {product_metrics['total_quantity_sold'].sum():,}")
        logger.info(f"  Avg Order Value: ${product_metrics['avg_order_value'].mean():,.2f}")
        
        # ==========================================
        # VALIDATE PHASE
        # ==========================================
        logger.info("=" * 60)
        logger.info("VALIDATION PHASE")
        logger.info("=" * 60)
        
        validator = (
            DataValidator(product_metrics, "product_performance")
            .check_not_null(['product_id', 'product_name', 'category'])
            .check_unique(['product_id'])
            .check_range('total_revenue', 0, float('inf'))
            .check_range('total_quantity_sold', 0, float('inf'))
            .check_range('number_of_orders', 1, float('inf'))
        )
        
        validator.print_report()
        
        # Check if validation passed
        summary = validator.get_summary()
        if summary['failed'] > 0:
            logger.error("‚ùå Validation failed! Aborting load.")
            return None
        
        # ==========================================
        # LOAD PHASE
        # ==========================================
        logger.info("=" * 60)
        logger.info("LOAD PHASE")
        logger.info("=" * 60)
        
        # Create table if not exists
        create_table_sql = """
            CREATE TABLE IF NOT EXISTS analytics.product_performance (
                product_id INTEGER PRIMARY KEY,
                product_name VARCHAR(255),
                category VARCHAR(100),
                number_of_orders INTEGER,
                total_quantity_sold INTEGER,
                total_revenue DECIMAL(15,2),
                avg_unit_price DECIMAL(10,2),
                avg_order_value DECIMAL(10,2),
                daily_avg_revenue DECIMAL(10,2),
                revenue_rank INTEGER,
                quantity_rank INTEGER,
                first_sale_date DATE,
                last_sale_date DATE,
                days_in_period INTEGER,
                analysis_start_date DATE,
                analysis_end_date DATE,
                created_at TIMESTAMP
            );
        """
        
        db.execute_query(create_table_sql, fetch=False)
        logger.info("‚úÖ Table created/verified")
        
        # Load data
        rows_loaded = db.write_dataframe(
            product_metrics,
            table_name='product_performance',
            schema='analytics',
            if_exists='replace'
        )
        
        logger.info(f"‚úÖ Loaded {rows_loaded} rows to analytics.product_performance")
        
        # ==========================================
        # COMPLETION
        # ==========================================
        logger.info("=" * 60)
        logger.info("PIPELINE COMPLETED SUCCESSFULLY")
        logger.info("=" * 60)
        
        return product_metrics
        
    except Exception as e:
        logger.error(f"‚ùå Pipeline failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        raise


# Run the pipeline
if __name__ == "__main__":
    print("\nüöÄ Starting Product Performance Pipeline...\n")
    
    result = product_performance_pipeline()
    
    if result is not None:
        print("\n" + "=" * 60)
        print("üìä TOP 10 PRODUCTS BY REVENUE")
        print("=" * 60)
        top_products = result.nlargest(10, 'total_revenue')[
            ['product_name', 'category', 'total_revenue', 'total_quantity_sold', 'number_of_orders']
        ]
        print(top_products.to_string(index=False))
        
        print("\n" + "=" * 60)
        print("üìà CATEGORY PERFORMANCE")
        print("=" * 60)
        category_summary = result.groupby('category').agg({
            'total_revenue': 'sum',
            'total_quantity_sold': 'sum',
            'product_id': 'count'
        }).round(2)
        category_summary.columns = ['Total Revenue', 'Total Quantity', 'Number of Products']
        print(category_summary.to_string())
        
        print("\n‚úÖ Pipeline completed successfully!")
    else:
        print("\n‚ùå Pipeline failed or no data found!")

2025-12-17 18:20:08,438 - db_connector - INFO - Database connector initialized for data_engineer@postgres
2025-12-17 18:20:08,442 - __main__ - INFO - EXTRACT PHASE
2025-12-17 18:20:08,445 - __main__ - INFO - Extracting orders from 2025-11-17 to 2025-12-17
  df = pd.read_sql_query(query, conn, params=params)
2025-12-17 18:20:08,508 - db_connector - INFO - Query executed, DataFrame shape: (1030, 10)
2025-12-17 18:20:08,510 - __main__ - INFO - ‚úÖ Extracted 1030 order items
2025-12-17 18:20:08,512 - __main__ - INFO - TRANSFORM PHASE
2025-12-17 18:20:08,514 - __main__ - INFO - Calculating product metrics...
2025-12-17 18:20:08,558 - __main__ - INFO - ‚úÖ Calculated metrics for 100 products
2025-12-17 18:20:08,582 - __main__ - INFO - ‚úÖ Transformation complete
2025-12-17 18:20:08,583 - __main__ - INFO - 
üìä PERFORMANCE SUMMARY:
2025-12-17 18:20:08,584 - __main__ - INFO -   Total Products: 100
2025-12-17 18:20:08,585 - __main__ - INFO -   Total Revenue: $1,690,651.81
2025-12-17 18:20:08,5


üöÄ Starting Product Performance Pipeline...



NameError: name 'DataValidator' is not defined

---
## üìö KEY TAKEAWAYS

### ETL Best Practices:
1. **Always log** - Track what's happening
2. **Handle errors** - Use try/except blocks
3. **Validate data** - Check before and after transformations
4. **Use transactions** - Ensure data consistency
5. **Make it idempotent** - Safe to run multiple times
6. **Document** - Clear docstrings and comments
7. **Test** - Unit tests for each function
8. **Monitor** - Track pipeline performance

In [17]:
# Ch·∫°y cell n√†y ƒë·ªÉ xem c·∫•u tr√∫c b·∫£ng
db = DatabaseConnector()

# Ki·ªÉm tra columns c·ªßa order_items
check_query = """
    SELECT column_name, data_type 
    FROM information_schema.columns 
    WHERE table_schema = 'analytics' 
      AND table_name = 'order_items'
    ORDER BY ordinal_position
"""

columns = db.read_sql(check_query)
print("üìã Columns in analytics.order_items:")
print(columns)

2025-12-17 18:19:55,641 - db_connector - INFO - Database connector initialized for data_engineer@postgres
2025-12-17 18:19:55,726 - db_connector - INFO - Query executed, DataFrame shape: (8, 2)


üìã Columns in analytics.order_items:
        column_name                    data_type
0     order_item_id                      integer
1          order_id                      integer
2        product_id                      integer
3          quantity                      integer
4        unit_price                      numeric
5  discount_percent                      numeric
6        line_total                      numeric
7        created_at  timestamp without time zone
