In [None]:
def export_complete_dataset_no_filters():
    """Export every row of the flat and crosstab views to CSV plus a JSON summary.

    Runs two unfiltered SELECTs (``gold.v_transactions_flat`` and
    ``gold.v_transactions_crosstab``), writes each result to a timestamped
    CSV under ``../exports`` and writes a JSON summary file next to them.
    Progress and totals are printed to stdout.

    Uses the module-level ``conn_str`` for the database connection.

    Returns:
        dict: The summary written to the JSON file, with the keys
        ``export_timestamp``, ``dataset_type``, ``files_created``,
        ``row_counts`` and ``data_quality``.
    """
    # Local import so this block does not rely on a file-level import.
    from contextlib import closing

    print("🚀 Exporting COMPLETE dataset - NO date filters, ALL canonical transaction IDs")

    export_dir = '../exports'
    os.makedirs(export_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Complete flat dataset - NO date filters, ALL canonical IDs
    complete_flat_query = '''
        SELECT 
            CanonicalTxID,
            TransactionID,
            DeviceID,
            StoreID,
            StoreName,
            brand,
            product_name,
            category,
            Amount,
            Basket_Item_Count,
            payment_method,
            audio_transcript,
            Txn_TS,
            daypart,
            weekday_weekend,
            transaction_date
        FROM gold.v_transactions_flat
        -- NO DATE FILTER - GET ALL ROWS
        ORDER BY CanonicalTxID
    '''

    # Complete crosstab - ALL data
    complete_crosstab_query = '''
        SELECT 
            [date],
            store_id,
            store_name,
            municipality_name,
            daypart,
            brand,
            txn_count,
            total_amount,
            avg_basket_amount,
            substitution_events
        FROM gold.v_transactions_crosstab
        -- NO DATE FILTER - GET ALL ROWS
        ORDER BY [date] DESC, store_id, brand
    '''

    # pyodbc's own context manager only commits on exit and leaves the
    # connection open; closing() releases it as soon as both reads finish.
    with closing(pyodbc.connect(conn_str)) as conn:
        print("📊 Extracting COMPLETE flat dataset (no filters)...")
        df_complete_flat = pd.read_sql(complete_flat_query, conn)

        print("📊 Extracting COMPLETE crosstab dataset (no filters)...")
        df_complete_crosstab = pd.read_sql(complete_crosstab_query, conn)

    # Export complete datasets
    complete_flat_file = f'{export_dir}/scout_flat_COMPLETE_{timestamp}.csv'
    complete_crosstab_file = f'{export_dir}/scout_crosstab_COMPLETE_{timestamp}.csv'

    df_complete_flat.to_csv(complete_flat_file, index=False)
    df_complete_crosstab.to_csv(complete_crosstab_file, index=False)

    # Compute each figure once; it is reused by the JSON summary and the
    # console report below.
    total_rows = len(df_complete_flat)
    rows_with_ts = int(df_complete_flat['Txn_TS'].notna().sum())
    rows_without_ts = total_rows - rows_with_ts
    total_canonical_ids = df_complete_flat['CanonicalTxID'].nunique()
    crosstab_rows = len(df_complete_crosstab)

    # An all-null (or empty) date column has no meaningful min/max.
    tx_dates = df_complete_flat['transaction_date']
    if tx_dates.isna().all():
        min_date = max_date = 'N/A'
    else:
        min_date, max_date = str(tx_dates.min()), str(tx_dates.max())

    # Summary report
    complete_summary = {
        'export_timestamp': datetime.now().isoformat(),
        'dataset_type': 'COMPLETE - NO FILTERS',
        'files_created': {
            'complete_flat': complete_flat_file,
            'complete_crosstab': complete_crosstab_file
        },
        'row_counts': {
            'flat_total': total_rows,
            'flat_with_timestamps': rows_with_ts,
            'flat_without_timestamps': rows_without_ts,
            'crosstab_total': crosstab_rows
        },
        'data_quality': {
            'unique_canonical_ids': total_canonical_ids,
            'date_range': {
                'min_date': min_date,
                'max_date': max_date
            },
            'stores_covered': df_complete_flat['StoreID'].nunique(),
            'brands_covered': df_complete_flat['brand'].nunique()
        }
    }

    # Save summary
    summary_file = f'{export_dir}/COMPLETE_export_summary_{timestamp}.json'
    with open(summary_file, 'w') as f:
        json.dump(complete_summary, f, indent=2)

    print(f"\n✅ COMPLETE FLAT DATASET: {complete_flat_file}")
    print(f"   📊 Total rows: {total_rows:,}")
    print(f"   ✅ With timestamps: {rows_with_ts:,}")
    print(f"   ⚠️  Without timestamps: {rows_without_ts:,}")
    print(f"   🆔 Unique canonical IDs: {total_canonical_ids:,}")

    print(f"\n✅ COMPLETE CROSSTAB DATASET: {complete_crosstab_file}")
    print(f"   📊 Total rows: {crosstab_rows:,}")

    print(f"\n📋 Summary report: {summary_file}")

    # The original branched on row-count thresholds (12000 / 6000) but every
    # branch printed the same line, so a single print is equivalent.
    print(f"\n🎯 YOU HAVE {total_rows:,} TOTAL ROWS ({total_canonical_ids:,} unique canonical IDs)")

    return complete_summary

# Run the unfiltered export when this cell executes and keep the returned
# summary dict for later inspection.
complete_export_summary = export_complete_dataset_no_filters()

## 16. Full Dataset Export (Complete 12K+ Rows)

In [None]:
def export_with_identical_ordering():
    """Export the flat, v24 and crosstab views with a fixed ORDER BY.

    Each view is read with ``SELECT *`` and a deterministic ordering, written
    to a timestamped CSV under ``../exports``, and a JSON validation report
    is written next to the CSVs.

    The parity verdict compares row counts of the flat and v24 views only;
    the column lists of both are recorded in the report but not compared.

    Uses the module-level ``conn_str`` for the database connection.

    Returns:
        dict: The validation report (``export_timestamp``, ``files_created``,
        ``row_counts``, ``column_counts``, ``data_parity_check``).
    """
    # Local import so this block does not rely on a file-level import.
    from contextlib import closing

    print("🔄 Exporting with identical ordering for data parity validation...")

    export_dir = '../exports'
    os.makedirs(export_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # 1. Export from gold.v_transactions_flat (source of truth)
    flat_query = '''
        SELECT *
        FROM gold.v_transactions_flat
        ORDER BY CanonicalTxID, TransactionID
    '''

    # 2. Export from gold.v_transactions_flat_v24 (compatibility view)
    v24_query = '''
        SELECT *
        FROM gold.v_transactions_flat_v24
        ORDER BY CanonicalTxID, TransactionID
    '''

    # 3. Export crosstab with consistent ordering
    crosstab_query = '''
        SELECT *
        FROM gold.v_transactions_crosstab
        ORDER BY [date] DESC, store_id, brand
    '''

    # pyodbc's own context manager only commits on exit and leaves the
    # connection open; closing() releases it once all three reads finish.
    with closing(pyodbc.connect(conn_str)) as conn:
        print("📊 Extracting flat view (gold.v_transactions_flat)...")
        df_flat = pd.read_sql(flat_query, conn)

        print("📊 Extracting v24 compatibility view...")
        df_v24 = pd.read_sql(v24_query, conn)

        print("📊 Extracting crosstab view...")
        df_crosstab = pd.read_sql(crosstab_query, conn)

    # Export with identical naming convention
    flat_file = f'{export_dir}/scout_flat_ordered_{timestamp}.csv'
    v24_file = f'{export_dir}/scout_v24_ordered_{timestamp}.csv'
    crosstab_file = f'{export_dir}/scout_crosstab_ordered_{timestamp}.csv'

    df_flat.to_csv(flat_file, index=False)
    df_v24.to_csv(v24_file, index=False)
    df_crosstab.to_csv(crosstab_file, index=False)

    flat_rows, v24_rows, crosstab_rows = len(df_flat), len(df_v24), len(df_crosstab)
    rows_match = flat_rows == v24_rows

    # Data parity validation
    validation_results = {
        'export_timestamp': datetime.now().isoformat(),
        'files_created': {
            'flat': flat_file,
            'v24': v24_file,
            'crosstab': crosstab_file
        },
        'row_counts': {
            'flat': flat_rows,
            'v24': v24_rows,
            'crosstab': crosstab_rows
        },
        'column_counts': {
            'flat': len(df_flat.columns),
            'v24': len(df_v24.columns),
            'crosstab': len(df_crosstab.columns)
        },
        'data_parity_check': {
            'flat_v24_row_match': rows_match,
            'flat_columns': list(df_flat.columns),
            'v24_columns': list(df_v24.columns)
        }
    }

    # Save validation report
    validation_file = f'{export_dir}/data_parity_validation_{timestamp}.json'
    with open(validation_file, 'w') as f:
        json.dump(validation_results, f, indent=2)

    print(f"✅ Flat view exported: {flat_file} ({flat_rows:,} rows)")
    print(f"✅ V24 view exported: {v24_file} ({v24_rows:,} rows)")
    print(f"✅ Crosstab exported: {crosstab_file} ({crosstab_rows:,} rows)")
    print(f"📋 Validation report: {validation_file}")

    # Data quality summary
    print("\n🔍 Data Parity Validation:")
    print(f"  Flat vs V24 row count match: {rows_match}")
    print(f"  Flat columns: {len(df_flat.columns)}")
    print(f"  V24 columns: {len(df_v24.columns)}")
    print(f"  Crosstab records: {crosstab_rows:,}")

    if rows_match:
        print("✅ Data parity validation PASSED")
    else:
        print("⚠️  Data parity validation needs review")
        print(f"   Flat rows: {flat_rows} vs V24 rows: {v24_rows}")

    return validation_results

# Run the ordered export when this cell executes and keep the returned
# validation report dict.
parity_results = export_with_identical_ordering()

In [None]:
def export_enhanced_excel_crosstab():
    """Export a 90-day crosstab of the flat view to a multi-sheet Excel file.

    The query aggregates ``gold.v_transactions_flat`` per date, store,
    daypart, brand, category and weekday/weekend flag for rows with a
    timestamp in the last 90 days. The result is written to the
    ``Enhanced_Crosstab`` sheet; five further sheets hold pandas roll-ups
    of that same frame.

    Uses the module-level ``conn_str`` for the database connection.

    Returns:
        tuple: ``(excel_file, df_enhanced_crosstab)`` - the path of the
        workbook and the DataFrame behind its main sheet.
    """
    # Local import so this block does not rely on a file-level import.
    from contextlib import closing

    # Extract comprehensive crosstab with all business metrics.
    # The store key is read as StoreID (the column name the other queries on
    # this view use) and aliased to store_id for the downstream group-bys.
    enhanced_crosstab_query = '''
        WITH DaypartBrandMetrics AS (
            SELECT 
                CAST(flat.transaction_date AS date) as [date],
                flat.StoreID as store_id,
                flat.StoreName,
                'Metro Manila' as municipality_name,  -- Simplified for now
                flat.daypart,
                flat.brand,
                flat.category,
                flat.weekday_weekend,
                COUNT(DISTINCT flat.CanonicalTxID) as txn_count,
                SUM(flat.Amount) as total_amount,
                AVG(flat.Amount) as avg_transaction_value,
                SUM(flat.Basket_Item_Count) as total_items,
                AVG(flat.Basket_Item_Count) as avg_basket_amount,
                COUNT(DISTINCT CASE WHEN flat.payment_method IS NOT NULL THEN flat.payment_method END) as payment_methods_used,
                
                -- Business intelligence metrics
                CASE 
                    WHEN flat.daypart = 'Morning' AND flat.category LIKE '%snack%' THEN 'Morning_Snack_Peak'
                    WHEN flat.daypart = 'Evening' AND flat.category LIKE '%beverage%' THEN 'Evening_Beverage_Peak'
                    ELSE 'Regular_Pattern'
                END as peak_pattern,
                
                -- Substitution risk indicators (placeholder)
                CASE 
                    WHEN flat.brand IN ('Oishi', 'Jack n Jill') THEN 1
                    WHEN flat.brand IN ('Bear Brand', 'Alaska') THEN 1
                    ELSE 0
                END as substitution_events
                
            FROM gold.v_transactions_flat flat
            WHERE flat.Txn_TS IS NOT NULL
            AND flat.transaction_date >= DATEADD(day, -90, SYSUTCDATETIME())  -- Last 90 days
            GROUP BY 
                CAST(flat.transaction_date AS date),
                flat.StoreID,
                flat.StoreName,
                flat.daypart,
                flat.brand,
                flat.category,
                flat.weekday_weekend
        )
        SELECT 
            [date],
            store_id,
            StoreName as store_name,
            municipality_name,
            daypart,
            brand,
            category,
            weekday_weekend,
            txn_count,
            total_amount,
            avg_transaction_value,
            total_items,
            avg_basket_amount,
            payment_methods_used,
            peak_pattern,
            substitution_events
        FROM DaypartBrandMetrics
        ORDER BY [date] DESC, total_amount DESC
    '''

    # pyodbc's own context manager only commits on exit and leaves the
    # connection open; closing() releases it as soon as the read finishes.
    with closing(pyodbc.connect(conn_str)) as conn:
        df_enhanced_crosstab = pd.read_sql(enhanced_crosstab_query, conn)

    # Create export directory
    export_dir = '../exports'
    os.makedirs(export_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Excel export with multiple sheets
    excel_file = f'{export_dir}/scout_crosstab_enhanced_{timestamp}.xlsx'

    with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
        # Main crosstab data
        df_enhanced_crosstab.to_excel(writer, sheet_name='Enhanced_Crosstab', index=False)

        # Business analytics sheets.
        # Note: the 'mean' aggregations below average the per-group averages
        # from the query, i.e. they are unweighted means of means.

        # 1. Daypart Analysis (Time of Day Framework)
        daypart_analysis = df_enhanced_crosstab.groupby(['daypart', 'category']).agg({
            'txn_count': 'sum',
            'total_amount': 'sum',
            'avg_transaction_value': 'mean'
        }).round(2).reset_index()
        daypart_analysis.to_excel(writer, sheet_name='Daypart_Analysis', index=False)

        # 2. Brand Performance by Daypart
        brand_daypart = df_enhanced_crosstab.groupby(['brand', 'daypart']).agg({
            'txn_count': 'sum',
            'total_amount': 'sum',
            'substitution_events': 'sum'
        }).round(2).reset_index()
        brand_daypart.to_excel(writer, sheet_name='Brand_Daypart', index=False)

        # 3. Weekend vs Weekday Patterns
        weekend_analysis = df_enhanced_crosstab.groupby(['weekday_weekend', 'category']).agg({
            'txn_count': 'sum',
            'total_amount': 'sum',
            'avg_basket_amount': 'mean'
        }).round(2).reset_index()
        weekend_analysis.to_excel(writer, sheet_name='Weekend_Patterns', index=False)

        # 4. Store Performance Rankings (highest revenue first)
        store_performance = df_enhanced_crosstab.groupby(['store_id', 'store_name']).agg({
            'txn_count': 'sum',
            'total_amount': 'sum',
            'avg_transaction_value': 'mean',
            'substitution_events': 'sum'
        }).round(2).reset_index().sort_values('total_amount', ascending=False)
        store_performance.to_excel(writer, sheet_name='Store_Performance', index=False)

        # 5. Peak Pattern Summary
        peak_summary = df_enhanced_crosstab.groupby('peak_pattern').agg({
            'txn_count': 'sum',
            'total_amount': 'sum',
            'brand': 'nunique'
        }).round(2).reset_index()
        peak_summary.to_excel(writer, sheet_name='Peak_Patterns', index=False)

    print(f"✅ Enhanced Excel crosstab exported: {excel_file}")
    print("📊 Sheets created: Enhanced_Crosstab, Daypart_Analysis, Brand_Daypart, Weekend_Patterns, Store_Performance, Peak_Patterns")
    print(f"📈 Records: {len(df_enhanced_crosstab):,} rows")

    return excel_file, df_enhanced_crosstab

def export_enriched_flat_csv():
    """Export the last 90 days of flat transactions with derived tier columns.

    The query reads ``gold.v_transactions_flat`` (rows with a timestamp,
    last 90 days) and adds SQL ``CASE``-derived columns: value tier, basket
    size tier, shopping context, purchase motivation, location type and
    hour / day-of-week / month parts. The result is written to a timestamped
    CSV under ``../exports`` together with a JSON file of summary statistics.

    Uses the module-level ``conn_str`` for the database connection.

    Returns:
        tuple: ``(csv_file, df_enriched_flat, summary_stats)`` - the CSV
        path, the exported DataFrame and the summary dict written to JSON.
    """
    # Local import so this block does not rely on a file-level import.
    from contextlib import closing

    # Extract comprehensive flat data with enrichments
    enriched_flat_query = '''
        SELECT 
            flat.CanonicalTxID,
            flat.TransactionID,
            flat.DeviceID,
            flat.StoreID,
            flat.StoreName,
            flat.brand,
            flat.product_name,
            flat.category,
            flat.Amount,
            flat.Basket_Item_Count,
            flat.payment_method,
            flat.audio_transcript,
            flat.Txn_TS,
            flat.daypart,
            flat.weekday_weekend,
            flat.transaction_date,
            
            -- Business intelligence enrichments
            CASE 
                WHEN flat.Amount > 500 THEN 'High_Value'
                WHEN flat.Amount > 100 THEN 'Medium_Value'
                ELSE 'Low_Value'
            END as transaction_value_tier,
            
            CASE 
                WHEN flat.Basket_Item_Count > 10 THEN 'Large_Basket'
                WHEN flat.Basket_Item_Count > 5 THEN 'Medium_Basket'
                ELSE 'Small_Basket'
            END as basket_size_tier,
            
            CASE 
                WHEN flat.daypart = 'Morning' AND flat.weekday_weekend = 'weekday' THEN 'Commuter_Rush'
                WHEN flat.daypart = 'Evening' AND flat.weekday_weekend = 'weekend' THEN 'Social_Evening'
                WHEN flat.daypart = 'Afternoon' AND flat.weekday_weekend = 'weekend' THEN 'Family_Shopping'
                ELSE 'Regular_Shopping'
            END as shopping_context,
            
            -- Customer behavior indicators
            CASE 
                WHEN flat.category LIKE '%snack%' AND flat.daypart = 'Morning' THEN 'Impulse_Snacking'
                WHEN flat.category LIKE '%beverage%' AND flat.daypart = 'Evening' THEN 'Social_Drinking'
                WHEN flat.category LIKE '%essential%' THEN 'Necessity_Purchase'
                ELSE 'Regular_Purchase'
            END as purchase_motivation,
            
            -- Geographic context (simplified for demo)
            CASE 
                WHEN flat.StoreID IN (102, 103) THEN 'Urban_Core'
                WHEN flat.StoreID IN (109, 110) THEN 'Suburban'
                ELSE 'Rural'
            END as location_type,
            
            -- Time-based features for analysis
            DATEPART(hour, flat.Txn_TS) as transaction_hour,
            DATEPART(dayofweek, flat.transaction_date) as day_of_week,
            DATEPART(month, flat.transaction_date) as transaction_month
            
        FROM gold.v_transactions_flat flat
        WHERE flat.Txn_TS IS NOT NULL
        AND flat.transaction_date >= DATEADD(day, -90, SYSUTCDATETIME())  -- Last 90 days
        ORDER BY flat.transaction_date DESC, flat.Txn_TS DESC
    '''

    # pyodbc's own context manager only commits on exit and leaves the
    # connection open; closing() releases it as soon as the read finishes.
    with closing(pyodbc.connect(conn_str)) as conn:
        df_enriched_flat = pd.read_sql(enriched_flat_query, conn)

    # Create export directory
    export_dir = '../exports'
    os.makedirs(export_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # CSV export
    csv_file = f'{export_dir}/scout_flat_enriched_complete_{timestamp}.csv'
    df_enriched_flat.to_csv(csv_file, index=False)

    def _distribution(column):
        """Return value -> row count for one of the derived tier columns."""
        return df_enriched_flat[column].value_counts().to_dict()

    # Create summary statistics
    tx_dates = df_enriched_flat['transaction_date']
    summary_stats = {
        'total_transactions': len(df_enriched_flat),
        'date_range': {
            'start': tx_dates.min(),
            'end': tx_dates.max()
        },
        'value_distribution': _distribution('transaction_value_tier'),
        'basket_distribution': _distribution('basket_size_tier'),
        'shopping_context': _distribution('shopping_context'),
        'location_distribution': _distribution('location_type')
    }

    # Save summary as JSON; default=str covers the date values, which the
    # json module cannot encode natively.
    summary_file = f'{export_dir}/scout_flat_summary_{timestamp}.json'
    with open(summary_file, 'w') as f:
        json.dump(summary_stats, f, indent=2, default=str)

    print(f"✅ Enriched flat CSV exported: {csv_file}")
    print(f"📊 Summary statistics: {summary_file}")
    print(f"📈 Records: {len(df_enriched_flat):,} rows")
    print(f"📅 Date range: {summary_stats['date_range']['start']} to {summary_stats['date_range']['end']}")
    print(f"💰 Value tiers: {summary_stats['value_distribution']}")

    return csv_file, df_enriched_flat, summary_stats

# Run both enhanced exports, then print a combined recap.
print("🚀 Starting enhanced export process...")
excel_file, df_crosstab = export_enhanced_excel_crosstab()
csv_file, df_flat, summary = export_enriched_flat_csv()

# One print call with joined lines emits the same text as separate calls.
print("\n".join([
    "\n📋 Enhanced Export Summary:",
    f"✅ Excel crosstab with business analytics: {excel_file}",
    f"✅ Enriched flat CSV with context: {csv_file}",
    f"📊 Total data points: {len(df_crosstab) + len(df_flat):,}",
    "🎯 Ready for business research framework analysis",
]))

In [None]:
def run_ai_enhanced_etl_pipeline(export_full=False):
    """Run the feature extraction, modeling and export steps end to end.

    Calls, in order: ``run_health_check``, ``extract_customer_features``,
    ``prepare_ml_features``, the three model builders and
    ``save_models_and_create_scoring_functions``. It then writes an enriched
    customer CSV and a dashboard JSON file under ``../exports``.

    Args:
        export_full (bool): When true, also call ``run_full_export_pipeline``.

    Returns:
        dict: On success, a summary with ``status == 'success'``, the
        duration, model flags, export paths and ML metrics. On any exception
        the error is printed and a dict with ``status == 'failed'`` and the
        error text is returned instead of raising.
    """
    pipeline_start = datetime.now()
    print(f"AI-Enhanced ETL Pipeline started at: {pipeline_start}")
    print("=" * 60)

    try:
        # CRISP-DM Phase 1: Business Understanding (already defined)
        print("\nCRISP-DM PHASE 1: Business Understanding ✅")
        print("- Customer segmentation requirements defined")
        print("- CLV prediction objectives established")
        print("- Anomaly detection goals outlined")

        # CRISP-DM Phase 2: Data Understanding
        print("\nCRISP-DM PHASE 2: Data Understanding")
        run_health_check()
        customer_features = extract_customer_features()
        print(f"✅ Extracted {len(customer_features)} customer profiles")

        # CRISP-DM Phase 3: Data Preparation
        print("\nCRISP-DM PHASE 3: Data Preparation")
        X_raw, X_scaled, scaler, feature_cols = prepare_ml_features(customer_features)
        print(f"✅ Prepared {len(feature_cols)} ML features")

        # CRISP-DM Phase 4: Modeling
        print("\nCRISP-DM PHASE 4: Modeling")

        # Customer Segmentation
        seg_model, clusters, cluster_stats = build_customer_segmentation_model(X_scaled.values)
        print(f"✅ Built customer segmentation model with {len(cluster_stats)} segments")

        # CLV Prediction (the builder returns None for the model when there
        # are too few customers with purchases)
        clv_model, clv_metrics, clv_importance = build_clv_prediction_model(customer_features, X_raw)
        has_clv_model = clv_model is not None
        if has_clv_model:
            print(f"✅ Built CLV prediction model (R² = {clv_metrics['r2']:.3f})")

        # Anomaly Detection (label -1 marks an anomaly)
        anomaly_model, anomaly_labels, anomaly_scores = build_anomaly_detection_model(X_scaled.values)
        anomaly_flags = (anomaly_labels == -1).astype(int)
        print(f"✅ Built anomaly detection model ({anomaly_flags.sum()} anomalies detected)")

        # CRISP-DM Phase 5: Evaluation
        print("\nCRISP-DM PHASE 5: Evaluation")
        # NOTE(review): this helper takes no arguments, so it cannot receive
        # the models built above - confirm it saves the intended objects.
        save_models_and_create_scoring_functions()
        print("✅ Models evaluated and saved for production")

        # CRISP-DM Phase 6: Deployment
        print("\nCRISP-DM PHASE 6: Deployment")

        # Export enhanced customer data with ML predictions
        enhanced_customers = customer_features.copy()
        enhanced_customers['customer_segment'] = clusters
        enhanced_customers['segment_name'] = enhanced_customers['customer_segment'].map(
            dict(zip(cluster_stats['cluster'], cluster_stats['cluster_name']))
        )
        enhanced_customers['is_anomaly'] = anomaly_flags
        enhanced_customers['anomaly_score'] = anomaly_scores

        if has_clv_model:
            # Only customers with purchases are scored; everyone else gets 0.
            purchase_mask = enhanced_customers['total_purchases'] > 0
            enhanced_customers.loc[purchase_mask, 'predicted_clv'] = clv_model.predict(X_raw[purchase_mask])
            enhanced_customers['predicted_clv'] = enhanced_customers['predicted_clv'].fillna(0)

        # Export AI-enhanced datasets
        export_dir = '../exports'
        os.makedirs(export_dir, exist_ok=True)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Enhanced customer analytics export
        customer_file = f'{export_dir}/scout_customer_analytics_ai_{timestamp}.csv'
        enhanced_customers.to_csv(customer_file, index=False)
        print(f"✅ Exported AI-enhanced customer analytics: {customer_file}")

        # Cast numpy scalars to plain Python numbers so they are written as
        # JSON numbers instead of being stringified by default=str.
        customers_analyzed = len(enhanced_customers)
        anomalies_detected = int(enhanced_customers['is_anomaly'].sum())
        anomaly_rate = float(enhanced_customers['is_anomaly'].mean())

        clv_summary = None
        if has_clv_model:
            predicted_clv = enhanced_customers['predicted_clv']
            clv_summary = {
                'avg_predicted_clv': float(predicted_clv.mean()),
                'high_value_customers': int((predicted_clv > 1000).sum())
            }

        # Business intelligence dashboard data
        dashboard_data = {
            'segment_summary': cluster_stats.to_dict('records'),
            'anomaly_summary': {
                'total_customers': customers_analyzed,
                'anomalies_detected': anomalies_detected,
                'anomaly_rate': anomaly_rate
            },
            'clv_summary': clv_summary
        }

        dashboard_file = f'{export_dir}/scout_dashboard_data_{timestamp}.json'
        with open(dashboard_file, 'w') as f:
            json.dump(dashboard_data, f, indent=2, default=str)
        print(f"✅ Exported dashboard data: {dashboard_file}")

        # Traditional ETL exports (if requested)
        if export_full:
            run_full_export_pipeline()
            print("✅ Completed traditional ETL exports")

        # Pipeline success summary
        duration = (datetime.now() - pipeline_start).total_seconds()

        ai_pipeline_summary = {
            'status': 'success',
            'pipeline_type': 'AI-Enhanced ETL with CRISP-DM',
            'duration_seconds': duration,
            'crisp_dm_phases_completed': 6,
            'models_built': {
                'customer_segmentation': True,
                'clv_prediction': has_clv_model,
                'anomaly_detection': True
            },
            'exports_created': {
                'customer_analytics': customer_file,
                'dashboard_data': dashboard_file,
                'traditional_etl': export_full
            },
            'ml_metrics': {
                'customers_analyzed': customers_analyzed,
                'segments_created': len(cluster_stats),
                'anomalies_detected': anomalies_detected,
                'clv_r2_score': clv_metrics['r2'] if has_clv_model else None
            }
        }

        print("\n🎯 AI-Enhanced ETL Pipeline completed successfully!")
        print(f"⏱️  Duration: {duration:.2f} seconds")
        print("📊 CRISP-DM methodology: All 6 phases completed")
        print("🤖 ML models deployed and ready for production")

        return ai_pipeline_summary

    except Exception as e:
        # Top-level boundary: report the failure and return a status dict so
        # the calling cell can still print a result.
        print(f"\n❌ AI-Enhanced ETL Pipeline failed: {str(e)}")
        return {
            'status': 'failed',
            'error': str(e),
            'pipeline_type': 'AI-Enhanced ETL with CRISP-DM'
        }

# Run the AI-enhanced pipeline (without the traditional exports) and print
# the returned summary; default=str covers values json cannot encode natively.
ai_result = run_ai_enhanced_etl_pipeline(export_full=False)
print("\n📋 Final Pipeline Result:")
print(json.dumps(ai_result, indent=2, default=str))

## 14. Enhanced ETL Pipeline with AI/ML Integration

In [None]:
def save_models_and_create_scoring_functions():
    """Persist the trained models with joblib under ``../models``.

    Reads the module-level ``segmentation_model``, ``scaler``, ``clv_model``
    and ``anomaly_model``; the CLV model is skipped when it is ``None``.
    Despite the name, no scoring function is created here.

    Returns:
        dict | None: Mapping of model label to the file path written, or
        ``None`` when any step raised (the error is printed).
    """
    target_dir = '../models'
    os.makedirs(target_dir, exist_ok=True)

    written = {}

    def _persist(label, artifact, filename):
        """Dump one artifact and record where it went."""
        destination = f'{target_dir}/{filename}'
        joblib.dump(artifact, destination)
        written[label] = destination

    try:
        _persist('segmentation', segmentation_model, 'customer_segmentation_model.pkl')
        _persist('scaler', scaler, 'feature_scaler.pkl')

        # The CLV model is optional.
        if clv_model is not None:
            _persist('clv', clv_model, 'clv_prediction_model.pkl')

        _persist('anomaly', anomaly_model, 'anomaly_detection_model.pkl')

        print("Models saved successfully:")
        for label, destination in written.items():
            print(f"  {label}: {destination}")

    except Exception as e:
        print(f"Error saving models: {str(e)}")
        return None

    return written

def create_production_scoring_function():
    """Write a standalone ``score_new_customer`` module to ``../models``.

    The source of the scoring function is held in a string literal and
    written verbatim to ``../models/scoring_function.py``. The generated
    function loads the saved models with joblib, scales a single customer's
    features and returns the segment, CLV prediction and anomaly verdict.

    Returns:
        str: The source code that was written to the file.
    """

    # NOTE: the feature list and cluster-name mapping below are hard-coded in
    # the generated code; they must match whatever the models were trained on.
    scoring_function = '''
def score_new_customer(customer_features, models_dir='../models'):
    \"\"\"
    Score new customer with segmentation, CLV prediction, and anomaly detection
    
    Args:
        customer_features (dict): Customer feature dictionary
        models_dir (str): Path to saved models
    
    Returns:
        dict: Scoring results
    \"\"\"
    import joblib
    import pandas as pd
    import numpy as np
    
    # Load models
    segmentation_model = joblib.load(f'{models_dir}/customer_segmentation_model.pkl')
    scaler = joblib.load(f'{models_dir}/feature_scaler.pkl')
    anomaly_model = joblib.load(f'{models_dir}/anomaly_detection_model.pkl')
    
    try:
        clv_model = joblib.load(f'{models_dir}/clv_prediction_model.pkl')
    except Exception:
        clv_model = None
    
    # Prepare features
    feature_cols = [
        'total_interactions', 'stores_visited', 'customer_lifetime_days',
        'total_purchases', 'total_spent', 'avg_transaction_value',
        'total_items_purchased', 'brands_purchased', 'dayparts_active',
        'avg_interactions_per_day', 'purchase_conversion_rate', 
        'avg_items_per_transaction', 'weekend_preference',
        'recency_score', 'frequency_score', 'monetary_score',
        'engagement_intensity', 'cross_store_mobility', 'brand_loyalty'
    ]
    
    # Convert to DataFrame and scale
    X = pd.DataFrame([customer_features])[feature_cols]
    X_scaled = scaler.transform(X)
    
    # Make predictions
    segment = segmentation_model.predict(X_scaled)[0]
    anomaly_score = anomaly_model.decision_function(X_scaled)[0]
    is_anomaly = anomaly_model.predict(X_scaled)[0] == -1
    
    clv_prediction = None
    if clv_model is not None and customer_features.get('total_purchases', 0) > 0:
        clv_prediction = clv_model.predict(X)[0]
    
    # Cluster name mapping
    cluster_names = {0: 'VIP_High_Value', 1: 'Premium_Loyal', 2: 'Regular_Active', 
                    3: 'Casual_Shoppers', 4: 'New_Customers'}
    
    return {
        'customer_segment': segment,
        'segment_name': cluster_names.get(segment, 'Unknown'),
        'predicted_clv': clv_prediction,
        'anomaly_score': anomaly_score,
        'is_anomaly': is_anomaly,
        'risk_level': 'HIGH' if is_anomaly else 'LOW'
    }
    '''

    # Make sure the target directory exists so this function also works when
    # it is called before any model has been saved.
    os.makedirs('../models', exist_ok=True)

    # Save scoring function to file
    with open('../models/scoring_function.py', 'w') as f:
        f.write(scoring_function)

    print("Production scoring function created: ../models/scoring_function.py")

    return scoring_function

# Save models and create scoring functions
saved_models = save_models_and_create_scoring_functions()
scoring_func = create_production_scoring_function()

# Model evaluation summary.
# Numpy scalars are cast to plain Python numbers: json.dumps raises TypeError
# on numpy integers such as the result of Series.sum().
evaluation_summary = {
    'segmentation_model': {
        'algorithm': 'K-Means',
        'clusters': len(cluster_analysis),
        'silhouette_score': float(silhouette_score(X_scaled.values, customer_clusters))
    },
    'clv_model': clv_metrics if clv_model is not None else None,
    'anomaly_model': {
        'algorithm': 'Isolation Forest',
        'contamination_rate': 0.1,
        'anomalies_detected': int(df_customers['is_anomaly'].sum())
    }
}

print("\nModel Evaluation Summary:")
# default=str is a safety net for any remaining non-native values in clv_metrics.
print(json.dumps(evaluation_summary, indent=2, default=str))

## 13. CRISP-DM Phase 5: Evaluation & Production Deployment

In [None]:
def build_anomaly_detection_model(X_scaled):
    """Fit an Isolation Forest on the scaled features and report anomalies.

    Side effect: writes ``is_anomaly`` and ``anomaly_score`` columns into the
    module-level ``df_customers`` DataFrame, so ``X_scaled`` must have one
    row per row of ``df_customers``.

    Args:
        X_scaled: Scaled feature matrix passed to ``fit_predict`` and
            ``decision_function``.

    Returns:
        tuple: ``(isolation_forest, anomaly_labels, anomaly_scores)`` - the
        fitted model, the raw labels (-1 anomaly, 1 normal) and the decision
        function scores.
    """

    # Train Isolation Forest model
    isolation_forest = IsolationForest(
        contamination=0.1,  # Expect 10% anomalies
        random_state=42,
        n_estimators=100
    )

    # Fit and predict anomalies
    anomaly_labels = isolation_forest.fit_predict(X_scaled)
    anomaly_scores = isolation_forest.decision_function(X_scaled)

    # Convert labels: -1 (anomaly) to 1, 1 (normal) to 0
    is_anomaly = (anomaly_labels == -1).astype(int)
    anomaly_share = is_anomaly.mean()

    print("Anomaly Detection Results:")
    print(f"Total customers analyzed: {len(X_scaled)}")
    print(f"Anomalies detected: {is_anomaly.sum()} ({anomaly_share*100:.1f}%)")
    print(f"Normal customers: {(1-is_anomaly).sum()} ({(1-anomaly_share)*100:.1f}%)")

    # Analyze anomaly characteristics (mutates the module-level frame)
    df_customers['is_anomaly'] = is_anomaly
    df_customers['anomaly_score'] = anomaly_scores

    # Compare anomaly vs normal customer characteristics
    comparison = df_customers.groupby('is_anomaly').agg({
        'total_interactions': ['mean', 'median'],
        'total_spent': ['mean', 'median'],
        'stores_visited': ['mean', 'median'],
        'customer_lifetime_days': ['mean', 'median'],
        'purchase_conversion_rate': ['mean', 'median']
    }).round(2)

    print("\nAnomaly vs Normal Customer Comparison:")
    print(comparison)

    # Identify top anomalies: the five flagged rows with the lowest scores
    top_anomalies = df_customers[df_customers['is_anomaly'] == 1].nsmallest(5, 'anomaly_score')
    print("\nTop 5 Anomalous Customers:")
    print(top_anomalies[['FacialID', 'total_interactions', 'total_spent', 'stores_visited', 'anomaly_score']].to_string(index=False))

    return isolation_forest, anomaly_labels, anomaly_scores

# Build anomaly detection model (also adds is_anomaly / anomaly_score to df_customers)
anomaly_model, anomaly_labels, anomaly_scores = build_anomaly_detection_model(X_scaled.values)

# Summary of anomaly detection insights
anomaly_summary = {
    'total_customers': len(df_customers),
    'anomalies_detected': df_customers['is_anomaly'].sum(),
    'anomaly_rate': df_customers['is_anomaly'].mean(),
    'avg_anomaly_score': df_customers[df_customers['is_anomaly'] == 1]['anomaly_score'].mean()
}

print(f"\nAnomaly Detection Summary: {anomaly_summary}")

## 12. CRISP-DM Phase 4: Modeling - Anomaly Detection

In [None]:
def build_clv_prediction_model(df_customers, X_raw):
    """Build a Random Forest model for Customer Lifetime Value (CLV) prediction.

    ``total_spent`` is used as the CLV target. Only customers with at least
    one purchase are used for training and scoring; everyone else receives a
    predicted CLV of 0.

    Args:
        df_customers: Customer frame containing ``total_spent`` and
            ``total_purchases``. When a model is trained, a ``predicted_clv``
            column is written to this frame in place.
        X_raw: Unscaled feature frame whose rows align with ``df_customers``.

    Returns:
        Tuple ``(model, metrics, feature_importance)`` where ``metrics`` is a
        dict with ``mse`` and ``r2``. Returns ``(None, None, None)`` when
        fewer than 50 customers have a purchase.
    """

    # Prepare target variable (CLV): historical spend is the proxy.
    y = df_customers['total_spent'].values

    # Filter customers with sufficient purchase history
    purchase_mask = df_customers['total_purchases'] > 0

    # Target leakage guard: 'total_spent' is the target itself and
    # 'monetary_score' is a direct copy of it, so leaving either one in the
    # feature matrix lets the model read the answer and inflates R².
    # NOTE(review): other spend-derived features (e.g. avg_transaction_value)
    # may still be strongly tied to the target - confirm the intended
    # feature set with the modelling owner.
    leaky_cols = [c for c in ('total_spent', 'monetary_score') if c in X_raw.columns]
    X_clv = X_raw.loc[purchase_mask].drop(columns=leaky_cols)
    y_clv = y[purchase_mask.to_numpy()]

    if len(X_clv) < 50:
        print("Insufficient customers with purchase history for CLV modeling")
        return None, None, None

    # Split data for training and testing
    X_train, X_test, y_train, y_test = train_test_split(
        X_clv, y_clv, test_size=0.2, random_state=42
    )

    # Train Random Forest model
    rf_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42
    )
    rf_model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = rf_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"CLV Prediction Model Performance:")
    print(f"MSE: {mse:.2f}")
    print(f"R² Score: {r2:.3f}")
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X_clv.columns,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)

    # "\n" rather than "\\n" so a real blank line is printed.
    print(f"\nTop 10 Most Important Features:")
    print(feature_importance.head(10).to_string(index=False))

    # Score every customer with purchase history; the rest default to 0.
    clv_predictions = rf_model.predict(X_clv)
    df_customers.loc[purchase_mask, 'predicted_clv'] = clv_predictions
    df_customers['predicted_clv'] = df_customers['predicted_clv'].fillna(0)

    return rf_model, {'mse': mse, 'r2': r2}, feature_importance

# Build CLV prediction model
clv_model, clv_metrics, clv_feature_importance = build_clv_prediction_model(df_customers, X_raw)

# The builder returns None for the model when it skips training; in that case
# the predicted_clv column does not exist, so only summarise on success.
if clv_model is not None:
    # "\n" rather than "\\n" so a real blank line is printed.
    print(f"\nCLV Predictions Summary:")
    print(f"Mean predicted CLV: ₱{df_customers['predicted_clv'].mean():.2f}")
    print(f"Max predicted CLV: ₱{df_customers['predicted_clv'].max():.2f}")
    print(f"Customers with CLV > ₱1000: {(df_customers['predicted_clv'] > 1000).sum()}")

## 11. CRISP-DM Phase 4: Modeling - CLV Prediction

In [None]:
def build_customer_segmentation_model(X_scaled, n_clusters=5, feature_frame=None):
    """Build a K-Means customer segmentation model.

    Fits K-Means for k = 2..7, keeps the k with the highest silhouette score,
    refits at that k and profiles each cluster on the unscaled features.

    Args:
        X_scaled: Scaled feature matrix used for clustering.
        n_clusters: Kept for backward compatibility; the number of clusters is
            chosen by silhouette score, so this value is not used.
        feature_frame: Unscaled feature DataFrame, row-aligned with
            ``X_scaled``, used to compute per-cluster averages. Defaults to
            the module-level ``X_raw``.

    Returns:
        Tuple ``(kmeans_model, cluster_labels, cluster_stats)``. ``cluster_stats``
        has one row per cluster, sorted by ``avg_spent`` descending, with a
        ``cluster_name`` column.
    """

    # Score each candidate k by silhouette.
    silhouette_scores = []
    k_range = range(2, 8)

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X_scaled)
        silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

    # Select best k based on silhouette score
    best_k = k_range[int(np.argmax(silhouette_scores))]
    print(f"Optimal number of clusters: {best_k} (silhouette score: {max(silhouette_scores):.3f})")

    # Train final model
    kmeans_model = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    cluster_labels = kmeans_model.fit_predict(X_scaled)

    # Profile clusters on the unscaled features so the averages are readable.
    profile_source = X_raw if feature_frame is None else feature_frame
    cluster_rows = []
    for i in range(best_k):
        cluster_mask = cluster_labels == i
        cluster_data = profile_source[cluster_mask]
        cluster_rows.append({
            'cluster': i,
            'count': cluster_mask.sum(),
            'avg_interactions': cluster_data['total_interactions'].mean(),
            'avg_spent': cluster_data['total_spent'].mean(),
            'avg_stores': cluster_data['stores_visited'].mean(),
            'avg_lifetime_days': cluster_data['customer_lifetime_days'].mean(),
            'conversion_rate': cluster_data['purchase_conversion_rate'].mean()
        })
    cluster_stats = pd.DataFrame(cluster_rows)

    # Assign cluster names by spend rank (highest average spend first).
    cluster_stats = cluster_stats.sort_values('avg_spent', ascending=False).reset_index(drop=True)
    cluster_names = ['VIP_High_Value', 'Premium_Loyal', 'Regular_Active', 'Casual_Shoppers', 'New_Customers']
    # best_k can be 6 or 7, which is more than the five curated names; pad with
    # generic labels so the column assignment never fails on a length mismatch.
    cluster_names += [
        f'Segment_{rank + 1}' for rank in range(len(cluster_names), len(cluster_stats))
    ]
    cluster_stats['cluster_name'] = cluster_names[:len(cluster_stats)]

    # "\n" rather than "\\n" so a real blank line is printed.
    print("\nCluster Analysis:")
    print(cluster_stats.to_string(index=False))

    return kmeans_model, cluster_labels, cluster_stats

# Build segmentation model
segmentation_model, customer_clusters, cluster_analysis = build_customer_segmentation_model(X_scaled.values)

# Add cluster labels to customer data, then translate each numeric label to
# the name assigned in the cluster analysis table.
segment_name_by_cluster = dict(zip(cluster_analysis['cluster'], cluster_analysis['cluster_name']))
df_customers['customer_segment'] = customer_clusters
df_customers['segment_name'] = df_customers['customer_segment'].map(segment_name_by_cluster)

# "\n" rather than "\\n" so a real blank line is printed.
print(f"\nCustomer segment distribution:")
print(df_customers['segment_name'].value_counts())

## 10. CRISP-DM Phase 4: Modeling - Customer Segmentation

In [None]:
def prepare_ml_features(df_customers):
    """Derive the modelling feature matrix and a standardised copy of it.

    Works on a copy of ``df_customers``: zero-fills numeric gaps, adds
    RFM-style and engagement columns, selects the model features and scales
    them with a freshly fitted ``StandardScaler``.

    Returns:
        Tuple ``(X, X_scaled_df, scaler, feature_cols)``: the unscaled
        features, the scaled features as a DataFrame on the same index, the
        fitted scaler and the ordered list of feature names.
    """
    # Columns handed to the models, in output order.
    feature_cols = [
        'total_interactions', 'stores_visited', 'customer_lifetime_days',
        'total_purchases', 'total_spent', 'avg_transaction_value',
        'total_items_purchased', 'brands_purchased', 'dayparts_active',
        'avg_interactions_per_day', 'purchase_conversion_rate', 
        'avg_items_per_transaction', 'weekend_preference',
        'recency_score', 'frequency_score', 'monetary_score',
        'engagement_intensity', 'cross_store_mobility', 'brand_loyalty'
    ]

    working = df_customers.copy()

    # Zero-fill gaps in the numeric columns present at this point.
    numeric_names = working.select_dtypes(include=[np.number]).columns
    working[numeric_names] = working[numeric_names].fillna(0)

    # RFM-like metrics: days since last seen, interaction count, spend.
    days_since_seen = datetime.now() - pd.to_datetime(working['last_seen'])
    working['recency_score'] = days_since_seen.dt.days
    working['frequency_score'] = working['total_interactions']
    working['monetary_score'] = working['total_spent']

    # Engagement ratios; each denominator is floored at 1 to avoid dividing by zero.
    lifetime_floor = np.maximum(working['customer_lifetime_days'], 1)
    interaction_floor = np.maximum(working['total_interactions'], 1)
    purchase_floor = np.maximum(working['total_purchases'], 1)
    working['engagement_intensity'] = working['total_interactions'] / lifetime_floor
    working['cross_store_mobility'] = working['stores_visited'] / interaction_floor
    working['brand_loyalty'] = working['brands_purchased'] / purchase_floor

    # Keep only the model features and neutralise infinities.
    X = working[feature_cols].copy()
    X = X.replace([np.inf, -np.inf], 0)

    # Standardise and re-wrap as a DataFrame on the original index.
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(scaled_values, columns=feature_cols, index=X.index)

    print(f"Prepared ML features: {X_scaled_df.shape}")
    print(f"Feature summary:\n{X.describe()}")

    return X, X_scaled_df, scaler, feature_cols

# Prepare ML features
# Unpacks, in order: unscaled features, scaled features, the fitted scaler and
# the list of feature names.
X_raw, X_scaled, scaler, feature_columns = prepare_ml_features(df_customers)

# Data quality validation
# Sanity checks on the unscaled matrix: remaining NaNs, remaining infinities
# (prepare_ml_features replaces +/-inf with 0) and the correlation matrix size.
print(f"\nData Quality Check:")
print(f"Missing values: {X_raw.isnull().sum().sum()}")
print(f"Infinite values: {np.isinf(X_raw.values).sum()}")
print(f"Feature correlation matrix shape: {X_raw.corr().shape}")

## 9. CRISP-DM Phase 2: Data Understanding & Phase 3: Data Preparation

In [None]:
def extract_customer_features():
    """Load per-customer aggregates from SQL and append derived ratio columns.

    The query groups dbo.SalesInteractions by FacialID, left-joins
    gold.v_transactions_flat for purchase metrics and keeps customers with at
    least 5 distinct interactions, ordered by interaction count descending.

    Returns:
        DataFrame with one row per FacialID: the SQL aggregates followed by
        avg_interactions_per_day, purchase_conversion_rate,
        avg_items_per_transaction, weekend_preference and clv_estimate.
    """
    # Per-FacialID aggregation of interactions and matched transactions.
    customer_query = '''
        SELECT 
            si.FacialID,
            COUNT(DISTINCT si.InteractionID) as total_interactions,
            COUNT(DISTINCT si.StoreID) as stores_visited,
            COUNT(DISTINCT CAST(si.TransactionDate AS date)) as active_days,
            MIN(si.TransactionDate) as first_seen,
            MAX(si.TransactionDate) as last_seen,
            DATEDIFF(day, MIN(si.TransactionDate), MAX(si.TransactionDate)) as customer_lifetime_days,
            
            -- Purchase behavior (from matched transactions)
            COUNT(DISTINCT flat.CanonicalTxID) as total_purchases,
            ISNULL(SUM(flat.Amount), 0) as total_spent,
            ISNULL(AVG(flat.Amount), 0) as avg_transaction_value,
            ISNULL(MAX(flat.Amount), 0) as max_transaction_value,
            ISNULL(SUM(flat.Basket_Item_Count), 0) as total_items_purchased,
            
            -- Behavioral patterns
            COUNT(DISTINCT flat.brand) as brands_purchased,
            COUNT(DISTINCT flat.daypart) as dayparts_active,
            COUNT(CASE WHEN flat.weekday_weekend = 'weekend' THEN 1 END) as weekend_purchases,
            COUNT(CASE WHEN flat.weekday_weekend = 'weekday' THEN 1 END) as weekday_purchases
            
        FROM dbo.SalesInteractions si
        LEFT JOIN gold.v_transactions_flat flat ON si.InteractionID = flat.CanonicalTxID
        WHERE si.FacialID IS NOT NULL
        GROUP BY si.FacialID
        HAVING COUNT(DISTINCT si.InteractionID) >= 5  -- Minimum activity threshold
        ORDER BY total_interactions DESC
    '''

    with pyodbc.connect(conn_str) as connection:
        customers = pd.read_sql(customer_query, connection)

    # Shorthand for the columns reused by the derived features below.
    interactions = customers['total_interactions']
    lifetime_days = customers['customer_lifetime_days']
    purchases = customers['total_purchases']
    weekend = customers['weekend_purchases']
    weekday = customers['weekday_purchases']

    # Lifetime days and purchase counts are floored at 1 before dividing.
    customers['avg_interactions_per_day'] = interactions / np.maximum(lifetime_days, 1)
    customers['purchase_conversion_rate'] = purchases / interactions
    customers['avg_items_per_transaction'] = customers['total_items_purchased'] / np.maximum(purchases, 1)
    # This denominator is not floored, so 0 weekend + 0 weekday purchases divides 0 by 0.
    customers['weekend_preference'] = weekend / (weekend + weekday)
    # Simple CLV proxy: spend scaled by lifetime in 30-day units and a 0.1 factor.
    customers['clv_estimate'] = customers['total_spent'] * (lifetime_days / 30) * 0.1

    print(f"Extracted features for {len(customers):,} customers")
    print(f"Feature columns: {list(customers.columns)}")

    return customers

# Extract customer features
# The query orders rows by total_interactions descending, so head() below
# shows the most active customers.
df_customers = extract_customer_features()
print(f"\nCustomer feature dataset shape: {df_customers.shape}")
print(f"Top customers by interactions:\n{df_customers[['FacialID', 'total_interactions', 'stores_visited', 'total_spent']].head()}")

# Scout v7 AI-Enhanced ETL Pipeline

This notebook implements a CRISP-DM-based ETL pipeline for Scout transaction data, with AI/ML components, JSON safety checks, and canonical transaction-ID joins.

## Features:
- **CRISP-DM Methodology**: 6-phase data science workflow
- **AI/ML Components**: Customer segmentation, CLV prediction, anomaly detection
- **JSON Safety**: Malformed payload handling (ISJSON guards)
- **Canonical Joins**: Transaction ID joins with timestamp authority
- **Export Pipeline**: Clean CSV formats with ML-ready features
- **Data Quality**: Comprehensive validation and monitoring
- **Real-time Scoring**: Production-ready ML inference

## 1. Environment Setup

In [None]:
import os
import pandas as pd
import pyodbc
import json
from datetime import datetime, timedelta
import warnings
import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

# Database connection parameters
# Values come from the environment so credentials are not stored in the
# notebook. SERVER / DATABASE / USERNAME fall back to the project defaults;
# the password has no fallback and must be supplied via SCOUT_DB_PASSWORD.
SERVER = os.environ.get('SCOUT_DB_SERVER', 'sqltbwaprojectscoutserver.database.windows.net')
DATABASE = os.environ.get('SCOUT_DB_DATABASE', 'SQL-TBWA-ProjectScout-Reporting-Prod')
USERNAME = os.environ.get('SCOUT_DB_USERNAME', 'sqladmin')
PASSWORD = os.environ.get('SCOUT_DB_PASSWORD', '')

if not PASSWORD:
    # print() rather than warnings.warn(): this notebook silences all warnings.
    print("WARNING: SCOUT_DB_PASSWORD is not set; database connections will fail until it is provided.")

# Connection string
conn_str = f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={SERVER};DATABASE={DATABASE};UID={USERNAME};PWD={PASSWORD}'

# Only the server and database are echoed; the password is never printed.
print(f"Connecting to: {SERVER}/{DATABASE}")
print(f"Timestamp: {datetime.now()}")
print("AI/ML Components: Customer Segmentation, CLV Prediction, Anomaly Detection")

## 2. Data Quality Health Check

In [None]:
def run_health_check():
    """Execute each data quality query, print its result and collect it.

    Returns:
        dict keyed by check name ('json_health', 'timestamp_coverage',
        'view_row_counts') whose values are the DataFrames returned by the
        corresponding queries.
    """
    # One SQL probe per check; they run in the order listed.
    health_queries = {
        'json_health': '''
            SELECT
              'JSON_HEALTH' as check_type,
              SUM(CASE WHEN ISJSON(payload_json)=1 THEN 1 ELSE 0 END) AS good_json,
              SUM(CASE WHEN ISJSON(payload_json)=0 THEN 1 ELSE 0 END) AS bad_json,
              COUNT(*) as total_payloads
            FROM dbo.PayloadTransactions
        ''',

        'timestamp_coverage': '''
            SELECT
              'TIMESTAMP_COVERAGE' as check_type,
              COUNT(*) AS flat_rows,
              SUM(CASE WHEN txn_ts IS NOT NULL THEN 1 ELSE 0 END) AS with_ts,
              CAST(100.0 * SUM(CASE WHEN txn_ts IS NOT NULL THEN 1 ELSE 0 END) / COUNT(*) AS decimal(5,2)) AS coverage_pct
            FROM dbo.v_transactions_flat_production
        ''',

        'view_row_counts': '''
            SELECT 'gold.v_transactions_flat' as view_name, COUNT(*) as row_count FROM gold.v_transactions_flat
            UNION ALL
            SELECT 'gold.v_transactions_crosstab', COUNT(*) FROM gold.v_transactions_crosstab
            UNION ALL  
            SELECT 'gold.v_transactions_flat_v24', COUNT(*) FROM gold.v_transactions_flat_v24
        '''
    }

    collected = {}

    # A single connection is shared by all probes.
    with pyodbc.connect(conn_str) as connection:
        for label in health_queries:
            frame = pd.read_sql(health_queries[label], connection)
            collected[label] = frame
            heading = label.upper()
            print(f"\n{heading}:")
            print(frame.to_string(index=False))

    return collected

# Run health check
# The returned dict maps each check name to the DataFrame its query produced.
health_results = run_health_check()

## 3. Data Extraction Functions

In [None]:
def extract_flat_transactions(days_back=30, limit=None):
    """Extract flat transaction data with optional filtering.

    Args:
        days_back: Only rows whose transaction_date falls within this many
            days of the current UTC date are returned. Falsy values (None, 0)
            disable the date filter.
        limit: Maximum number of rows, taken in transaction_date / Txn_TS
            descending order. Falsy values (None, 0) return every row.

    Returns:
        DataFrame of rows from gold.v_transactions_flat with a non-null Txn_TS.

    Raises:
        ValueError / TypeError: if ``days_back`` or ``limit`` cannot be
            converted to an integer.
    """

    # TOP belongs on the SELECT that carries the ORDER BY. Wrapping the ordered
    # query as "SELECT TOP n * FROM (<query> ORDER BY ...) t" is rejected by
    # SQL Server, which does not allow ORDER BY in a derived table unless TOP
    # or OFFSET is also specified there.
    # int() ensures only a number is interpolated into the SQL text.
    row_cap = f"TOP ({int(limit)}) " if limit else ""

    base_query = f'''
        SELECT {row_cap}
            CanonicalTxID,
            TransactionID,
            DeviceID,
            StoreID,
            StoreName,
            brand,
            product_name,
            category,
            Amount,
            Basket_Item_Count,
            payment_method,
            audio_transcript,
            Txn_TS,
            daypart,
            weekday_weekend,
            transaction_date
        FROM gold.v_transactions_flat
        WHERE Txn_TS IS NOT NULL
    '''

    if days_back:
        base_query += f" AND transaction_date >= CONVERT(date, DATEADD(day,-{int(days_back)},SYSUTCDATETIME()))"

    base_query += " ORDER BY transaction_date DESC, Txn_TS DESC"

    with pyodbc.connect(conn_str) as conn:
        df = pd.read_sql(base_query, conn)

    print(f"Extracted {len(df):,} flat transactions")
    return df

def extract_crosstab_analytics(days_back=30):
    """Read rows from gold.v_transactions_crosstab, newest date first.

    Args:
        days_back: When truthy, keep only rows whose [date] is within this
            many days of the current UTC date; None or 0 returns all rows.

    Returns:
        DataFrame ordered by [date] descending, then total_amount descending.
    """
    select_sql = '''
        SELECT 
            [date],
            store_id,
            store_name,
            municipality_name,
            daypart,
            brand,
            txn_count,
            total_amount,
            avg_basket_amount,
            substitution_events
        FROM gold.v_transactions_crosstab
    '''

    # Assemble the statement from its clauses; the date filter is optional.
    clauses = [select_sql]
    if days_back:
        clauses.append(f" WHERE [date] >= CONVERT(date, DATEADD(day,-{days_back},SYSUTCDATETIME()))")
    clauses.append(" ORDER BY [date] DESC, total_amount DESC")
    statement = "".join(clauses)

    with pyodbc.connect(conn_str) as connection:
        crosstab = pd.read_sql(statement, connection)

    row_total = len(crosstab)
    print(f"Extracted {row_total:,} crosstab analytics rows")
    return crosstab

def extract_v24_compatibility():
    """Read every timestamped row of gold.v_transactions_flat_v24, newest first.

    Returns:
        DataFrame with all columns of the view, ordered by Txn_TS descending.
    """
    v24_sql = 'SELECT * FROM gold.v_transactions_flat_v24 WHERE Txn_TS IS NOT NULL ORDER BY Txn_TS DESC'

    with pyodbc.connect(conn_str) as connection:
        v24_frame = pd.read_sql(v24_sql, connection)

    row_total = len(v24_frame)
    print(f"Extracted {row_total:,} v24 compatibility rows")
    return v24_frame

# Test extraction functions
# Smoke test on a small window: the last 7 days, with the flat extract capped
# at 100 rows. The samples are reused by the analysis cell.
print("Testing extraction functions...")
sample_flat = extract_flat_transactions(days_back=7, limit=100)
sample_crosstab = extract_crosstab_analytics(days_back=7)
print(f"\nSample flat shape: {sample_flat.shape}")
print(f"Sample crosstab shape: {sample_crosstab.shape}")

## 4. Data Analysis & Insights

In [None]:
def analyze_transaction_patterns(df_flat, df_crosstab):
    """Analyze transaction patterns and generate insights.

    Args:
        df_flat: Flat transaction frame with the columns daypart, brand,
            StoreID, StoreName, payment_method, CanonicalTxID, Amount and
            transaction_date.
        df_crosstab: Accepted for interface compatibility; not used.

    Returns:
        dict with the keys:
            daypart_analysis: transaction count plus Amount sum/mean per daypart.
            top_brands: the 10 brands with the highest total Amount.
            store_performance: count plus Amount sum/mean per store.
            payment_methods: value counts of payment_method.
            date_range: min_date, max_date and total_days; total_days is None
                when the frame has no usable transaction dates.
    """

    insights = {}

    # Time-based patterns
    insights['daypart_analysis'] = df_flat.groupby('daypart').agg({
        'CanonicalTxID': 'count',
        'Amount': ['sum', 'mean']
    }).round(2)

    # Brand performance
    insights['top_brands'] = df_flat.groupby('brand').agg({
        'CanonicalTxID': 'count',
        'Amount': 'sum'
    }).sort_values('Amount', ascending=False).head(10)

    # Store performance
    insights['store_performance'] = df_flat.groupby(['StoreID', 'StoreName']).agg({
        'CanonicalTxID': 'count',
        'Amount': ['sum', 'mean']
    }).round(2)

    # Payment method distribution
    insights['payment_methods'] = df_flat['payment_method'].value_counts()

    # Date range
    first_date = df_flat['transaction_date'].min()
    last_date = df_flat['transaction_date'].max()
    # With no rows (or only null dates) min/max are NaN/NaT and their
    # difference is not a usable timedelta, so report the span as unknown
    # instead of calling .days on it.
    if pd.isna(first_date) or pd.isna(last_date):
        total_days = None
    else:
        total_days = (last_date - first_date).days

    insights['date_range'] = {
        'min_date': first_date,
        'max_date': last_date,
        'total_days': total_days
    }

    return insights

# Generate insights
# Runs the analysis on the 7-day samples extracted by the smoke test.
print("Generating transaction insights...")
insights = analyze_transaction_patterns(sample_flat, sample_crosstab)

print("\nDAYPART ANALYSIS:")
print(insights['daypart_analysis'])

# top_brands already holds at most 10 rows; head() shows the first 5.
print("\nTOP BRANDS:")
print(insights['top_brands'].head())

print("\nDATE RANGE:")
print(insights['date_range'])

## 5. Export Functions

In [None]:
def export_to_csv(df, filename, export_dir='../exports'):
    """Export a dataframe to a timestamped CSV file.

    Args:
        df: DataFrame to write (the index is not written).
        filename: Target file name; a trailing ".csv" is optional.
        export_dir: Directory to write into; created if missing.

    Returns:
        Path of the written file, ``<export_dir>/<name>_<YYYYmmdd_HHMMSS>.csv``.
    """

    # Create export directory if it doesn't exist
    os.makedirs(export_dir, exist_ok=True)

    # Add timestamp to filename
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    # Strip only a trailing ".csv": str.replace would also delete ".csv"
    # appearing in the middle of the name.
    extension = '.csv'
    base_name = filename[:-len(extension)] if filename.endswith(extension) else filename
    timestamped_filename = f"{base_name}_{timestamp}.csv"

    filepath = os.path.join(export_dir, timestamped_filename)

    # Export to CSV
    df.to_csv(filepath, index=False)

    print(f"Exported {len(df):,} rows to {filepath}")
    print(f"File size: {os.path.getsize(filepath) / 1024 / 1024:.2f} MB")

    return filepath

def run_full_export_pipeline():
    """Extract all three datasets without date filters and write them to CSV.

    Returns:
        dict with 'export_timestamp' (ISO string), 'files_created' (paths in
        flat, crosstab, v24 order) and 'row_counts' per dataset.
    """
    print("Starting full export pipeline...")

    # Pull each dataset with the date window disabled (all data).
    print("\n1. Extracting flat transactions...")
    flat_frame = extract_flat_transactions(days_back=None)

    print("\n2. Extracting crosstab analytics...")
    crosstab_frame = extract_crosstab_analytics(days_back=None)

    print("\n3. Extracting v24 compatibility...")
    v24_frame = extract_v24_compatibility()

    # Write the frames in a fixed order so 'files_created' is predictable.
    print("\n4. Exporting to CSV...")
    export_plan = [
        (flat_frame, 'scout_flat_enriched_complete.csv'),
        (crosstab_frame, 'scout_crosstab_enriched_complete.csv'),
        (v24_frame, 'scout_v24_compatibility_complete.csv'),
    ]
    written_files = [export_to_csv(frame, target) for frame, target in export_plan]

    # Describe what was produced.
    summary = {
        'export_timestamp': datetime.now().isoformat(),
        'files_created': written_files,
        'row_counts': {
            'flat': len(flat_frame),
            'crosstab': len(crosstab_frame),
            'v24': len(v24_frame)
        }
    }

    print("\n5. Export Summary:")
    print(json.dumps(summary, indent=2))

    return summary

# Run sample export
# Extracts the last 7 days with no row cap and writes both frames to
# timestamped CSV files in the default export directory.
print("Running sample export (last 7 days)...")
sample_flat_7d = extract_flat_transactions(days_back=7)
sample_crosstab_7d = extract_crosstab_analytics(days_back=7)

export_to_csv(sample_flat_7d, 'scout_flat_sample_7d.csv')
export_to_csv(sample_crosstab_7d, 'scout_crosstab_sample_7d.csv')

## 6. Automated ETL Pipeline

In [None]:
def run_etl_pipeline(export_full=False):
    """Run health checks, then either the full export or a 30-day sample export.

    Args:
        export_full: True runs run_full_export_pipeline(); False exports a
            sample (last 30 days, flat data capped at 1000 rows).

    Returns:
        On success: dict with 'status' = 'success', 'duration_seconds',
        'summary' and 'health_checks'. On any exception: dict with
        'status' = 'failed' and the error text; the exception is not re-raised.
    """
    started_at = datetime.now()
    print(f"ETL Pipeline started at: {started_at}")
    print("=" * 50)

    try:
        # Step 1: data quality probes.
        print("\nSTEP 1: Running health checks...")
        health = run_health_check()

        # Step 2: extraction and export.
        print("\nSTEP 2: Extracting data...")
        if not export_full:
            # Sample mode, intended for testing.
            sample_flat_frame = extract_flat_transactions(days_back=30, limit=1000)
            sample_crosstab_frame = extract_crosstab_analytics(days_back=30)

            export_to_csv(sample_flat_frame, 'scout_flat_sample.csv')
            export_to_csv(sample_crosstab_frame, 'scout_crosstab_sample.csv')

            summary = {
                'export_timestamp': datetime.now().isoformat(),
                'mode': 'sample',
                'row_counts': {
                    'flat': len(sample_flat_frame),
                    'crosstab': len(sample_crosstab_frame)
                }
            }
        else:
            summary = run_full_export_pipeline()

        # Step 3: report timing.
        finished_at = datetime.now()
        elapsed = (finished_at - started_at).total_seconds()

        print("\nSTEP 3: Pipeline completed successfully!")
        print(f"Duration: {elapsed:.2f} seconds")
        print(f"End time: {finished_at}")

        outcome = {
            'status': 'success',
            'duration_seconds': elapsed,
            'summary': summary,
            'health_checks': health
        }
        return outcome

    except Exception as exc:
        # Boundary handler: report the failure to the caller instead of raising.
        message = str(exc)
        print(f"\nERROR: Pipeline failed with error: {message}")
        return {
            'status': 'failed',
            'error': message
        }

# Run ETL pipeline (sample mode)
result = run_etl_pipeline(export_full=False)
print("\nPipeline Result:")
# default=str stringifies values json cannot encode, such as the health-check
# DataFrames carried in a successful result.
print(json.dumps(result, indent=2, default=str))

## 7. Production Export (Uncomment to run full export)

In [None]:
# Uncomment the line below to run full production export
# production_result = run_etl_pipeline(export_full=True)
# print(json.dumps(production_result, indent=2, default=str))

print("To run full production export, uncomment the lines above and execute this cell.")
print("This will export ALL transaction data to timestamped CSV files.")

## 15. Enhanced Excel & CSV Export Functions

## 8. CRISP-DM Phase 1: Business Understanding

### Customer Analytics Requirements
- **Customer Segmentation**: Identify customer personas and behavior patterns
- **Customer Lifetime Value (CLV)**: Predict future value and optimize acquisition
- **Anomaly Detection**: Identify unusual patterns and potential fraud
- **Real-time Scoring**: Enable production ML inference for new transactions