In [0]:
# Notebook banner: the title line followed by a 60-character rule.
banner_title = "📊 DAY 5: BUSINESS DASHBOARD & INSIGHTS"
banner_rule = "=" * 60
print(banner_title)
print(banner_rule)

📊 DAY 5: BUSINESS DASHBOARD & INSIGHTS


## Set the correct catalog

In [0]:
# Switch the Spark session to the project catalog before running any query.
# spark.sql raises if the statement fails, so the confirmation below is only
# printed after the catalog switch succeeded.
catalog_name = "brazil_project"
spark.sql(f"USE CATALOG {catalog_name}")
print(f"✅ Using {catalog_name} catalog")

✅ Using brazil_project catalog


### Churn Overview


In [0]:
# Section 1: how customers and predicted churn are spread across risk tiers.
print("\n1️⃣ CHURN OVERVIEW DASHBOARD")

# One output row per risk_level:
#   customers            - rows in that tier
#   avg_churn_prob       - mean churn_probability, 3 decimals
#   predicted_churn_rate - % of the tier with churn_prediction = 1
#   percentage_of_total  - tier size as a % of all rows in the predictions table
# The per-tier aggregates are computed once in a CTE; the grand total comes
# from a window SUM over those tier counts. Rows are ordered High, Medium, Low.
churn_overview_sql = """
WITH tier_counts AS (
    SELECT
        risk_level,
        COUNT(*) AS customers,
        AVG(churn_probability) AS mean_prob,
        SUM(CASE WHEN churn_prediction = 1 THEN 1 ELSE 0 END) AS predicted_churners
    FROM ml.customer_churn_predictions
    GROUP BY risk_level
)
SELECT
    risk_level,
    customers,
    ROUND(mean_prob, 3) AS avg_churn_prob,
    ROUND(predicted_churners * 100.0 / customers, 1) AS predicted_churn_rate,
    ROUND(customers * 100.0 / SUM(customers) OVER (), 1) AS percentage_of_total
FROM tier_counts
ORDER BY
    CASE risk_level
        WHEN 'High' THEN 1
        WHEN 'Medium' THEN 2
        WHEN 'Low' THEN 3
    END
"""
churn_summary = spark.sql(churn_overview_sql)

print("📊 Risk Level Distribution:")
churn_summary.show()



1️⃣ CHURN OVERVIEW DASHBOARD
📊 Risk Level Distribution:
+----------+---------+--------------+--------------------+-------------------+
|risk_level|customers|avg_churn_prob|predicted_churn_rate|percentage_of_total|
+----------+---------+--------------+--------------------+-------------------+
|      High|    46910|         0.651|               100.0|               47.5|
|    Medium|    29692|         0.349|                77.1|               30.1|
|       Low|    22064|         0.053|                 0.0|               22.4|
+----------+---------+--------------+--------------------+-------------------+



### HIGH-RISK CUSTOMER ANALYSIS BY STATE

In [0]:
# Section 2: where the high-risk customers are located.
print("\n2️⃣ HIGH-RISK CUSTOMER ANALYSIS BY STATE")

# The CTE isolates customers whose prediction row has risk_level = 'High',
# joined to their feature row on customer_id. The outer query aggregates per
# state and keeps the 10 states with the most high-risk customers.
# state_share_percent divides by the window total of all state counts, which
# is evaluated before LIMIT, so it is a share of every high-risk customer
# rather than a share of the ten rows shown.
state_breakdown_sql = """
WITH high_risk AS (
    SELECT
        c.customer_state,
        c.total_spent,
        c.avg_review_score,
        p.churn_probability
    FROM gold.ml_customer_features c
    JOIN ml.customer_churn_predictions p
        ON c.customer_id = p.customer_id
    WHERE p.risk_level = 'High'
)
SELECT
    customer_state,
    COUNT(*) AS high_risk_customers,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 1) AS state_share_percent,
    ROUND(AVG(total_spent), 2) AS avg_customer_value,
    ROUND(AVG(avg_review_score), 2) AS avg_satisfaction,
    ROUND(AVG(churn_probability), 3) AS avg_churn_prob
FROM high_risk
GROUP BY customer_state
ORDER BY high_risk_customers DESC
LIMIT 10
"""
high_risk_insights = spark.sql(state_breakdown_sql)

print("📍 Top 10 States with High-Risk Customers:")
high_risk_insights.show()



2️⃣ HIGH-RISK CUSTOMER ANALYSIS BY STATE
📍 Top 10 States with High-Risk Customers:
+--------------+-------------------+-------------------+------------------+----------------+--------------+
|customer_state|high_risk_customers|state_share_percent|avg_customer_value|avg_satisfaction|avg_churn_prob|
+--------------+-------------------+-------------------+------------------+----------------+--------------+
|            SP|              21146|               45.1|             43.56|            4.22|         0.653|
|            RJ|               5683|               12.1|             45.36|            3.91|         0.647|
|            MG|               5321|               11.3|             45.14|            4.15|         0.648|
|            RS|               2545|                5.4|             45.46|            4.19|         0.646|
|            PR|               2419|                5.2|             43.73|            4.26|         0.652|
|            SC|               1644|                

### CUSTOMER SEGMENT ANALYSIS

In [0]:
# Section 3: which (value_segment, customer_segment) pairs carry the most risk.
print("\n3️⃣ CUSTOMER SEGMENT ANALYSIS")

# The CTE joins features to predictions on customer_id and tags each customer
# with a 0/1 high-risk flag, so the outer query can sum the flag instead of
# repeating the CASE expression. Output is sorted with the riskiest segment
# (highest high_risk_percent) first.
segment_risk_sql = """
WITH scored AS (
    SELECT
        c.value_segment,
        c.customer_segment,
        c.total_spent,
        p.churn_probability,
        CASE WHEN p.risk_level = 'High' THEN 1 ELSE 0 END AS is_high_risk
    FROM gold.ml_customer_features c
    JOIN ml.customer_churn_predictions p
        ON c.customer_id = p.customer_id
)
SELECT
    value_segment,
    customer_segment,
    COUNT(*) AS total_customers,
    SUM(is_high_risk) AS high_risk_count,
    ROUND(SUM(is_high_risk) * 100.0 / COUNT(*), 1) AS high_risk_percent,
    ROUND(AVG(churn_probability), 3) AS avg_churn_prob,
    ROUND(AVG(total_spent), 2) AS avg_customer_value
FROM scored
GROUP BY value_segment, customer_segment
ORDER BY high_risk_percent DESC
"""
segment_analysis = spark.sql(segment_risk_sql)

print("👥 Which Segments are Most at Risk?:")
segment_analysis.show()


3️⃣ CUSTOMER SEGMENT ANALYSIS
👥 Which Segments are Most at Risk?:
+--------------+----------------+---------------+---------------+-----------------+--------------+------------------+
| value_segment|customer_segment|total_customers|high_risk_count|high_risk_percent|avg_churn_prob|avg_customer_value|
+--------------+----------------+---------------+---------------+-----------------+--------------+------------------+
|Very Low Value|        Very New|          32641|          30996|             95.0|         0.673|             37.54|
|     Low Value|        Very New|          29028|          15914|             54.8|         0.493|             86.22|
|    High Value|        Very New|          12590|              0|              0.0|         0.033|            475.55|
|  Medium Value|        Very New|          24407|              0|              0.0|          0.22|            158.83|
+--------------+----------------+---------------+---------------+-----------------+--------------+---------

### BUSINESS IMPACT & ROI CALCULATION

In [0]:
# Section 4: business case for a retention campaign aimed at high-risk customers.
print("\n4️⃣ BUSINESS IMPACT & ROI CALCULATION")

# Campaign assumptions. They feed both the arithmetic and the row labels, so
# the text shown in the table cannot drift away from the numbers behind it.
DISCOUNT_PER_CUSTOMER = 10  # R$ offered to every high-risk customer
RETENTION_RATE = 0.25       # assumed share of at-risk revenue that is retained

retention_pct = f"{RETENTION_RATE:.0%}"  # renders as "25%" for the default rate

# Query layout:
#   high_risk - total_spent of every customer whose prediction is risk_level 'High'
#   totals    - the three aggregates every metric needs, computed in one pass
#               (previously each UNION ALL branch re-aggregated the join)
#   report    - one labelled row per metric plus an explicit sort_key, so the
#               display order no longer depends on repeating each label
# NULLIF guards the ROI division: with no high-risk customers (or a zero
# discount) the multiplier is NULL instead of a divide-by-zero error when
# ANSI mode is enabled.
# NOTE(review): "revenue at risk" is historical total_spent and the retained
# value does not weight by churn_probability - confirm this is the intended
# business definition.
roi_sql = f"""
WITH high_risk AS (
    SELECT c.total_spent
    FROM gold.ml_customer_features c
    JOIN ml.customer_churn_predictions p ON c.customer_id = p.customer_id
    WHERE p.risk_level = 'High'
),
totals AS (
    SELECT
        COUNT(*) AS n_customers,
        SUM(total_spent) AS revenue,
        AVG(total_spent) AS avg_value
    FROM high_risk
),
report AS (
    SELECT
        1 AS sort_key,
        'Business Impact Summary' AS metric,
        n_customers AS value,
        'High-risk customers identified' AS description
    FROM totals

    UNION ALL

    SELECT
        2,
        'Total Revenue at Risk',
        ROUND(revenue, 2),
        'R$ value from high-risk customers'
    FROM totals

    UNION ALL

    SELECT
        3,
        'Average Customer Value',
        ROUND(avg_value, 2),
        'R$ per high-risk customer'
    FROM totals

    UNION ALL

    SELECT
        4,
        'Campaign Cost (R${DISCOUNT_PER_CUSTOMER} each)',
        n_customers * {DISCOUNT_PER_CUSTOMER},
        'Cost to offer R${DISCOUNT_PER_CUSTOMER} discount to all'
    FROM totals

    UNION ALL

    SELECT
        5,
        'Expected Retained Value ({retention_pct} retention)',
        ROUND(revenue * {RETENTION_RATE}, 2),
        'Assuming {retention_pct} retention success rate'
    FROM totals

    UNION ALL

    SELECT
        6,
        'ROI Multiplier',
        ROUND((revenue * {RETENTION_RATE}) / NULLIF(n_customers * {DISCOUNT_PER_CUSTOMER}, 0), 2),
        'For every R$1 spent, get R$X back'
    FROM totals
)
SELECT metric, value, description
FROM report
ORDER BY sort_key
"""
roi_calculation = spark.sql(roi_sql)

print("💰 Retention Campaign Business Case:")
roi_calculation.show(truncate=False)


4️⃣ BUSINESS IMPACT & ROI CALCULATION
💰 Retention Campaign Business Case:
+---------------------------------------+----------+-----------------------------------+
|metric                                 |value     |description                        |
+---------------------------------------+----------+-----------------------------------+
|Business Impact Summary                |46910.0   |High-risk customers identified     |
|Total Revenue at Risk                  |2076740.44|R$ value from high-risk customers  |
|Average Customer Value                 |44.27     |R$ per high-risk customer          |
|Campaign Cost (R$10 each)              |469100.0  |Cost to offer R$10 discount to all |
|Expected Retained Value (25% retention)|519185.11 |Assuming 25% retention success rate|
|ROI Multiplier                         |1.11      |For every R$1 spent, get R$X back  |
+---------------------------------------+----------+-----------------------------------+



In [0]:
# Section 5: static recap of model metrics and project status.
print("\n5️⃣ MODEL PERFORMANCE SUMMARY")

# Every cell in this table is a hard-coded string literal; nothing is read
# from a table or a tracking server here. The rows are expressed as one inline
# VALUES table whose alias supplies the five column names.
summary_sql = """
SELECT category, model, f1_score, accuracy, roc_auc
FROM VALUES
    ('Model Performance', 'Decision Tree', '0.390', '0.620', '0.748'),
    ('Model Performance', 'Logistic Regression', '0.385', '0.606', '0.753'),
    ('Model Performance', 'Random Forest', '0.376', '0.572', '0.751'),
    ('Data Statistics', 'Total Customers', '98,666', '15.0% churn rate', '3-model comparison'),
    ('Business Readiness', 'MLflow Tracking', 'Experiment logged', 'Predictions saved', 'Dashboard ready')
    AS summary(category, model, f1_score, accuracy, roc_auc)
"""
model_summary = spark.sql(summary_sql)

print("📈 Project Success Metrics:")
model_summary.show(truncate=False)


5️⃣ MODEL PERFORMANCE SUMMARY
📈 Project Success Metrics:
+------------------+-------------------+-----------------+-----------------+------------------+
|category          |model              |f1_score         |accuracy         |roc_auc           |
+------------------+-------------------+-----------------+-----------------+------------------+
|Model Performance |Decision Tree      |0.390            |0.620            |0.748             |
|Model Performance |Logistic Regression|0.385            |0.606            |0.753             |
|Model Performance |Random Forest      |0.376            |0.572            |0.751             |
|Data Statistics   |Total Customers    |98,666           |15.0% churn rate |3-model comparison|
|Business Readiness|MLflow Tracking    |Experiment logged|Predictions saved|Dashboard ready   |
+------------------+-------------------+-----------------+-----------------+------------------+

