In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import *
import mlflow

In [0]:
# Start MLflow experiment for EDA tracking
mlflow.set_experiment("/Users/juan.lamadrid@databricks.com/experiments/insurance_cost_prediction_eda")

# Cut-offs used below to flag the high-risk cohort (smoker AND bmi above / age above).
HIGH_RISK_BMI_THRESHOLD = 30
HIGH_RISK_AGE_THRESHOLD = 50
# Quantile of `charges` reported as the "high cost" boundary.
HIGH_COST_QUANTILE = 0.95

with mlflow.start_run(run_name="healthcare_insurance_eda"):
    # Base data from silver_patients table (replaces juan_dev.ml.insurance_silver)
    silver_patients = spark.table("juan_dev.healthcare_data.silver_patients")

    # Dimensional tables for enriched features
    dim_patients = spark.table("juan_dev.healthcare_data.dim_patients")
    # Not referenced anywhere else in this cell; kept in case other cells rely on the name.
    dim_patients_summary = spark.table("juan_dev.healthcare_data.dim_patients_summary")

    # Enrich silver_patients with the current dim_patients record
    # (silver.patient_id == dim.patient_natural_key). The left join keeps every
    # silver row even when no dimension record matches.
    # NOTE(review): if dim_patients can hold more than one current record per
    # patient_natural_key, this join duplicates patients — confirm uniqueness.
    enriched_data = (
        silver_patients
        .join(
            dim_patients.filter(col("is_current_record") == True),
            silver_patients.patient_id == dim_patients.patient_natural_key,
            "left"
        )
        # Age is not derived here: the silver table's age_years column is used as-is.
        .select(
            silver_patients.patient_id,
            silver_patients.age_years.alias("age"),  # Map age_years → age
            silver_patients.sex_standardized.alias("sex"),  # Map sex_standardized → sex
            silver_patients.bmi_validated.alias("bmi"),  # Map bmi_validated → bmi
            silver_patients.children_count.alias("children"),  # Map children_count → children
            silver_patients.smoker_flag.alias("smoker"),  # Map smoker_flag → smoker
            silver_patients.region_standardized.alias("region"),  # Map region_standardized → region
            silver_patients.insurance_charges.alias("charges"),  # Map insurance_charges → charges
            # Additional demographic attributes from dim_patients
            dim_patients.patient_age_category,
            dim_patients.patient_bmi_category,
            dim_patients.health_risk_score,
            dim_patients.health_risk_category,
            dim_patients.demographic_segment,
            dim_patients.lifestyle_segment,
            dim_patients.patient_data_quality_score
        )
    )

    # Convert to Pandas for EDA (collects the whole joined dataset onto the driver)
    df = enriched_data.toPandas()

    # Every ratio below divides by len(df); fail with a clear message instead of
    # a ZeroDivisionError or a run full of NaN metrics.
    if df.empty:
        raise ValueError("Enriched patient dataset is empty; nothing to profile")

    # Healthcare-specific data profiling.
    # `missing_data_percentage` is a per-column Series, the other entries are scalars.
    eda_results = {
        "total_patients": len(df),
        "avg_age": df['age'].mean(),
        "smoker_percentage": (df['smoker'].sum() / len(df)) * 100,
        "high_cost_threshold": df['charges'].quantile(HIGH_COST_QUANTILE),
        "missing_data_percentage": (df.isnull().sum() / len(df)) * 100
    }

    # mlflow.log_metrics only accepts scalar values, so the per-column Series is
    # expanded into one metric per column before logging.
    eda_metrics = {}
    for metric_name, metric_value in eda_results.items():
        if isinstance(metric_value, pd.Series):
            for column_name, column_value in metric_value.items():
                eda_metrics[f"{metric_name}_{column_name}"] = float(column_value)
        else:
            eda_metrics[metric_name] = float(metric_value)
    # Undefined statistics (e.g. mean of an all-null column) are skipped rather than logged as NaN.
    eda_metrics = {name: value for name, value in eda_metrics.items() if pd.notna(value)}
    mlflow.log_metrics(eda_metrics)

    # Risk factor analysis using new column mapping
    risk_analysis = df.groupby(['smoker', 'patient_age_category']).agg({
        'charges': ['mean', 'median', 'std'],
        'bmi': 'mean'
    }).round(2)

    # Log visualizations. Figures are intentionally left open so the notebook
    # still renders them inline when the cell finishes.
    region_fig, region_ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(data=df, x='region', y='charges', hue='smoker', ax=region_ax)
    region_ax.set_title('Healthcare Costs by Region and Smoking Status')
    region_ax.tick_params(axis='x', rotation=45)
    mlflow.log_figure(region_fig, "cost_distribution_by_region_smoking.png")

    # Feature correlation analysis
    correlation_matrix = df[['age', 'bmi', 'children', 'charges']].corr()
    corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=corr_ax)
    corr_ax.set_title('Healthcare Feature Correlations')
    mlflow.log_figure(corr_fig, "feature_correlations.png")

    # Healthcare risk insights: smokers above both the BMI and the age cut-off
    high_risk_patients = df[
        (df['smoker'] == True) &
        (df['bmi'] > HIGH_RISK_BMI_THRESHOLD) &
        (df['age'] > HIGH_RISK_AGE_THRESHOLD)
    ]

    high_risk_metrics = {"high_risk_patients_count": len(high_risk_patients)}
    high_risk_avg_cost = high_risk_patients['charges'].mean()
    # The mean of an empty (or all-null) cohort is NaN; only log it when it is defined.
    if pd.notna(high_risk_avg_cost):
        high_risk_metrics["high_risk_avg_cost"] = float(high_risk_avg_cost)
    mlflow.log_metrics(high_risk_metrics)


In [0]:
from databricks.feature_engineering import FeatureEngineeringClient
from pyspark.sql import Window
from pyspark.sql.functions import col, expr, coalesce

# Feature Engineering client; used at the end of this cell to register the feature table.
fe = FeatureEngineeringClient()

# Healthcare insurance feature engineering with new schema
def create_healthcare_features(source_table="juan_dev.healthcare_data.dim_patients"):
    """Build the insurance-risk feature DataFrame from the patient dimension table.

    Reads ``source_table`` through the notebook's ``spark`` session, keeps rows
    where ``is_current_record`` is true, and appends these derived columns
    (all original columns are retained):

    * ``age``, ``bmi``, ``children`` – fixed numeric stand-ins chosen per
      ``patient_age_category`` / ``patient_bmi_category`` /
      ``patient_family_size_category``.
    * ``smoker`` – ``patient_smoking_status == 'SMOKER'``.
    * ``region``, ``sex`` – copies of ``patient_region`` / ``patient_sex``.
    * ``age_risk_score``, ``smoking_impact``, ``family_size_factor``,
      ``regional_multiplier`` – rule-based scores computed from the columns above.
    * ``health_risk_composite`` – ``health_risk_score`` when present, otherwise a
      score rebuilt from age risk, smoking and BMI.
    * ``data_quality_score``, ``hipaa_compliant`` – copies of
      ``patient_data_quality_score`` / ``hipaa_deidentification_applied``.
    * ``customer_id`` – copy of ``patient_natural_key``; the feature key.

    Args:
        source_table: Fully qualified name of the patient dimension table.
            Defaults to the table the notebook has always used.

    Returns:
        A Spark DataFrame with one row per non-null ``customer_id``.
    """
    # Load data from new healthcare schema - filter for current records
    current_patients = spark.table(source_table).filter(col("is_current_record") == True)

    # Map categorical fields back to numeric for ML compatibility.
    # NOTE(review): every CASE below sends NULL or unrecognised categories to its
    # ELSE branch, so missing data is scored as the last bucket (age 70,
    # bmi 32.5, 4 children, multiplier 0.95) — confirm that is intended.
    # NOTE(review): unsuffixed fractional SQL literals (17.5, 2.5, 0.15, ...) are
    # presumably typed as DECIMAL by Spark SQL — confirm the column types suit
    # downstream ML code.
    healthcare_features = (
        current_patients
        # Convert age category to numeric age (approximate middle values)
        .withColumn(
            "age",
            expr(
                "CASE WHEN patient_age_category = 'YOUNG' THEN 25 "
                "WHEN patient_age_category = 'ADULT' THEN 35 "
                "WHEN patient_age_category = 'MIDDLE_AGED' THEN 45 "
                "WHEN patient_age_category = 'SENIOR' THEN 60 "
                "ELSE 70 END"
            ),
        )

        # Convert BMI category to numeric BMI (approximate values)
        .withColumn(
            "bmi",
            expr(
                "CASE WHEN patient_bmi_category = 'UNDERWEIGHT' THEN 17.5 "
                "WHEN patient_bmi_category = 'NORMAL' THEN 22.5 "
                "WHEN patient_bmi_category = 'OVERWEIGHT' THEN 27.5 "
                "ELSE 32.5 END"
            ),
        )

        # Convert family size category to numeric children count
        .withColumn(
            "children",
            expr(
                "CASE WHEN patient_family_size_category = 'SINGLE' THEN 0 "
                "WHEN patient_family_size_category = 'COUPLE' THEN 0 "
                "WHEN patient_family_size_category = 'SMALL_FAMILY' THEN 1 "
                "WHEN patient_family_size_category = 'MEDIUM_FAMILY' THEN 2 "
                "ELSE 4 END"
            ),
        )

        # Convert smoking status to boolean (NULL status stays NULL)
        .withColumn("smoker", col("patient_smoking_status") == "SMOKER")

        # Use region and sex directly
        .withColumn("region", col("patient_region"))
        .withColumn("sex", col("patient_sex"))

        # Age risk scoring (recreated from original logic).
        # NOTE(review): `age` only takes the values 25/35/45/60/70 here, so these
        # strict thresholds yield 2/3/3/4/5 — score 1 is never produced and
        # ADULT and MIDDLE_AGED share score 3. Confirm this is acceptable.
        .withColumn(
            "age_risk_score",
            expr(
                "CASE WHEN age < 25 THEN 1 "
                "WHEN age < 35 THEN 2 "
                "WHEN age < 50 THEN 3 "
                "WHEN age < 65 THEN 4 "
                "ELSE 5 END"
            ),
        )

        # Smoking impact factor (recreated); a NULL smoker flag takes the ELSE branch
        .withColumn(
            "smoking_impact",
            expr("CASE WHEN smoker THEN age * 2.5 ELSE age * 1.0 END"),
        )

        # Family size risk adjustment (recreated)
        .withColumn("family_size_factor", expr("1 + (children * 0.15)"))

        # Regional cost adjustment (recreated)
        .withColumn(
            "regional_multiplier",
            expr(
                "CASE WHEN region = 'NORTHEAST' THEN 1.2 "
                "WHEN region = 'NORTHWEST' THEN 1.1 "
                "WHEN region = 'SOUTHEAST' THEN 1.0 "
                "ELSE 0.95 END"
            ),
        )

        # Use existing health risk composite score or create new one
        .withColumn(
            "health_risk_composite",
            coalesce(
                col("health_risk_score"),
                expr(
                    "(age_risk_score * 20) + "
                    "(CASE WHEN smoker THEN 50 ELSE 0 END) + "
                    "(CASE WHEN bmi > 30 THEN 30 ELSE 0 END)"
                ),
            ),
        )

        # Add new features from healthcare schema
        .withColumn("data_quality_score", col("patient_data_quality_score"))
        .withColumn("hipaa_compliant", col("hipaa_deidentification_applied"))

        # Use patient_natural_key as the primary key for features
        .withColumn("customer_id", col("patient_natural_key"))
    )

    # A primary key must identify a row: drop rows without a key (previously all
    # NULL-key rows were collapsed into one arbitrary row), then ensure each
    # remaining customer_id appears once. Which duplicate survives is arbitrary.
    healthcare_features = (
        healthcare_features
        .filter(col("customer_id").isNotNull())
        .dropDuplicates(["customer_id"])
    )

    return healthcare_features

# Create (or refresh) the feature table in Unity Catalog
FEATURE_TABLE_NAME = "juan_dev.healthcare_data.ml_insurance_features"

healthcare_features_df = create_healthcare_features()

if spark.catalog.tableExists(FEATURE_TABLE_NAME):
    # create_table is expected to fail once the table exists, which made this
    # cell break on every re-run. Upsert on the primary key instead; rows that
    # are no longer present in the DataFrame are left untouched by a merge.
    fe.write_table(
        name=FEATURE_TABLE_NAME,
        df=healthcare_features_df,
        mode="merge",
    )
else:
    # First run: register the table with customer_id as its primary key.
    fe.create_table(
        name=FEATURE_TABLE_NAME,
        primary_keys=["customer_id"],
        df=healthcare_features_df,
        description="Healthcare-specific features for insurance risk prediction using new schema"
    )