In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import *
import mlflow

In [0]:
# Start MLflow experiment for EDA tracking
mlflow.set_experiment("/Users/juan.lamadrid@databricks.com/experiments/insurance_cost_prediction_eda")

with mlflow.start_run(run_name="healthcare_insurance_eda_new_schema"):
    # Load data from new healthcare schema - filter for current records only.
    # toPandas() collects the filtered table onto the driver as a pandas DataFrame.
    df = spark.table("juan_dev.healthcare_data.dim_patients").filter(col("is_current_record") == True).toPandas()

    # Healthcare-specific data profiling with new schema.
    # Every entry is a scalar except "missing_data_percentage", which is a
    # per-column pandas Series (percentage of null values in each column).
    eda_results = {
        "total_patients": len(df),
        "average_health_risk_score": df['health_risk_score'].mean(),
        "smoker_percentage": (df['patient_smoking_status'] == 'SMOKER').sum() / len(df) * 100,
        "data_quality_avg": df['patient_data_quality_score'].mean(),
        "hipaa_compliance_rate": df['hipaa_deidentification_applied'].sum() / len(df) * 100,
        "missing_data_percentage": (df.isnull().sum() / len(df)) * 100
    }

    # Log healthcare compliance metrics.
    # MLflow metrics must be scalars, so a Series-valued entry is reduced to its mean.
    # Because every column has the same number of rows, the mean of the per-column
    # missing percentages is the overall percentage of missing cells. For a
    # single-column Series this is the same value as taking its only element.
    scalar_metrics = {}
    for metric_name, metric_value in eda_results.items():
        if isinstance(metric_value, pd.Series):
            metric_value = metric_value.mean()
        scalar_metrics[metric_name] = float(metric_value)
    mlflow.log_metrics(scalar_metrics)

    # Risk factor analysis with new schema: per (smoking status, age category) summary
    # of the risk score and the mean data quality score, rounded to 2 decimals.
    # The result is kept as a cell-level variable; it is not logged to MLflow here.
    risk_analysis = df.groupby(['patient_smoking_status', 'patient_age_category']).agg({
        'health_risk_score': ['mean', 'median', 'std'],
        'patient_data_quality_score': 'mean'
    }).round(2)

    # Log visualizations with new schema fields.
    # Explicit figure/axes handles make it unambiguous which figure is logged.
    risk_fig, risk_ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(data=df, x='patient_region', y='health_risk_score', hue='patient_smoking_status', ax=risk_ax)
    risk_ax.set_title('Health Risk Scores by Region and Smoking Status')
    risk_ax.tick_params(axis='x', labelrotation=45)
    mlflow.log_figure(risk_fig, "risk_distribution_by_region_smoking.png")

    # Feature correlation analysis with new schema
    numeric_cols = ['health_risk_score', 'patient_data_quality_score', 'patient_data_completeness_score']
    correlation_matrix = df[numeric_cols].corr()
    corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=corr_ax)
    corr_ax.set_title('Healthcare Feature Correlations')
    mlflow.log_figure(corr_fig, "feature_correlations.png")

    # Healthcare risk insights with new schema: patients who are smokers, obese,
    # and in the SENIOR or ELDERLY age categories.
    high_risk_patients = df[
        (df['patient_smoking_status'] == 'SMOKER') &
        (df['patient_bmi_category'] == 'OBESE') &
        (df['patient_age_category'].isin(['SENIOR', 'ELDERLY']))
    ]

    # The mean of an empty selection would be NaN, so 0 is logged when no patient matches.
    mlflow.log_metrics({
        "high_risk_patients_count": len(high_risk_patients),
        "high_risk_avg_score": high_risk_patients['health_risk_score'].mean() if len(high_risk_patients) > 0 else 0
    })

In [0]:
from databricks.feature_engineering import FeatureEngineeringClient
from pyspark.sql import Window
from pyspark.sql.functions import col, expr, coalesce

# Feature Engineering client; used further down in this cell to register the feature table.
fe = FeatureEngineeringClient()

# Healthcare insurance feature engineering with new schema
def create_healthcare_features():
    """Build the insurance feature set from the current patient dimension rows.

    Reads ``juan_dev.healthcare_data.dim_patients``, keeps the rows where
    ``is_current_record`` is true, and adds these derived columns (all source
    columns are retained as well):

    ``age``, ``bmi``, ``children``, ``smoker``, ``region``, ``sex``,
    ``age_risk_score``, ``smoking_impact``, ``family_size_factor``,
    ``regional_multiplier``, ``health_risk_composite``, ``data_quality_score``,
    ``hipaa_compliant`` and ``customer_id``.

    Returns:
        A Spark DataFrame with one row per ``customer_id``. Duplicates are
        removed with ``dropDuplicates``; which of several duplicate rows is
        kept is not controlled here.

    Notes:
        * Every ``CASE`` expression ends in an ``ELSE`` branch, so a NULL or
          unrecognised category receives the ``ELSE`` value (age 70, BMI 32.5,
          4 children, regional multiplier 0.95).
        * NOTE(review): the mapped ages are 25/35/45/60/70, so ``age < 25``
          never matches. ``age_risk_score`` 1 is therefore unreachable and
          ADULT (35) and MIDDLE_AGED (45) both score 3 - confirm this is intended.
    """
    current_patients = spark.table("juan_dev.healthcare_data.dim_patients").filter(
        col("is_current_record") == True
    )

    # Ordered (column name, expression) pairs. The order is significant: later
    # expressions reference columns introduced by earlier ones
    # (age, bmi, children, smoker, region, age_risk_score).
    derived_columns = [
        # Age category -> approximate numeric age (middle of each band)
        (
            "age",
            expr(
                "CASE WHEN patient_age_category = 'YOUNG' THEN 25 "
                "WHEN patient_age_category = 'ADULT' THEN 35 "
                "WHEN patient_age_category = 'MIDDLE_AGED' THEN 45 "
                "WHEN patient_age_category = 'SENIOR' THEN 60 "
                "ELSE 70 END"
            ),
        ),
        # BMI category -> approximate numeric BMI
        (
            "bmi",
            expr(
                "CASE WHEN patient_bmi_category = 'UNDERWEIGHT' THEN 17.5 "
                "WHEN patient_bmi_category = 'NORMAL' THEN 22.5 "
                "WHEN patient_bmi_category = 'OVERWEIGHT' THEN 27.5 "
                "ELSE 32.5 END"
            ),
        ),
        # Family size category -> number of children
        (
            "children",
            expr(
                "CASE WHEN patient_family_size_category = 'SINGLE' THEN 0 "
                "WHEN patient_family_size_category = 'COUPLE' THEN 0 "
                "WHEN patient_family_size_category = 'SMALL_FAMILY' THEN 1 "
                "WHEN patient_family_size_category = 'MEDIUM_FAMILY' THEN 2 "
                "ELSE 4 END"
            ),
        ),
        # Smoking status -> boolean flag
        ("smoker", col("patient_smoking_status") == "SMOKER"),
        # Region and sex are copied through under shorter names
        ("region", col("patient_region")),
        ("sex", col("patient_sex")),
        # Age risk tier derived from the approximated age
        (
            "age_risk_score",
            expr(
                "CASE WHEN age < 25 THEN 1 "
                "WHEN age < 35 THEN 2 "
                "WHEN age < 50 THEN 3 "
                "WHEN age < 65 THEN 4 "
                "ELSE 5 END"
            ),
        ),
        # Smokers get their age weighted by 2.5, everyone else by 1.0
        ("smoking_impact", expr("CASE WHEN smoker THEN age * 2.5 ELSE age * 1.0 END")),
        # Each child adds 0.15 to a base factor of 1
        ("family_size_factor", expr("1 + (children * 0.15)")),
        # Per-region cost multiplier
        (
            "regional_multiplier",
            expr(
                "CASE WHEN region = 'NORTHEAST' THEN 1.2 "
                "WHEN region = 'NORTHWEST' THEN 1.1 "
                "WHEN region = 'SOUTHEAST' THEN 1.0 "
                "ELSE 0.95 END"
            ),
        ),
        # Prefer the stored health_risk_score; fall back to a computed composite when it is NULL
        (
            "health_risk_composite",
            coalesce(
                col("health_risk_score"),
                expr(
                    "(age_risk_score * 20) + "
                    "(CASE WHEN smoker THEN 50 ELSE 0 END) + "
                    "(CASE WHEN bmi > 30 THEN 30 ELSE 0 END)"
                ),
            ),
        ),
        # Pass-through columns from the healthcare schema under feature-friendly names
        ("data_quality_score", col("patient_data_quality_score")),
        ("hipaa_compliant", col("hipaa_deidentification_applied")),
        # patient_natural_key serves as the feature table's primary key
        ("customer_id", col("patient_natural_key")),
    ]

    enriched = current_patients
    for column_name, column_expression in derived_columns:
        enriched = enriched.withColumn(column_name, column_expression)

    # Ensure unique customer_id
    return enriched.dropDuplicates(["customer_id"])

# Create (or refresh) the feature table in Unity Catalog
healthcare_features_df = create_healthcare_features()

feature_table_name = "juan_dev.healthcare_data.ml_insurance_features"

# Make the cell safe to re-run (e.g. Restart & Run All): create the table only
# when it does not exist yet, otherwise upsert the recomputed features by
# primary key.
# NOTE(review): create_table is presumed not to be a safe way to refresh an
# existing table - confirm against the feature engineering client version in use.
if spark.catalog.tableExists(feature_table_name):
    fe.write_table(
        name=feature_table_name,
        df=healthcare_features_df,
        mode="merge",
    )
else:
    fe.create_table(
        name=feature_table_name,
        primary_keys=["customer_id"],
        df=healthcare_features_df,
        description="Healthcare-specific features for insurance risk prediction using new schema"
    )