In [0]:
from databricks.feature_engineering import FeatureEngineeringClient
from pyspark.sql.functions import *
from pyspark.sql import Window
import mlflow

# Shared Feature Engineering client; used further down in this notebook to
# create the feature table and to write the feature DataFrame into it.
fe = FeatureEngineeringClient()

In [0]:
def create_healthcare_patient_features():
    """Assemble the patient feature DataFrame from the silver patients table.

    Reads ``juan_dev.healthcare_data.silver_patients`` via the ``spark``
    session (not defined in this function; presumably supplied by the
    notebook runtime), aliases the source columns to short names, adds the
    derived risk/cost columns, and projects the final set of columns.

    Returns:
        A Spark DataFrame containing ``patient_id``, the aliased base
        columns, the derived columns, ``data_quality_score``,
        ``hipaa_compliant`` and ``processed_at``.
    """
    # (short name, source column in silver_patients), applied in this order.
    base_aliases = [
        ("age", "age_years"),
        ("sex", "sex_standardized"),
        ("region", "region_standardized"),
        ("bmi", "bmi_validated"),
        ("children", "children_count"),
        ("smoker", "smoker_flag"),
        ("charges", "insurance_charges"),  # target variable
    ]

    # Derived columns as Spark SQL expressions over the short names above.
    # Insertion order matters: health_risk_composite reads age_risk_score.
    derived_sql = {
        # Age bucketed into a 1-5 score.
        "age_risk_score": (
            "CASE WHEN age < 25 THEN 1 "
            "WHEN age < 35 THEN 2 "
            "WHEN age < 50 THEN 3 "
            "WHEN age < 65 THEN 4 "
            "ELSE 5 END"
        ),
        # Age weighted by smoking status.
        "smoking_impact": "CASE WHEN smoker THEN age * 2.5 ELSE age * 1.0 END",
        # Adjustment that grows with the number of children.
        "family_size_factor": "1 + (children * 0.15)",
        # Per-region cost multiplier; unlisted regions fall through to 0.95.
        "regional_multiplier": (
            "CASE WHEN region = 'northeast' THEN 1.2 "
            "WHEN region = 'northwest' THEN 1.1 "
            "WHEN region = 'southeast' THEN 1.0 "
            "ELSE 0.95 END"
        ),
        # Additive score from age bucket, smoking status and bmi > 30.
        "health_risk_composite": (
            "(age_risk_score * 20) + "
            "(CASE WHEN smoker THEN 50 ELSE 0 END) + "
            "(CASE WHEN bmi > 30 THEN 30 ELSE 0 END)"
        ),
    }

    # Final projection; patient_id serves as the feature table key.
    output_columns = [
        "patient_id",
        "age", "sex", "region", "bmi",
        "children", "smoker", "charges",
        "age_risk_score", "smoking_impact",
        "family_size_factor", "regional_multiplier",
        "health_risk_composite", "data_quality_score",
        "hipaa_compliant", "processed_at",
    ]

    patients = spark.table("juan_dev.healthcare_data.silver_patients")

    for short_name, source_name in base_aliases:
        patients = patients.withColumn(short_name, col(source_name))

    for feature_name, sql_text in derived_sql.items():
        patients = patients.withColumn(feature_name, expr(sql_text))

    # Healthcare-specific columns: default a missing quality score to 0.8
    # and expose the de-identification flag under a shorter name.
    patients = patients.withColumn(
        "data_quality_score",
        coalesce(col("data_quality_score"), lit(0.8)),
    )
    patients = patients.withColumn("hipaa_compliant", col("hipaa_deidentified"))

    return patients.select(*[col(name) for name in output_columns])

# Build the feature DataFrame.
print("Creating healthcare patient features...")
healthcare_features_df = create_healthcare_patient_features()

# Display sample to verify
healthcare_features_df.show(5)

# Fully qualified (catalog.schema.table) name of the feature table.
feature_table_name = "juan_dev.healthcare_data.patient_features"

# Decide between "create" and "update" with an explicit existence check rather
# than by catching every exception from create_table: a blanket except would
# also treat permission or schema errors as "table already exists" and hide
# the real failure behind a second write attempt.
if spark.catalog.tableExists(feature_table_name):
    # FeatureEngineeringClient.write_table only supports mode="merge"
    # (upsert on the primary key); "overwrite" is rejected. Rows already in
    # the table whose patient_id is absent from the DataFrame are therefore
    # kept. To fully replace the contents, delete the rows (or drop and
    # recreate the table) before running this cell.
    fe.write_table(
        name=feature_table_name,
        df=healthcare_features_df,
        mode="merge"
    )
    print(f"Updated existing feature table: {feature_table_name}")
else:
    fe.create_table(
        name=feature_table_name,
        primary_keys=["patient_id"],
        df=healthcare_features_df,
        description="Healthcare patient features for insurance cost prediction using new schema"
    )
    print(f"Successfully created feature table: {feature_table_name}")