In [0]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from databricks.feature_engineering import FeatureEngineeringClient
from databricks.feature_engineering import FeatureLookup
import pandas as pd
import numpy as np

from pyspark.sql.functions import *

# Point the MLflow model registry at Unity Catalog, then select the experiment
# under which the runs started later in this notebook are recorded.
mlflow.set_registry_uri(uri="databricks-uc")
mlflow.set_experiment(
    experiment_name="/Users/juan.lamadrid@databricks.com/experiments/insurance_cost_prediction_eda"
)

In [0]:
class HealthcareInsuranceModel:
    """
    Trainer for the health-risk regression model on the new healthcare schema.

    The model predicts ``health_risk_score`` instead of charges. Typical use:
    ``prepare_training_data()`` builds a feature-store training set, and
    ``train_model()`` fits a regressor on it, logs parameters/metrics to MLflow
    and registers the fitted pipeline through the feature-engineering client.
    """

    def __init__(self):
        # Client used both to assemble the training set and to log the model.
        self.fe = FeatureEngineeringClient()
        # Populated by train_model() with the most recently fitted pipeline.
        self.model_pipeline = None

    def prepare_training_data(self):
        """
        Prepare the training dataset using the new healthcare schema.

        Reads the current rows of ``dim_patients``, exposes
        ``patient_natural_key`` as ``customer_id`` and uses that key to look up
        the engineered features in ``ml_insurance_features``.

        Returns:
            The training set produced by ``FeatureEngineeringClient.create_training_set``
            with ``health_risk_score`` as the label.
        """
        # Load base training data from new schema - filter for current records
        base_df = spark.table("juan_dev.healthcare_data.dim_patients").filter(
            col("is_current_record") == True  # noqa: E712 - Spark column comparison
        )

        feature_table_name = "juan_dev.healthcare_data.ml_insurance_features"

        # One lookup for all engineered features: they share the same table and
        # key, and ``feature_names`` supersedes the deprecated ``feature_name``.
        feature_lookups = [
            FeatureLookup(
                table_name=feature_table_name,
                lookup_key="customer_id",
                feature_names=[
                    "age_risk_score",
                    "smoking_impact",
                    "family_size_factor",
                    "regional_multiplier",
                    "health_risk_composite",
                    "data_quality_score",
                ],
            )
        ]

        # Create training set with automatic feature joining.
        # health_risk_score is the label (it replaced charges).
        training_set = self.fe.create_training_set(
            df=base_df.withColumn("customer_id", col("patient_natural_key")),
            feature_lookups=feature_lookups,
            label="health_risk_score",
            exclude_columns=["timestamp", "ingestion_timestamp", "effective_from_date", "effective_to_date"]
        )

        return training_set

    def create_preprocessing_pipeline(self):
        """
        Create a preprocessing transformer for the new schema fields.

        Returns:
            A tuple ``(preprocessor, categorical_features, numerical_features)``
            where ``preprocessor`` is an unfitted ``ColumnTransformer`` and the
            two lists name the columns each branch consumes.
        """
        # Local import: OrdinalEncoder is not among this file's top-level imports.
        from sklearn.preprocessing import OrdinalEncoder

        # Updated feature columns for new schema
        categorical_features = ['patient_sex', 'patient_region']  # Use new schema column names
        numerical_features = [
            'age_risk_score', 'smoking_impact', 'family_size_factor',
            'health_risk_composite', 'regional_multiplier', 'data_quality_score'
        ]

        # LabelEncoder only encodes a single target vector and cannot be used
        # as a ColumnTransformer step; OrdinalEncoder is the feature-matrix
        # equivalent. Categories unseen at fit time are encoded as -1.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                (
                    'cat',
                    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
                    categorical_features,
                ),
            ],
            remainder='drop'
        )

        return preprocessor, categorical_features, numerical_features

    @staticmethod
    def _build_estimator(model_type):
        """Return an unfitted regressor for ``model_type`` or raise ``ValueError``."""
        if model_type == "random_forest":
            return RandomForestRegressor(
                n_estimators=100,
                max_depth=10,
                min_samples_split=5,
                min_samples_leaf=2,
                random_state=42
            )
        if model_type == "gradient_boosting":
            return GradientBoostingRegressor(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=6,
                random_state=42
            )
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            "expected 'random_forest' or 'gradient_boosting'"
        )

    def train_model(self, training_set, model_type="random_forest"):
        """
        Train the healthcare risk prediction model and register it.

        Args:
            training_set: Object returned by ``prepare_training_data()``; its
                ``load_df()`` result is converted to pandas for training.
            model_type: ``"random_forest"`` (default) or ``"gradient_boosting"``.

        Returns:
            Whatever ``FeatureEngineeringClient.log_model`` returns for the
            registered model.

        Raises:
            ValueError: If ``model_type`` is not one of the supported values.
        """
        # Resolve the estimator first so an unsupported model_type fails
        # before an MLflow run is opened under a misleading name.
        base_model = self._build_estimator(model_type)

        with mlflow.start_run(run_name=f"healthcare_risk_model_{model_type}_new_schema"):
            # Load training data
            training_df = training_set.load_df().toPandas()

            print(f"Training data shape: {training_df.shape}")
            print(f"Training data columns: {training_df.columns.tolist()}")

            # Only the column lists are needed here; encoding and scaling are
            # done manually below so the fitted pieces can be embedded in the
            # logged pipeline.
            _, categorical_features, numerical_features = self.create_preprocessing_pipeline()

            # Define all feature columns we'll use
            feature_columns = numerical_features + categorical_features

            # Prepare feature matrix and target (health_risk_score instead of charges)
            X = training_df[feature_columns]
            y = training_df['health_risk_score']  # Updated target variable

            print(f"Feature columns: {feature_columns}")
            print(f"X shape: {X.shape}, y shape: {y.shape}")
            print(f"Target variable stats: mean={y.mean():.2f}, std={y.std():.2f}, min={y.min()}, max={y.max()}")

            # Encode categorical features and keep the encoders for inference.
            # The encoders see every row so that categories present only in the
            # held-out split still receive a code.
            X_encoded = X.copy()
            label_encoders = {}
            for feature in categorical_features:
                encoder = LabelEncoder()
                X_encoded[feature] = encoder.fit_transform(X[feature].astype(str))
                label_encoders[feature] = encoder

            # Split BEFORE scaling so the scaler never sees test statistics.
            X_train_raw, X_test_raw, y_train, y_test = train_test_split(
                X_encoded, y, test_size=0.2, random_state=42
            )

            # StandardScaler for all features, fitted on the training split only
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train_raw)
            X_test = scaler.transform(X_test_raw)

            # Custom pipeline for new schema
            class HealthcareRiskPipeline:
                """
                Bundle of fitted encoders, scaler and regressor.

                ``fit`` expects an already encoded and scaled matrix, whereas
                ``predict`` accepts a raw DataFrame containing
                ``feature_columns`` and applies encoding and scaling itself.
                """

                # Code assigned to categories that were not seen during training.
                UNKNOWN_CATEGORY_CODE = 0

                def __init__(self, label_encoders, scaler, model, feature_columns):
                    self.label_encoders = label_encoders
                    self.scaler = scaler
                    self.model = model
                    self.feature_columns = feature_columns
                    self.categorical_features = list(label_encoders.keys())

                def fit(self, X, y):
                    """Fit the wrapped regressor on preprocessed data; return self."""
                    self.model.fit(X, y)
                    return self

                def predict(self, X):
                    """Encode, scale and score the raw feature frame ``X``."""
                    # Apply the same preprocessing steps as training
                    X_processed = X[self.feature_columns].copy()

                    # Encode categorical features with the training-time codes.
                    for feature in self.categorical_features:
                        classes = self.label_encoders[feature].classes_
                        codes = {label: code for code, label in enumerate(classes)}
                        # Compare stringified values, exactly as during fitting.
                        encoded = X_processed[feature].astype(str).map(codes)
                        if encoded.isna().any():
                            print(f"Warning: Unknown category in {feature}, using fallback")
                            encoded = encoded.fillna(self.UNKNOWN_CATEGORY_CODE)
                        X_processed[feature] = encoded.astype(int)

                    # Scale features
                    X_scaled = self.scaler.transform(X_processed)

                    # Make prediction
                    return self.model.predict(X_scaled)

                def get_params(self, deep=True):
                    """Expose the wrapped regressor's parameters."""
                    return self.model.get_params(deep)

                def set_params(self, **params):
                    """Forward parameters to the wrapped regressor; return self."""
                    self.model.set_params(**params)
                    return self

            # Create the custom pipeline
            healthcare_pipeline = HealthcareRiskPipeline(
                label_encoders=label_encoders,
                scaler=scaler,
                model=base_model,
                feature_columns=feature_columns
            )

            # Fit the pipeline
            healthcare_pipeline.fit(X_train, y_train)

            # Store the pipeline
            self.model_pipeline = healthcare_pipeline

            # Smoke check: the pipeline that gets logged must be able to score
            # raw (unencoded, unscaled) rows end to end.
            healthcare_pipeline.predict(training_df[feature_columns].head())

            # Model evaluation on the held-out split
            y_test_pred = base_model.predict(X_test)

            # Calculate metrics
            r2 = r2_score(y_test, y_test_pred)
            mae = mean_absolute_error(y_test, y_test_pred)
            # ``squared=False`` was removed from mean_squared_error in recent
            # scikit-learn releases; take the square root explicitly instead.
            rmse = float(np.sqrt(mean_squared_error(y_test, y_test_pred)))

            # Cross-validation with the scaler refitted inside every fold.
            # cross_val_score clones the estimator, so the fitted base_model
            # is left untouched.
            cv_estimator = Pipeline([("scaler", StandardScaler()), ("model", base_model)])
            cv_scores = cross_val_score(cv_estimator, X_encoded, y, cv=5, scoring='r2')

            # Healthcare-specific metrics for risk prediction
            high_risk_threshold = training_df['health_risk_score'].quantile(0.95)
            high_risk_accuracy = self._evaluate_high_risk_predictions(
                y_test, y_test_pred, high_risk_threshold
            )

            # Log parameters and metrics
            mlflow.log_params({
                "model_type": model_type,
                "n_features": len(feature_columns),
                "training_samples": len(X_train),
                "test_samples": len(X_test),
                "preprocessing": "custom_pipeline_new_schema",
                "target_variable": "health_risk_score",
                "schema_version": "healthcare_v2"
            })

            mlflow.log_metrics({
                "r2_score": r2,
                "mean_absolute_error": mae,
                "root_mean_squared_error": rmse,
                "cv_r2_mean": cv_scores.mean(),
                "cv_r2_std": cv_scores.std(),
                "high_risk_accuracy": high_risk_accuracy,
                "target_mean": y.mean(),
                "target_std": y.std()
            })

            # Log the complete pipeline with feature engineering integration
            model_info = self.fe.log_model(
                model=healthcare_pipeline,
                artifact_path="model",
                flavor=mlflow.sklearn,
                training_set=training_set,
                registered_model_name="juan_dev.healthcare_data.insurance_model",
                metadata={
                    "algorithm": model_type,
                    "preprocessing": "embedded_pipeline",
                    "healthcare_compliance": "HIPAA_ready",
                    "model_purpose": "health_risk_prediction",
                    "feature_count": len(feature_columns),
                    "training_data_size": len(training_df),
                    "categorical_features": categorical_features,
                    "numerical_features": numerical_features,
                    "schema_version": "healthcare_v2",
                    "target_variable": "health_risk_score"
                }
            )

            return model_info

    def _evaluate_high_risk_predictions(self, y_true, y_pred, threshold):
        """
        Evaluate model agreement on the high-risk classification.

        A sample counts as high risk when its score is ``>= threshold``.

        Args:
            y_true: Observed scores (array-like).
            y_pred: Predicted scores, aligned positionally with ``y_true``.
            threshold: Score at or above which a patient is high risk.

        Returns:
            Fraction of samples for which prediction and observation agree on
            the high-risk flag, as a plain ``float``.
        """
        # Convert to arrays so the comparison is positional regardless of any
        # pandas index carried by y_true.
        actual_high = np.asarray(y_true) >= threshold
        predicted_high = np.asarray(y_pred) >= threshold
        return float((actual_high == predicted_high).mean())


# Driver cell: run the full train-and-register flow against the new schema.
print("Starting healthcare risk model training with new schema...")

# Instantiate the trainer class defined above.
trainer = HealthcareInsuranceModel()

# Assemble the feature-store training set.
print("Preparing training data with new healthcare schema...")
training_set = trainer.prepare_training_data()

# Fit, log and register the Random Forest variant.
print("Training Random Forest model for health risk prediction...")
rf_model_info = trainer.train_model(training_set, model_type="random_forest")

print("Health risk model training completed successfully!")
# Optional diagnostics, left disabled:
# print(f"Model version: {rf_model_info.model_version}")
# print(f"Model URI: {rf_model_info.model_uri}")