In [0]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from databricks.feature_engineering import FeatureEngineeringClient
from databricks.feature_engineering import FeatureLookup
import pandas as pd
import numpy as np


# Use the Unity Catalog registry ("databricks-uc") for registered models, then
# select the MLflow experiment that runs started below are recorded under.
mlflow.set_registry_uri(uri="databricks-uc")
mlflow.set_experiment(
    experiment_name="/Users/juan.lamadrid@databricks.com/experiments/insurance_cost_prediction_eda"
)

class HealthcareInsuranceModelV2:
    """
    Improved healthcare insurance model with embedded preprocessing pipeline.

    Categorical encoding, numerical scaling and the regressor are combined in
    a single scikit-learn ``Pipeline``. The preprocessing is therefore fitted
    on the training split only and re-applied unchanged at inference time,
    which keeps training and inference consistent.

    Attributes:
        fe: ``FeatureEngineeringClient`` used to build the training set and
            to log the model.
        model_pipeline: The fitted ``Pipeline`` (preprocessing + regressor);
            ``None`` until ``train_model`` has been called.
    """

    # Unity Catalog objects read by prepare_training_data / written by train_model.
    SOURCE_TABLE = "juan_dev.ml.insurance_silver"
    FEATURE_TABLE = "juan_dev.ml.healthcare_features"
    REGISTERED_MODEL_NAME = "juan_dev.ml.healthcare_insurance_model_v2"
    LOOKUP_KEY = "customer_id"
    LABEL_COLUMN = "charges"
    # Engineered features joined from FEATURE_TABLE, in lookup order.
    LOOKUP_FEATURES = (
        "age_risk_score",
        "smoking_impact",
        "family_size_factor",
        "regional_multiplier",
        "health_risk_composite",
    )

    def __init__(self):
        self.fe = FeatureEngineeringClient()
        self.model_pipeline = None  # Will contain the full preprocessing + model pipeline

    def prepare_training_data(self):
        """Build the training set by joining engineered features onto the base table.

        Returns:
            The object returned by ``FeatureEngineeringClient.create_training_set``
            with ``charges`` as label and the two timestamp columns excluded.
        """
        # `spark` is the session object provided by the notebook runtime.
        base_df = spark.table(self.SOURCE_TABLE)

        # One lookup per engineered feature, all keyed on the customer id.
        # These should match the feature engineering that populated the table.
        feature_lookups = [
            FeatureLookup(
                table_name=self.FEATURE_TABLE,
                lookup_key=self.LOOKUP_KEY,
                feature_name=feature_name,
            )
            for feature_name in self.LOOKUP_FEATURES
        ]

        # Create training set with automatic feature joining
        return self.fe.create_training_set(
            df=base_df,
            feature_lookups=feature_lookups,
            label=self.LABEL_COLUMN,
            exclude_columns=["timestamp", "ingestion_timestamp"],
        )

    def create_preprocessing_pipeline(self):
        """
        Create the preprocessing step shared by training and inference.

        Numerical columns are standardized; categorical columns are mapped to
        integer codes. Columns not listed here are dropped.

        Returns:
            A tuple ``(preprocessor, categorical_features, numerical_features)``
            where ``preprocessor`` is an unfitted ``ColumnTransformer``.
        """
        categorical_features = ['sex', 'region']  # Keep these as raw categorical names
        numerical_features = [
            'age', 'bmi', 'children', 'age_risk_score',
            'smoking_impact', 'family_size_factor',
            'health_risk_composite', 'regional_multiplier'
        ]

        # LabelEncoder is a *target* encoder (its fit() takes only y) and
        # cannot be used inside a ColumnTransformer. OrdinalEncoder is the
        # feature-side equivalent; categories not seen during fit are encoded
        # as -1 instead of raising at prediction time.
        # NOTE(review): missing values in the categorical columns are not
        # imputed here — confirm the source table has none.
        categorical_encoder = OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                ('cat', categorical_encoder, categorical_features),
            ],
            remainder='drop'  # Drop any columns not specified
        )

        return preprocessor, categorical_features, numerical_features

    @staticmethod
    def _build_estimator(model_type):
        """Return the unfitted regressor for ``model_type``.

        ``"random_forest"`` selects a random forest; any other value selects
        gradient boosting.
        """
        if model_type == "random_forest":
            return RandomForestRegressor(
                n_estimators=100,
                max_depth=10,
                min_samples_split=5,
                min_samples_leaf=2,
                random_state=42
            )
        return GradientBoostingRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=6,
            random_state=42
        )

    def train_model(self, training_set, model_type="random_forest"):
        """Train, evaluate and log the insurance cost model inside one MLflow run.

        Args:
            training_set: Training set as returned by ``prepare_training_data``.
            model_type: ``"random_forest"`` (default); any other value trains a
                gradient boosting regressor.

        Returns:
            Whatever ``FeatureEngineeringClient.log_model`` returns for the
            logged pipeline.
        """
        with mlflow.start_run(run_name=f"healthcare_model_pipeline_{model_type}"):
            # Load training data
            training_df = training_set.load_df().toPandas()

            print(f"Training data shape: {training_df.shape}")
            print(f"Training data columns: {training_df.columns.tolist()}")

            preprocessor, categorical_features, numerical_features = self.create_preprocessing_pipeline()
            feature_columns = numerical_features + categorical_features

            # Prepare feature matrix and target
            X = training_df[feature_columns]
            y = training_df[self.LABEL_COLUMN]

            print(f"Feature columns: {feature_columns}")
            print(f"X shape: {X.shape}, y shape: {y.shape}")

            # Split the *raw* frame first so the encoder and scaler are fitted
            # on training rows only and nothing from the test split leaks in.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            model_pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('model', self._build_estimator(model_type)),
            ])
            model_pipeline.fit(X_train, y_train)

            # Store the pipeline
            self.model_pipeline = model_pipeline

            # Hold-out evaluation
            y_test_pred = model_pipeline.predict(X_test)
            r2 = r2_score(y_test, y_test_pred)
            mae = mean_absolute_error(y_test, y_test_pred)
            # sqrt(MSE) works on every scikit-learn version; the `squared`
            # keyword of mean_squared_error is deprecated.
            rmse = float(np.sqrt(mean_squared_error(y_test, y_test_pred)))

            # Cross-validate the whole pipeline: cross_val_score clones it and
            # re-fits the preprocessing inside every fold.
            cv_scores = cross_val_score(model_pipeline, X, y, cv=5, scoring='r2')

            # Healthcare-specific metrics
            high_cost_threshold = training_df[self.LABEL_COLUMN].quantile(0.95)
            high_cost_accuracy = self._evaluate_high_cost_predictions(
                y_test, y_test_pred, high_cost_threshold
            )

            # Log parameters and metrics
            mlflow.log_params({
                "model_type": model_type,
                "n_features": len(feature_columns),
                "training_samples": len(X_train),
                "test_samples": len(X_test),
                "preprocessing": "custom_pipeline_with_encoding"
            })

            mlflow.log_metrics({
                "r2_score": float(r2),
                "mean_absolute_error": float(mae),
                "root_mean_squared_error": rmse,
                "cv_r2_mean": float(cv_scores.mean()),
                "cv_r2_std": float(cv_scores.std()),
                "high_cost_accuracy": high_cost_accuracy
            })

            # Log the complete pipeline with feature engineering integration
            model_info = self.fe.log_model(
                model=model_pipeline,  # Log the complete pipeline
                artifact_path="model",
                flavor=mlflow.sklearn,
                training_set=training_set,
                registered_model_name=self.REGISTERED_MODEL_NAME,
                metadata={
                    "algorithm": model_type,
                    "preprocessing": "embedded_pipeline",
                    "healthcare_compliance": "HIPAA_ready",
                    "model_purpose": "insurance_cost_prediction",
                    "feature_count": len(feature_columns),
                    "training_data_size": len(training_df),
                    "categorical_features": categorical_features,
                    "numerical_features": numerical_features
                }
            )

            return model_info

    def _evaluate_high_cost_predictions(self, y_true, y_pred, threshold):
        """Evaluate model performance on high-cost patients.

        Both inputs are turned into "at or above ``threshold``" flags and the
        fraction of positions where the two flags agree is returned.

        Args:
            y_true: Observed charges (any array-like).
            y_pred: Predicted charges, same length as ``y_true``.
            threshold: Charge at or above which a patient counts as high-cost.

        Returns:
            Agreement rate as a ``float`` in ``[0, 1]``.
        """
        # Compare positionally as plain arrays so a pandas index on either
        # input cannot affect the comparison.
        high_cost_true = np.asarray(y_true) >= threshold
        high_cost_pred = np.asarray(y_pred) >= threshold
        return float(np.mean(high_cost_true == high_cost_pred))


# Example usage: build the training set, train a random forest and report
# what was logged.
print("Starting improved model training with embedded preprocessing...")

# Initialize the improved trainer
trainer_v2 = HealthcareInsuranceModelV2()

# Prepare training data
print("Preparing training data with feature engineering...")
training_set = trainer_v2.prepare_training_data()

# Train the model with embedded preprocessing
print("Training Random Forest model with preprocessing pipeline...")
rf_model_info = trainer_v2.train_model(training_set, "random_forest")

print("Model training completed successfully!")

# NOTE(review): the type returned by fe.log_model is not visible in this file.
# Read the attributes defensively so a missing attribute (or a None return)
# does not raise after the model has already been trained and logged — confirm
# which attribute carries the registered version in the installed client.
model_version = getattr(rf_model_info, "model_version", None)
if model_version is None:
    model_version = getattr(rf_model_info, "registered_model_version", None)
print(f"Model version: {model_version}")
print(f"Model URI: {getattr(rf_model_info, 'model_uri', None)}")