In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pickle

# Load data
df = pd.read_csv('/content/medical_insurance.csv')
X = df.drop('charges', axis=1)
y = df['charges'].values.reshape(-1, 1)  # reshape for scaler

# Categorical and numerical columns
categorical = ['sex', 'smoker', 'region']
numerical = ['age', 'bmi', 'children']

# Scaling for features
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), categorical),
    ('num', StandardScaler(), numerical)
])

# Scale target variable
target_scaler = StandardScaler()
y_scaled = target_scaler.fit_transform(y).ravel()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_scaled, test_size=0.2, random_state=42
)

# Define models & hyperparameters
models = {
    "RandomForest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            'regressor__n_estimators': [100, 200],
            'regressor__max_depth': [None, 10, 20]
        }
    },
    "GradientBoosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": {
            'regressor__n_estimators': [100, 200],
            'regressor__learning_rate': [0.05, 0.1]
        }
    },
    "XGBoost": {
        "model": XGBRegressor(random_state=42, verbosity=0),
        "params": {
            'regressor__n_estimators': [100, 200],
            'regressor__learning_rate': [0.05, 0.1]
        }
    }
}

best_score = -np.inf
best_model = None
best_name = ""

# Train and evaluate each model
for name, config in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', config["model"])
    ])

    grid = GridSearchCV(pipeline, config["params"], cv=5, scoring='r2', n_jobs=-1)
    grid.fit(X_train, y_train)

    # Predict on the test set (scaled values)
    preds_scaled = grid.predict(X_test)

    # Calculate RMSE and MAE on scaled values (using scaled predictions and y_test)
    rmse_scaled = np.sqrt(mean_squared_error(y_test, preds_scaled))
    mae_scaled = mean_absolute_error(y_test, preds_scaled)

    # Calculate R² (for model evaluation)
    r2 = r2_score(y_test, preds_scaled)

    print(f"\n📊 Model: {name}")
    print(f"Best R²: {r2:.4f}")
    print(f"Scaled MAE: {mae_scaled:.2f}")
    print(f"Scaled RMSE: {rmse_scaled:.2f}")
    print(f"Best Params: {grid.best_params_}")

    if r2 > best_score:
        best_score = r2
        best_model = grid.best_estimator_
        best_name = name

# Save best model and target scaler
pickle.dump(best_model, open("insurance_model.pkl", "wb"))
pickle.dump(target_scaler, open("target_scaler.pkl", "wb"))

print(f"\n✅ Best Model: {best_name} with R² = {best_score:.4f}")



📊 Model: RandomForest
Best R²: 0.9504
Scaled MAE: 0.11
Scaled RMSE: 0.23
Best Params: {'regressor__max_depth': None, 'regressor__n_estimators': 200}

📊 Model: GradientBoosting
Best R²: 0.8898
Scaled MAE: 0.18
Scaled RMSE: 0.34
Best Params: {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 200}

📊 Model: XGBoost
Best R²: 0.9471
Scaled MAE: 0.12
Scaled RMSE: 0.23
Best Params: {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 200}

✅ Best Model: RandomForest with R² = 0.9504
