In [1]:
import itertools
from math import sqrt

import mlflow
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Load the cleaned insurance dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../data/cleaned_insurance.csv")
# Show the first rows as the cell output for a quick sanity check.
df.head()


Unnamed: 0,age,sex,region,urban_rural,income,education,marital_status,employment_status,household_size,dependents,...,policy_term_years,provider_quality,risk_score,chronic_count,hypertension,diabetes,asthma,cardiovascular_disease,mental_health,monthly_premium
0,79,Female,North,Urban,12800.0,No HS,Married,Employed,3,1,...,1,3.1,1.0,2,0,0,0,0,1,37.09
1,53,Male,Central,Suburban,89600.0,Doctorate,Married,Self-employed,2,0,...,7,3.9,0.8681,2,1,0,0,0,0,41.74
2,63,Female,North,Rural,305000.0,HS,Single,Employed,3,2,...,5,4.66,0.6923,1,1,0,0,0,0,44.06
3,36,Male,West,Rural,38900.0,Masters,Single,Employed,1,0,...,3,4.3,0.1978,0,0,0,0,0,0,34.55
4,21,Female,South,Suburban,83700.0,HS,Single,Employed,3,2,...,6,4.65,0.3187,1,0,1,0,0,0,37.1


In [3]:
# Name of the column the models are trained to predict.
target = "monthly_premium"


In [4]:
# Separate the regression target from the predictor columns.
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Partition the feature columns by dtype so each group can receive its own
# preprocessing: numeric dtypes on one side, object/category/bool on the other.
num_cols = X.select_dtypes(include=[np.number]).columns.to_list()
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.to_list()

In [6]:
# `Pipeline` is already imported in the notebook's import cell, so it is not
# re-imported here.

# Numeric columns: fill missing values with the column median, then standardise.
numeric = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. `handle_unknown="ignore"` keeps transform from raising
# on categories that were not seen during fit.
categorical = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric, num_cols),
        ("cat", categorical, cat_cols),
    ]
)


In [7]:
# Baseline: preprocessing followed by ordinary least squares.
# The preprocessor is cloned so this pipeline owns its own copy. `Pipeline.fit`
# fits its steps in place, so sharing one instance would let every later
# pipeline that reuses `preprocessor` silently re-fit the object held here.
model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", LinearRegression()),
])

model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [8]:
# Score the baseline pipeline on the held-out validation rows.
preds = model.predict(X_val)

# RMSE is the square root of the mean squared error.
rmse = sqrt(mean_squared_error(y_val, preds))
mae = mean_absolute_error(y_val, preds)
r2 = r2_score(y_val, preds)

# One metric per output line.
print(
    f"RMSE: {rmse:.2f}",
    f"MAE: {mae:.2f}",
    f"R2: {r2:.3f}",
    sep="\n",
)


RMSE: 30.64
MAE: 18.47
R2: 0.187


In [9]:
# Point MLflow at the relative "mlruns" location and select the experiment
# that all runs in this notebook are grouped under.
mlflow.set_tracking_uri("mlruns")
mlflow.set_experiment("insurance-cost-prediction")

with mlflow.start_run(run_name="baseline_linreg"):
    mlflow.log_param("model", "LinearRegression")
    # Record the three validation metrics computed for the baseline.
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    # Store the fitted pipeline as the run's model artifact.
    mlflow.sklearn.log_model(model, name="model")

print("Run logged to MLflow.")




Run logged to MLflow.


In [11]:
# Random forest with scikit-learn's default tree settings, spelled out
# explicitly so they can be logged under the same parameter keys as the
# grid-search runs.
rf_params = {"n_estimators": 300, "max_depth": None, "min_samples_leaf": 1}

# Clone the preprocessor so fitting this pipeline does not re-fit the instance
# held by other pipelines (Pipeline.fit fits its steps in place).
rf_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", RandomForestRegressor(random_state=42, n_jobs=-1, **rf_params)),
])

rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_val)

# NOTE: these names overwrite the baseline's rmse/mae/r2 from the earlier cell.
rmse = sqrt(mean_squared_error(y_val, rf_preds))
mae = mean_absolute_error(y_val, rf_preds)
r2 = r2_score(y_val, rf_preds)

with mlflow.start_run(run_name="random_forest"):
    mlflow.log_param("model", "RandomForestRegressor")
    mlflow.log_param("random_state", 42)
    mlflow.log_params(rf_params)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(rf_model, name="model")

print(f"RF RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.3f}")




RF RMSE: 31.38, MAE: 19.46, R2: 0.147


In [17]:
def run_experiment(
    X_train, X_val, y_train, y_val,
    preprocessor,
    n_estimators, max_depth, min_samples_leaf, random_state=42
):
    """Train, evaluate and log one random-forest configuration.

    Builds a pipeline from a fresh copy of ``preprocessor`` and a
    ``RandomForestRegressor``, fits it on the training split, scores it on
    the validation split, and records parameters, metrics and the fitted
    pipeline in a new MLflow run.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the pipeline.
    X_val, y_val
        Features and target used to compute the metrics.
    preprocessor
        Estimator used as the first pipeline step. It is cloned before
        fitting, so the object passed in is left unfitted and unmodified.
    n_estimators, max_depth, min_samples_leaf
        Forwarded unchanged to ``RandomForestRegressor``.
    random_state : int, default 42
        Seed forwarded to ``RandomForestRegressor``.

    Returns
    -------
    dict
        ``{"rmse": ..., "mae": ..., "r2": ...}`` computed on the validation
        split, so callers can collect and compare results.
    """
    params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_leaf": min_samples_leaf,
        "random_state": random_state,
    }

    model = Pipeline(steps=[
        # Pipeline.fit fits its steps in place; cloning keeps repeated calls
        # from re-fitting one shared preprocessor instance.
        ("preprocessor", clone(preprocessor)),
        ("model", RandomForestRegressor(n_jobs=-1, **params)),
    ])

    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    metrics = {
        "rmse": sqrt(mean_squared_error(y_val, preds)),
        "mae": mean_absolute_error(y_val, preds),
        "r2": r2_score(y_val, preds),
    }

    with mlflow.start_run(run_name=f"RF_ne{n_estimators}_md{max_depth}_ml{min_samples_leaf}"):
        mlflow.log_params({"model": "RandomForestRegressor", **params})
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(model, name="model")

    rmse, mae, r2 = metrics["rmse"], metrics["mae"], metrics["r2"]
    print(f"→ n_estimators={n_estimators}, max_depth={max_depth}, "
          f"min_samples_leaf={min_samples_leaf} | RMSE={rmse:.2f}, MAE={mae:.2f}, R2={r2:.3f}")

    return metrics


In [14]:
# Hyperparameter values to sweep for the random forest.
n_estimators_list = [100, 200, 300]
max_depth_list = [8, 12, None]
min_samples_leaf_list = [1, 3, 5]

# Materialise the Cartesian product as a list. `itertools.product` returns a
# one-shot iterator, so a partially consumed or re-run loop over it would
# silently skip combinations unless this cell is re-executed first.
param_grid = list(
    itertools.product(n_estimators_list, max_depth_list, min_samples_leaf_list)
)


In [18]:
# Train, evaluate and log one run per hyperparameter combination.
for n_estimators, max_depth, min_samples_leaf in param_grid:
    run_experiment(
        X_train=X_train,
        X_val=X_val,
        y_train=y_train,
        y_val=y_val,
        preprocessor=preprocessor,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
    )




→ n_estimators=100, max_depth=8, min_samples_leaf=3 | RMSE=30.79, MAE=18.48, R2=0.179




→ n_estimators=100, max_depth=8, min_samples_leaf=5 | RMSE=30.76, MAE=18.46, R2=0.180




→ n_estimators=100, max_depth=12, min_samples_leaf=1 | RMSE=31.02, MAE=18.66, R2=0.167




→ n_estimators=100, max_depth=12, min_samples_leaf=3 | RMSE=30.86, MAE=18.56, R2=0.175




→ n_estimators=100, max_depth=12, min_samples_leaf=5 | RMSE=30.79, MAE=18.52, R2=0.178




→ n_estimators=100, max_depth=None, min_samples_leaf=1 | RMSE=31.44, MAE=19.53, R2=0.143




→ n_estimators=100, max_depth=None, min_samples_leaf=3 | RMSE=31.04, MAE=18.96, R2=0.165




→ n_estimators=100, max_depth=None, min_samples_leaf=5 | RMSE=30.90, MAE=18.77, R2=0.173




→ n_estimators=200, max_depth=8, min_samples_leaf=1 | RMSE=30.86, MAE=18.50, R2=0.175




→ n_estimators=200, max_depth=8, min_samples_leaf=3 | RMSE=30.75, MAE=18.47, R2=0.181




→ n_estimators=200, max_depth=8, min_samples_leaf=5 | RMSE=30.74, MAE=18.45, R2=0.181




→ n_estimators=200, max_depth=12, min_samples_leaf=1 | RMSE=30.97, MAE=18.65, R2=0.169




→ n_estimators=200, max_depth=12, min_samples_leaf=3 | RMSE=30.82, MAE=18.55, R2=0.177




→ n_estimators=200, max_depth=12, min_samples_leaf=5 | RMSE=30.77, MAE=18.51, R2=0.180




→ n_estimators=200, max_depth=None, min_samples_leaf=1 | RMSE=31.39, MAE=19.49, R2=0.147




→ n_estimators=200, max_depth=None, min_samples_leaf=3 | RMSE=30.99, MAE=18.93, R2=0.168




→ n_estimators=200, max_depth=None, min_samples_leaf=5 | RMSE=30.87, MAE=18.75, R2=0.174




→ n_estimators=300, max_depth=8, min_samples_leaf=1 | RMSE=30.86, MAE=18.50, R2=0.175




→ n_estimators=300, max_depth=8, min_samples_leaf=3 | RMSE=30.76, MAE=18.46, R2=0.180




→ n_estimators=300, max_depth=8, min_samples_leaf=5 | RMSE=30.74, MAE=18.45, R2=0.181




→ n_estimators=300, max_depth=12, min_samples_leaf=1 | RMSE=30.98, MAE=18.65, R2=0.169




→ n_estimators=300, max_depth=12, min_samples_leaf=3 | RMSE=30.82, MAE=18.54, R2=0.177




→ n_estimators=300, max_depth=12, min_samples_leaf=5 | RMSE=30.78, MAE=18.51, R2=0.179




→ n_estimators=300, max_depth=None, min_samples_leaf=1 | RMSE=31.38, MAE=19.46, R2=0.147




→ n_estimators=300, max_depth=None, min_samples_leaf=3 | RMSE=30.98, MAE=18.91, R2=0.168




→ n_estimators=300, max_depth=None, min_samples_leaf=5 | RMSE=30.88, MAE=18.74, R2=0.174
