In [13]:
# --- Setup & Load Data ---
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

df = pd.read_csv('Dataset2_insurance.csv')
df.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.4706
4,32,male,28.88,0,no,northwest,3866.8552


In [14]:
# --- Minimal EDA ---
print("Nulls:\n", df.isnull().sum())
print("\nTarget summary (charges):")
print(df['charges'].agg(['mean','median','std','min','max']).to_frame().T)

print("\nSmoker vs Non-smoker mean charges:")
print(df.groupby('smoker')['charges'].mean().round(2))


Nulls:
 age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

Target summary (charges):
               mean     median         std        min         max
charges 13,270.4223 9,382.0330 12,110.0112 1,121.8739 63,770.4280

Smoker vs Non-smoker mean charges:
smoker
no     8,434.2700
yes   32,050.2300
Name: charges, dtype: float64


In [15]:
# --- Encoding & Feature Engineering ---
le_sex, le_smoker, le_region = LabelEncoder(), LabelEncoder(), LabelEncoder()
df['sex_encoded'] = le_sex.fit_transform(df['sex'])
df['smoker_encoded'] = le_smoker.fit_transform(df['smoker'])
df['region_encoded'] = le_region.fit_transform(df['region'])

# Polynomial and interaction terms for linear models
df['age_squared']   = df['age']**2
df['bmi_squared']   = df['bmi']**2
df['smoker_bmi']    = df['smoker_encoded'] * df['bmi']
df['age_bmi']       = df['age'] * df['bmi']
df['age_smoker']    = df['age'] * df['smoker_encoded']

feature_cols = [
    'age','sex_encoded','bmi','children','smoker_encoded','region_encoded',
    'age_squared','bmi_squared','smoker_bmi','age_bmi','age_smoker'
]
X = df[feature_cols].copy()
y = df['charges'].copy()
X.shape, y.shape


((1338, 11), (1338,))

In [16]:
# --- 60/20/20 split + scaling ---
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_val,  y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)

print({'train':len(X_train), 'val':len(X_val), 'test':len(X_test)})


{'train': 802, 'val': 268, 'test': 268}


In [17]:
# --- Evaluation Helper ---
def evaluate(model, Xtr, ytr, Xv, yv, Xte, yte, name):
    model.fit(Xtr, ytr)
    pred_tr = model.predict(Xtr)
    pred_v  = model.predict(Xv)
    pred_te = model.predict(Xte)
    return {
        'Model': name,
        'Train_RMSE': np.sqrt(mean_squared_error(ytr, pred_tr)),
        'Train_MAE':  mean_absolute_error(ytr, pred_tr),
        'Train_R2':   r2_score(ytr, pred_tr),
        'Val_RMSE':   np.sqrt(mean_squared_error(yv, pred_v)),
        'Val_MAE':    mean_absolute_error(yv, pred_v),
        'Val_R2':     r2_score(yv, pred_v),
        'Test_RMSE':  np.sqrt(mean_squared_error(yte, pred_te)),
        'Test_MAE':   mean_absolute_error(yte, pred_te),
        'Test_R2':    r2_score(yte, pred_te)
    }, model


In [18]:
# --- Baseline Linear Regression ---
results = []

lr = LinearRegression()
lr_res, lr_model = evaluate(lr, X_train_s, y_train, X_val_s, y_val, X_test_s, y_test, 'Linear Regression')
results.append(lr_res)

pd.DataFrame(results)


Unnamed: 0,Model,Train_RMSE,Train_MAE,Train_R2,Val_RMSE,Val_MAE,Val_R2,Test_RMSE,Test_MAE,Test_R2
0,Linear Regression,4561.484,2753.5316,0.8469,5599.3656,3183.0943,0.814,4491.3273,2648.4456,0.8701


In [19]:
# --- Ridge Regression (L2) ---
ridge_grid = GridSearchCV(
    Ridge(random_state=42),
    param_grid={'alpha':[0.001,0.01,0.1,1,10,100,1000]},
    cv=5, scoring='neg_mean_squared_error', n_jobs=-1
).fit(X_train_s, y_train)

ridge_res, ridge_model = evaluate(
    ridge_grid.best_estimator_, X_train_s, y_train, X_val_s, y_val, X_test_s, y_test,
    f'Ridge (alpha={ridge_grid.best_params_["alpha"]})'
)
results.append(ridge_res)

pd.DataFrame(results)


Unnamed: 0,Model,Train_RMSE,Train_MAE,Train_R2,Val_RMSE,Val_MAE,Val_R2,Test_RMSE,Test_MAE,Test_R2
0,Linear Regression,4561.484,2753.5316,0.8469,5599.3656,3183.0943,0.814,4491.3273,2648.4456,0.8701
1,Ridge (alpha=0.1),4561.5172,2753.4047,0.8469,5600.6002,3184.7141,0.8139,4490.8579,2648.7279,0.8701


In [20]:
# --- Lasso Regression (L1) ---
lasso_grid = GridSearchCV(
    Lasso(random_state=42, max_iter=10000),
    param_grid={'alpha':[0.001,0.01,0.1,1,10,100]},
    cv=5, scoring='neg_mean_squared_error', n_jobs=-1
).fit(X_train_s, y_train)

lasso_res, lasso_model = evaluate(
    lasso_grid.best_estimator_, X_train_s, y_train, X_val_s, y_val, X_test_s, y_test,
    f'Lasso (alpha={lasso_grid.best_params_["alpha"]})'
)
results.append(lasso_res)

pd.DataFrame(results)


Unnamed: 0,Model,Train_RMSE,Train_MAE,Train_R2,Val_RMSE,Val_MAE,Val_R2,Test_RMSE,Test_MAE,Test_R2
0,Linear Regression,4561.484,2753.5316,0.8469,5599.3656,3183.0943,0.814,4491.3273,2648.4456,0.8701
1,Ridge (alpha=0.1),4561.5172,2753.4047,0.8469,5600.6002,3184.7141,0.8139,4490.8579,2648.7279,0.8701
2,Lasso (alpha=1),4561.5498,2752.2297,0.8469,5599.5425,3183.9493,0.8139,4491.3908,2646.7771,0.8701


In [21]:
# --- Comparison Table (sort by Val RMSE) ---
results_df = pd.DataFrame(results).sort_values('Val_RMSE')
results_df[['Model','Train_RMSE','Val_RMSE','Test_RMSE','Val_R2','Test_R2']]


Unnamed: 0,Model,Train_RMSE,Val_RMSE,Test_RMSE,Val_R2,Test_R2
0,Linear Regression,4561.484,5599.3656,4491.3273,0.814,0.8701
2,Lasso (alpha=1),4561.5498,5599.5425,4491.3908,0.8139,0.8701
1,Ridge (alpha=0.1),4561.5172,5600.6002,4490.8579,0.8139,0.8701


In [22]:
# --- Coefficient Interpretation (Lasso) ---
coef_series = pd.Series(lasso_model.coef_, index=feature_cols).sort_values(key=np.abs, ascending=False)
coef_series.head(12)


smoker_bmi       17,748.6867
smoker_encoded   -7,803.1060
age_squared       4,616.4725
bmi_squared      -3,235.3815
bmi               2,878.3208
age              -1,691.8402
age_bmi             980.9867
children            706.3270
region_encoded     -388.5598
sex_encoded        -255.6298
age_smoker          -63.1697
dtype: float64

In [23]:
# --- Business talking points from coefficients ---
smoker_impact = abs(coef_series.get('smoker_encoded', 0.0))
bmi_impact    = abs(coef_series.get('bmi', 0.0))
age_impact    = abs(coef_series.get('age', 0.0))

print("Pricing implications:")
print(f"- Smoker effect ≈ ${smoker_impact:,.0f} → smoker surcharge tiers are justified.")
print(f"- BMI per unit ≈ ${bmi_impact:,.0f} → BMI-based premium ladder (25/30/35).")
print(f"- Age per year ≈ ${age_impact:,.0f} → age-based brackets (18–30/31–40/41–50/51+).")


Pricing implications:
- Smoker effect ≈ $7,803 → smoker surcharge tiers are justified.
- BMI per unit ≈ $2,878 → BMI-based premium ladder (25/30/35).
- Age per year ≈ $1,692 → age-based brackets (18–30/31–40/41–50/51+).
