In [38]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

import pickle

In [2]:
# Read the insurance dataset from the working directory and preview the first rows.
csv_path = "insurance.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [3]:
# Separate the target column from the predictor columns.
y = df['expenses']
x = df.drop(columns='expenses')

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split
x_train.shape

(1070, 6)

In [5]:
# Dimensions (rows, columns) of the test features produced by the split.
x_test.shape

(268, 6)

In [6]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
560,46,female,20.0,2,no,northwest
1285,47,female,24.3,0,no,northeast
1142,52,female,24.9,0,no,southeast
969,39,female,34.3,5,no,southeast
486,54,female,21.5,3,no,northwest


In [7]:
# Preview the first values of the training target (`expenses`).
y_train.head()

560      9193.84
1285     8534.67
1142    27117.99
969      8596.83
486     12475.35
Name: expenses, dtype: float64

In [8]:
# Column groups: numeric columns are standardised, categorical columns are one-hot encoded.
num_features = ['age', 'bmi', 'children']
cat_features = ['sex', 'smoker', 'region']

# drop='first' omits one dummy column per categorical feature;
# sparse_output=False makes the encoder return a dense array.
numeric_step = ('num', StandardScaler(), num_features)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', sparse_output=False),
    cat_features,
)

# Shared preprocessor reused by every model pipeline in this notebook.
trf = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [9]:
# Baseline model: the shared preprocessor followed by a linear regression.
baseline_steps = [
    ("preprocessor", trf),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=baseline_steps)

In [10]:
# Fit the preprocessor and the linear model on the training split.
pipeline.fit(x_train,y_train)

In [11]:
# Linear-regression predictions for the test features.
y_pred = pipeline.predict(x_test)

In [12]:
# Score the linear-regression predictions against the test targets.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the units of the target

for label, value in (("MSE:", mse), ("RMSE:", rmse), ("R² Score:", r2)):
    print(label, value)

MSE: 33600065.35507785
RMSE: 5796.556335884078
R² Score: 0.7835726930039904


In [36]:
# Decision-tree model built on the shared preprocessor `trf`.
decision_tree = Pipeline(steps=[
    ('preprocessor', trf),
    ('regressor', DecisionTreeRegressor(max_depth=4, min_samples_split=4, random_state=42))
])

# Train the model
decision_tree.fit(x_train, y_train)

# Predict on test set. A model-specific name is used (as in the other model
# cells) so the linear-regression `y_pred` is not overwritten.
y_pred_dt = decision_tree.predict(x_test)

# Evaluate the model
mse_dt = mean_squared_error(y_test, y_pred_dt)
# RMSE must come from this model's own MSE. The previous `np.sqrt(mse)` took the
# root of the linear-regression MSE, so on a fresh kernel it reported the wrong value.
rmse_dt = np.sqrt(mse_dt)
r2_dt = r2_score(y_test, y_pred_dt)

print("MSE:", mse_dt)
print("RMSE:", rmse_dt)
print("R² Score:", r2_dt)

MSE: 21076774.271162838
RMSE: 4590.94481247192
R² Score: 0.8642386719351669


In [18]:
# Random-forest model built on the shared preprocessor `trf`.
rf_regressor = RandomForestRegressor(
    n_estimators=100,      # number of trees in the forest
    max_depth=6,           # cap tree depth to limit overfitting
    min_samples_split=4,   # a node needs at least 4 samples to be split
    random_state=42,       # fixed seed for reproducible results
)
random_forest = Pipeline(steps=[
    ('preprocessor', trf),
    ('regressor', rf_regressor),
])

# Fit on the training split, then predict the test split.
random_forest.fit(x_train, y_train)
y_pred_rf = random_forest.predict(x_test)

# Test-set metrics for the random forest.
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

for label, value in (("MSE:", mse_rf), ("RMSE:", rmse_rf), ("R² Score:", r2_rf)):
    print(label, value)

MSE: 19931815.522976078
RMSE: 4464.506190271896
R² Score: 0.871613667664279


In [30]:
# Gradient-boosting model built on the shared preprocessor `trf`.
gb_regressor = GradientBoostingRegressor(
    n_estimators=45,       # number of boosting stages
    learning_rate=0.1,     # shrinkage applied to each stage's contribution
    max_depth=4,           # depth of each individual tree
    min_samples_split=4,   # a node needs at least 4 samples to be split
    random_state=42,       # fixed seed for reproducible results
)
gradient_boosting = Pipeline(steps=[
    ('preprocessor', trf),
    ('regressor', gb_regressor),
])

# Fit on the training split, then predict the test split.
gradient_boosting.fit(x_train, y_train)
y_pred_gb = gradient_boosting.predict(x_test)

# Test-set metrics for gradient boosting.
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)

gb_report = (
    ("Gradient Boosting MSE:", mse_gb),
    ("Gradient Boosting RMSE:", rmse_gb),
    ("Gradient Boosting R² Score:", r2_gb),
)
for label, value in gb_report:
    print(label, value)

Gradient Boosting MSE: 19584640.78994954
Gradient Boosting RMSE: 4425.453738313116
Gradient Boosting R² Score: 0.8738499160683206


In [32]:
!pip install xgboost



Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
    --------------------------------------- 0.8/56.8 MB 4.2 MB/s eta 0:00:14
   - -------------------------------------- 1.6/56.8 MB 3.8 MB/s eta 0:00:15
   - -------------------------------------- 2.4/56.8 MB 3.9 MB/s eta 0:00:14
   -- ------------------------------------- 3.4/56.8 MB 4.1 MB/s eta 0:00:14
   --- ------------------------------------ 4.5/56.8 MB 4.2 MB/s eta 0:00:13
   --- ------------------------------------ 5.2/56.8 MB 4.1 MB/s eta 0:00:13
   ---- ----------------------------------- 6.0/56.8 MB 4.1 MB/s eta 0:00:13
   ---- ----------------------------------- 6.8/56.8 MB 4.1 MB/s eta 0:00:13
   ----- ---------------------------------- 8.1/56.8 MB 4.3 MB/s eta 0:00:12
   ------ ----------

In [35]:
# XGBoost model built on the shared preprocessor `trf`.
xgb_regressor = XGBRegressor(
    n_estimators=100,      # number of boosting rounds
    max_depth=4,           # depth of each individual tree
    learning_rate=0.1,     # step-size shrinkage
    min_child_weight=4,    # minimum sum of instance weight (hessian) required in a child
    random_state=42,       # fixed seed for reproducible results
)
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', trf),
    ('regressor', xgb_regressor),
])

# Fit on the training split, then predict the test split.
xgboost_pipeline.fit(x_train, y_train)
y_pred_xgb = xgboost_pipeline.predict(x_test)

# Test-set metrics for XGBoost.
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)

xgb_report = (
    ("XGBoost MSE:", mse_xgb),
    ("XGBoost RMSE:", rmse_xgb),
    ("XGBoost R² Score:", r2_xgb),
)
for label, value in xgb_report:
    print(label, value)

XGBoost MSE: 19757283.073033452
XGBoost RMSE: 4444.916542864832
XGBoost R² Score: 0.8727378794098608


In [37]:
# Gather the test-set scores of every model and rank them by RMSE (lowest first).
model_names = ['Linear Regression', 'Decision Tree', 'Random Forest', 'Gradient Boosting', 'XGBoost']
rmse_values = [rmse, rmse_dt, rmse_rf, rmse_gb, rmse_xgb]
r2_values = [r2, r2_dt, r2_rf, r2_gb, r2_xgb]

models = pd.DataFrame({
    'Model': model_names,
    'RMSE': rmse_values,
    'r2_score': r2_values,
})
models.sort_values(by='RMSE')

Unnamed: 0,Model,RMSE,r2_score
3,Gradient Boosting,4425.453738,0.87385
4,XGBoost,4444.916543,0.872738
2,Random Forest,4464.50619,0.871614
1,Decision Tree,4590.944812,0.864239
0,Linear Regression,4590.944812,0.864239


In [39]:
# Persist the fitted gradient-boosting pipeline (preprocessor + regressor) to disk.
# The context manager guarantees the file handle is flushed and closed; the
# previous bare open(...) call left it open.
with open('GB_regressor.pkl', 'wb') as model_file:
    pickle.dump(gradient_boosting, model_file)