In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Path to the dataset. Set the INSURANCE_CSV environment variable to point at
# the file on another machine; the fallback is the original local location.
INSURANCE_CSV = os.environ.get("INSURANCE_CSV", "C:/Users/User/Downloads/insurance.csv")
insurance = pd.read_csv(INSURANCE_CSV)

In [3]:
# Preview the first five rows of the freshly loaded frame.
insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# One-hot encode the categorical columns and replace them with indicator columns.
categorical_cols = ['sex', 'smoker', 'region']

encoder = OneHotEncoder()

# The encoder's result is densified with .toarray() so it can back a DataFrame.
insurance_encoded = encoder.fit_transform(insurance[categorical_cols]).toarray()

# Reuse the source index so the column-wise concat below lines up row-for-row
# even when `insurance` does not carry a default RangeIndex.
insurance_encoded_df = pd.DataFrame(
    insurance_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=insurance.index,
)

# Build a new frame rather than dropping in place, so the object loaded and
# displayed by the earlier cells is not mutated behind the reader's back.
insurance = pd.concat(
    [insurance.drop(columns=categorical_cols), insurance_encoded_df],
    axis=1,
)

In [5]:
# Preview the first five rows after the categorical columns were encoded.
insurance.head(n=5)

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,18,33.77,1,1725.5523,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,28,33.0,3,4449.462,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,33,22.705,0,21984.47061,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,32,28.88,0,3866.8552,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# Separate the regression target from the feature columns.
target_column = 'charges'
y = insurance[target_column]
X = insurance.drop(columns=[target_column])

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Base learners for the stack; both use the same tree count and seed.
base_params = {'n_estimators': 100, 'random_state': 42}
rf = RandomForestRegressor(**base_params)
gb = GradientBoostingRegressor(**base_params)

In [9]:
# Stack the two tree ensembles under a linear meta-model, using 5-fold CV.
base_learners = [('rf', rf), ('gb', gb)]
meta_learner = LinearRegression()
stacking = StackingRegressor(estimators=base_learners, final_estimator=meta_learner, cv=5)

In [10]:
# Fit the stacked model on the training split (the fitted estimator is displayed).
stacking.fit(X=X_train, y=y_train)

StackingRegressor(cv=5,
                  estimators=[('rf', RandomForestRegressor(random_state=42)),
                              ('gb',
                               GradientBoostingRegressor(random_state=42))],
                  final_estimator=LinearRegression())

In [11]:
# Predict charges for the held-out rows.
y_pred = stacking.predict(X=X_test)

In [12]:
# Root-mean-squared error on the held-out rows.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# explicitly; this works on every scikit-learn version.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
print("RMSE: ", rmse)

RMSE:  4327.64877152996


In [13]:
# Compute the remaining evaluation metrics first, then report them.
r2_value = r2_score(y_test, y_pred)
mae_value = mean_absolute_error(y_test, y_pred)
mse_value = mean_squared_error(y_test, y_pred)

print(f"R2 score: {r2_value}")
print(f"MAE score: {mae_value}")
print(f"MSE score: {mse_value}")

R2 score: 0.879364271065125
MAE score: 2372.844222965451
MSE score: 18728543.88972477
