In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from statsmodels import api as sm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Kaggle Dataset Link: https://www.kaggle.com/datasets/mirichoi0218/insurance/data

# Data Analysis and Preprocessing

In [None]:
# Load the insurance dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv("insurance.csv")

In [None]:
# `shape` is an attribute holding a (rows, columns) tuple, not a method;
# calling it raises "TypeError: 'tuple' object is not callable".
data.shape

In [None]:
# Preview the first rows of the freshly loaded data.
data.head()

In [None]:
# Print the column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics of the dataset's columns.
data.describe()

In [None]:
# Count the null entries in every column and show the per-column totals.
print("Missing values in the dataset:")
missing_counts = data.isnull().sum()
missing_counts

### Separating Categorical and Numerical Columns

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
object_columns = data.select_dtypes(include=['object']).columns
number_columns = data.select_dtypes(include=['int64', 'float64']).columns

categoricals = list(object_columns)
numericals = list(number_columns)
categoricals, numericals

### Checking for Outliers

In [None]:
# `shape` is an attribute, not a method; calling it raises TypeError and would
# abort this cell before the helper below is defined.
data.shape
def remove_outliers_iqr(df, columns, factor=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    Columns are processed in order, and each column's quartiles are computed
    on the rows that survived the previous columns' filters.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Names of the numeric columns to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR to place the fences
        (``Q1 - factor * IQR`` and ``Q3 + factor * IQR``).

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` lying within the fences (inclusive) of every column.
        Rows with a missing value in a filtered column are dropped, because
        comparisons against NaN are False.
    """
    for col in columns:
        q1, q3 = df[col].quantile([0.25, 0.75])  # 25th and 75th percentiles
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        # `between` is inclusive on both ends, matching `>= lower` and `<= upper`.
        df = df[df[col].between(lower_bound, upper_bound)]
    return df

### Removing Outliers

In [None]:
# Filter out rows that are IQR outliers in any of the listed numeric features.
data = remove_outliers_iqr(data, ['age', 'children', 'bmi'])
# `shape` is an attribute, not a method; `data.shape()` raises TypeError.
data.shape

In [None]:
# Report how many distinct values each categorical column holds.
for column_name in categoricals:
    distinct_values = data[column_name].unique()
    print(column_name, len(distinct_values))

In [None]:
threshold = 25


def _collapse_rare_levels(column):
    """Keep the `threshold` most frequent values of a column; replace the rest with "Other"."""
    frequent_levels = column.value_counts().nlargest(threshold).index
    return column.where(column.isin(frequent_levels), "Other")


data[categoricals] = data[categoricals].apply(_collapse_rare_levels)

In [None]:
# Preview the data after outlier removal and rare-category collapsing.
data.head()

### Encoding Categorical Columns

In [None]:
# Fit a separate LabelEncoder for every categorical column and keep each one.
# A single encoder refitted in the loop would only remember the last column's
# classes, so the integer codes of the other columns could not be decoded.
# Use label_encoders[col].classes_ or .inverse_transform(...) to map back.
label_encoders = {}
for each in categoricals:
    encoder = LabelEncoder()
    data[each] = encoder.fit_transform(data[each])
    label_encoders[each] = encoder

In [None]:
# Preview the data after the categorical columns have been label-encoded.
data.head()

### Splitting the Data

In [None]:
# Features are every column except the target; `charges` is the target.
X = data.drop(columns=['charges'])
y = data['charges']
# Rebuild the list instead of calling `numericals.remove('charges')`:
# remove() raises ValueError when this cell is run a second time, whereas
# this form is idempotent.
numericals = [col for col in numericals if col != 'charges']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Scaling the Numerical Columns

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to the test split.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train[numericals])
test_scaled = scaler.transform(X_test[numericals])

X_train[numericals] = train_scaled
X_test[numericals] = test_scaled

# Model Building

### Decision Tree Regressor

In [None]:
# Depth-limited decision tree with a fixed seed for reproducibility.
model = DecisionTreeRegressor(random_state=62, max_depth=4)
model.fit(X_train, y_train)

In [None]:
# Estimator `score` on the training and held-out splits.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print('DecisionTreeRegressor Train Score is : ', train_score)
print('DecisionTreeRegressor Test Score is : ', test_score)
print('-' * 70)

In [None]:
# Predict on the held-out split, compute all error metrics, then report them.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred, multioutput='uniform_average')
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R²): {r2}")

In [None]:
# Random Forest Regressor: 100 depth-limited trees with a fixed seed.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=7,
    random_state=33,
).fit(X_train, y_train)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print('Random Forest Regressor Train Score is : ', train_score)
print('Random Forest Regressor Test Score is : ', test_score)
print('-' * 70)

In [None]:
# Predict on the held-out split, compute all error metrics, then report them.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred, multioutput='uniform_average')
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R²): {r2}")