In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Kaggle Dataset Link: https://www.kaggle.com/datasets/mirichoi0218/insurance/data

# Data Analysis and Preprocessing

In [None]:
# Load the insurance dataset into a DataFrame. The path is relative, so
# insurance.csv must be in the notebook's working directory.
data = pd.read_csv("insurance.csv")

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
data.shape

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Print column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
# Report how many entries are missing in each column.
print("Missing values in the dataset:")
data.isna().sum()

### Separating Categorical and Numerical Columns

In [None]:
# Partition the column names by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categoricals = list(data.select_dtypes(include=['object']).columns)
numericals = list(data.select_dtypes(include=['int64', 'float64']).columns)
(categoricals, numericals)

### Checking for Outliers

In [None]:
# NOTE(review): this expression is not the last statement of its cell, so the
# notebook does not display it; presumably it was meant to show the shape
# before outlier removal -- confirm and move it to its own cell if so.
data.shape
def remove_outliers_iqr(df, columns, factor=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    Columns are processed in order, and each column's quartiles are computed
    on the rows that survived the filters of the preceding columns, so the
    result can depend on the order of ``columns``. Rows holding NaN in a
    processed column are dropped because NaN fails the bounds check.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is never modified.
    columns : str or iterable of str
        Name(s) of the numeric column(s) to filter on.
    factor : float, default 1.5
        Multiple of the inter-quartile range used to place the lower and
        upper fences around Q1 and Q3.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding only the retained rows (original
        index labels are kept). A copy is returned even when ``columns`` is
        empty, so callers can assign to the result without touching ``df``
        or triggering chained-assignment warnings.

    Raises
    ------
    KeyError
        If any requested column is not present in ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    columns = [columns] if isinstance(columns, str) else list(columns)

    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")

    filtered = df
    for col in columns:
        q1 = filtered[col].quantile(0.25)  # 25th percentile
        q3 = filtered[col].quantile(0.75)  # 75th percentile
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        # between() is inclusive on both ends, matching ">= lower & <= upper".
        filtered = filtered[filtered[col].between(lower_bound, upper_bound)]

    # Detach the result from the input frame.
    return filtered.copy()

### Removing Outliers

In [None]:
# Filter out IQR outliers on the selected numeric features, then show the
# resulting frame dimensions.
data = data.pipe(remove_outliers_iqr, ['age', 'children', 'bmi'])
data.shape

In [None]:
# Print the number of distinct values (NaN counted as a value) for each
# categorical column.
for column in categoricals:
    print(column, data[column].nunique(dropna=False))

In [None]:
# Maximum number of distinct categories to keep per column.
threshold = 25


def _keep_top_categories(column):
    """Replace every value outside the `threshold` most frequent ones with "Other"."""
    top_values = column.value_counts().nlargest(threshold).index
    return column.where(column.isin(top_values), "Other")


data[categoricals] = data[categoricals].apply(_keep_top_categories)

In [None]:
# Preview the frame after the rare-category bucketing step.
data.head()

### Encoding Categorical Columns

In [None]:
# Encode all categorical features with one OrdinalEncoder instead of refitting
# a single LabelEncoder per column: LabelEncoder is documented for target
# labels, and refitting it in a loop keeps only the last column's mapping.
# OrdinalEncoder stores every column's mapping in `encoder.categories_`, and
# with the default categories='auto' the integer codes follow each column's
# sorted unique values, so the encoded values are unchanged.
encoder = OrdinalEncoder(dtype=np.int64)
if categoricals:  # fitting on zero columns would raise; keep the no-op behaviour
    data[categoricals] = encoder.fit_transform(data[categoricals])

In [None]:
# Preview the frame after the categorical columns have been integer-encoded.
data.head()

### Splitting the Data

In [None]:
# Separate the feature matrix from the regression target.
X = data.drop(columns=['charges'])
y = data['charges']
# Rebuild the list instead of calling numericals.remove('charges'): remove()
# raises ValueError once 'charges' is gone, which made this cell fail whenever
# it was executed a second time.
numericals = [name for name in numericals if name != 'charges']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Scaling the Numerical Columns

In [None]:
# Fit the scaler on the training split only, then apply those training
# statistics to both splits so no test information leaks into the scaling.
scaler = StandardScaler().fit(X_train[numericals])
X_train[numericals] = scaler.transform(X_train[numericals])
X_test[numericals] = scaler.transform(X_test[numericals])

# Model Building

### Decision Tree Regressor

In [None]:
# Depth-limited decision tree: fit on the training split, predict the test split.
dt_model = DecisionTreeRegressor(max_depth=5, random_state=62).fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

### Random Forest Regressor

In [None]:
# Random forest of 250 depth-limited trees: fit on the training split,
# predict the test split.
rf_model = RandomForestRegressor(
    n_estimators=250,
    max_depth=5,
    random_state=33,
).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

### Support Vector Regressor

In [None]:
# Polynomial-kernel support vector regressor: fit on the training split,
# predict the test split.
svm_model = SVR(kernel='poly', C=10, gamma=10).fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Model Evaluation

In [None]:
def evaluate_model(name, y_test, y_pred):
    """Print MSE, MAE and R² of `y_pred` against `y_test` under the heading `name`.

    The metrics are formatted to four decimal places and followed by a blank
    line. Nothing is returned.
    """
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )
    print(f"{name} Performance:")
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}\n")

### Evaluating the Models

In [None]:
# Report the test-set metrics of each fitted model in turn.
for model_name, predictions in (
    ("Decision Tree", dt_pred),
    ("Random Forest", rf_pred),
    ("SVM", svm_pred),
):
    evaluate_model(model_name, y_test, predictions)

# Visualize Predictions

In [None]:
# Order the test samples by their actual charge and reorder every model's
# predictions the same way, so the curves can be compared along one axis.
y_test_values = y_test.values
sorted_idx = y_test_values.argsort()
y_test_sorted = y_test_values[sorted_idx]
dt_pred_sorted, rf_pred_sorted, svm_pred_sorted = (
    predictions[sorted_idx] for predictions in (dt_pred, rf_pred, svm_pred)
)

In [None]:
# Overlay each model's predictions on the actual charges (samples sorted by
# actual charge), using the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test_sorted, label="Actual Charges", color="black", linestyle="-", linewidth=2)
for series, series_label, series_color, series_style in (
    (dt_pred_sorted, "Decision Tree", "blue", "--"),
    (rf_pred_sorted, "Random Forest", "red", ":"),
    (svm_pred_sorted, "SVM", "green", "-."),
):
    ax.plot(series, label=series_label, color=series_color, linestyle=series_style)
ax.set_xlabel("Test Samples (Sorted)")
ax.set_ylabel("Charges")
ax.set_title("Actual vs Predicted Charges for Decision Tree, Random Forest & SVM")
ax.legend()
plt.show()