In [1]:
import pandas as pd
import numpy as np
import sklearn.model_selection as skm
from ISLP.bart import BART
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import seaborn as sns
import matplotlib.pyplot as plt

I selected the `insurance data` set ['Insurance Premium Prediction'](https://www.kaggle.com/datasets/noordeen/insurance-premium-prediction) from Kaggle. Predicting the future medical expenses of individuals is meaningful from an insurance company's point of view, so I decided to use this data to apply the BART (Bayesian Additive Regression Trees) method.

In [2]:
# Load the insurance records from the CSV file in the working directory.
csv_path = 'insurance_classification.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to check the columns and their types.
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


#### Correlation Analysis

In [4]:
# Pairwise correlations are only defined for numeric columns. Selecting them
# explicitly keeps this cell working on pandas >= 2.0, where DataFrame.corr()
# no longer silently drops the string columns (sex, smoker, region) and
# raises instead. On older pandas the result is unchanged.
correlation_matrix = data.select_dtypes(include='number').corr()
print(correlation_matrix)

               age       bmi  children  expenses
age       1.000000  0.109341  0.042469  0.299008
bmi       0.109341  1.000000  0.012645  0.198576
children  0.042469  0.012645  1.000000  0.067998
expenses  0.299008  0.198576  0.067998  1.000000


In [24]:
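## Only weak linear correlations (all below 0.4) were found between the numeric features (age, bmi, children) and expenses. The categorical features (sex, smoker, region) are not part of this matrix, so I continued with the analysis.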

#### Transform nominal features (sex, smoker, region) to dummy variables

In [5]:
# One-hot encode the nominal columns; the numeric columns pass through as-is.
data = pd.get_dummies(data=data)

#### Separate the inputs from the output, then split into train and test sets

In [6]:
# The feature matrix is every column except the prediction target.
X = data.drop(labels=['expenses'], axis='columns')

In [7]:
# Preview the encoded feature matrix (target column removed).
X.head(n=5)

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,1,0,0,1,0,0,0,1
1,18,33.8,1,0,1,1,0,0,0,1,0
2,28,33.0,3,0,1,1,0,0,0,1,0
3,33,22.7,0,0,1,1,0,0,1,0,0
4,32,28.9,0,0,1,1,0,0,1,0,0


In [8]:
# Keep the column labels before the frame is converted to a bare array,
# which carries no names.
feature_names = list(X.columns)
# Force a single float dtype. On pandas >= 2.0 get_dummies produces bool
# columns, and converting a mixed bool/number frame without an explicit dtype
# yields an object array; on older pandas the result is the same float array
# as before.
X = X.to_numpy(dtype=float)

In [9]:
# Show the feature labels, in the same order as the columns of X.
feature_names

['age',
 'bmi',
 'children',
 'sex_female',
 'sex_male',
 'smoker_no',
 'smoker_yes',
 'region_northeast',
 'region_northwest',
 'region_southeast',
 'region_southwest']

In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = skm.train_test_split(
    X,
    data['expenses'],
    test_size=0.3,
    random_state=0,
)

#### Applying BART

In [11]:
# BART is fit by random sampling, so an unseeded model gives different
# metrics on every run. Fix the seed (same value as the train/test split) so
# the RMSE, R-squared and variable-inclusion figures below are reproducible.
bart_insurance = BART(random_state=0)

In [12]:
# Fit the BART model on the training split only; the test split stays unseen
# until evaluation.
bart_insurance.fit(X_train, y_train)

In [14]:
# Predict expenses for the held-out rows; the features are cast to float32
# before being handed to the model.
X_test_f32 = X_test.astype(np.float32)
yhat_test = bart_insurance.predict(X_test_f32)

#### Evaluation: RMSE and R-squared

In [15]:
# Root mean squared error on the held-out test set. Taking the square root of
# the MSE avoids the `squared=False` flag, which was deprecated in
# scikit-learn 1.4 and removed in 1.6. float() keeps the displayed value a
# plain number regardless of the NumPy version.
rmse = float(np.sqrt(mean_squared_error(y_test, yhat_test)))



In [16]:
# Test-set RMSE, in the same units as `expenses`.
rmse

4649.793455671519

In [17]:
# Coefficient of determination of the predictions on the held-out test set.
r2 = r2_score(y_true=y_test, y_pred=yhat_test)

In [18]:
# Test-set R-squared.
r2

0.8644218248393888

In [19]:
# Average the fitted model's variable_inclusion_ array along axis 0 and label
# each entry with its feature name.
# NOTE(review): presumably axis 0 indexes the saved draws, so each value is
# the mean number of times a feature is used -- confirm against the ISLP docs.
mean_inclusion = bart_insurance.variable_inclusion_.mean(axis=0)
var_inclusion = pd.Series(data=mean_inclusion, index=feature_names)
var_inclusion

age                 27.4
bmi                 32.8
children            24.2
sex_female          23.8
sex_male            22.2
smoker_no           24.5
smoker_yes          31.7
region_northeast    26.7
region_northwest    29.7
region_southeast    30.4
region_southwest    25.7
dtype: float64