In [1]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, AdaBoostRegressor, IsolationForest
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np
import pandas as pd 


import os
for dirname, _, filenames in os.walk('/kaggle/input/prediction-of-insurance-charges-using-age-gender'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/prediction-of-insurance-charges-using-age-gender/insurance.csv


In [2]:
df = pd.read_csv('/kaggle/input/prediction-of-insurance-charges-using-age-gender/insurance.csv')

In [None]:
for i in df.columns:
    print(df[i].value_counts())

Checking the values of different features

In [None]:
df.isna().sum()
df.duplicated().sum()

Checking for null values and duplicates. No null and duplicates in this dataset

In [3]:
y= df.charges
y= pd.DataFrame(y)

In [4]:
df= df.iloc[:,1:7]

Slicing the df excluding the first column

In [5]:
df = pd.get_dummies(df)

One hot encoding using get dummies

In [None]:
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
vif["features"] = df.columns
print(vif)

VIF to check multi collinearity. values >10 suggest multi collinearity

In [None]:
corr = df.corr()
corr.style.background_gradient(cmap = 'coolwarm')

checking co relation among the features

In [None]:
If = IsolationForest()
If.fit(df)
df['anamoly'] = If.predict(df)
df.anamoly.value_counts()

To detect outliers. Not used here

In [6]:
trainx, testx, trainy, testy = train_test_split(df, y, test_size = 0.35, random_state = 100)

Splitting into test and train

In [7]:
trainxcon = trainx.iloc[:, 0:2]
trainxcat = trainx.iloc[:, 2:11]
testxcon = testx.iloc[:, 0:2]
testxcat = testx.iloc[:, 2:11]

Separating the categorial and continuous variables for the test and train set

In [8]:
poly = PolynomialFeatures(degree =2, include_bias = False, interaction_only = False )

Defining 2 degree polynomial features to generate interactions

In [9]:
trainxconp = poly.fit_transform(trainxcon)
columns = poly.get_feature_names_out(trainxcon.columns)
trainxconp = pd.DataFrame(trainxconp, columns = columns)

Transforming the train set with poly features to generate interactions.
Categorical variables are not used for poly interactions.
Standardization is always done after polynomial transformation.

In [10]:
testxconp = poly.transform(testxcon)
columns = poly.get_feature_names_out(testxcon.columns)
testxconp = pd.DataFrame(testxconp, columns = columns)

Transforming the test set

In [11]:
ss =  StandardScaler()
trainxconp_scaled = pd.DataFrame(ss.fit_transform(trainxconp), columns = trainxconp.columns)
testxconp_scaled = pd.DataFrame(ss.transform(testxconp), columns = testxconp.columns)

Standardizing the test and train set

In [12]:
trainxcat.reset_index(inplace = True)
trainxconp_scaled.reset_index(inplace = True)
trainX = pd.concat([trainxconp_scaled, trainxcat], axis = 1)
trainX.drop(columns = ['index'], inplace = True)
testxcat.reset_index(inplace = True)
testxconp_scaled.reset_index(inplace = True)
testX = pd.concat([testxconp_scaled, testxcat], axis = 1)
testX.drop(columns = ['index'], inplace = True)

Resetting the index and performing outer join

In [None]:
model = Ridge()
max_iter = np.array(range(100000, 10000000))
alpha = np.linspace(1, 10, 50)
solver = ['svd', 'lsqr', 'lbfgs', 'cholesky']
param = {'max_iter' : max_iter, 'solver': solver, 'alpha': alpha}
rridge = RandomizedSearchCV(model, param, cv= 50, scoring = 'neg_mean_squared_error')
rridge.fit(trainX, trainy)
print(rridge.best_params_)

Used RandomizedsearchCV for parameter tuning

In [13]:
model = Ridge( alpha = 3.3877551020408165 , max_iter = 3335434 , solver = 'svd' , random_state = 100)
model.fit(trainX, trainy)
predy =  model.predict(testX)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 5840.473458668389
r2 score: 0.7526489873952422


## Ridge
RMSE: 5840.473458668389
r2 score: 0.7526489873952422


In [None]:
model = Lasso()
alpha = np.linspace(0.1, 10, 100)
max_iter = np.array(range(10000, 1000000))
param = {'alpha': alpha, 'max_iter': max_iter}
rlasso = RandomizedSearchCV(model, param, cv= 50, scoring = 'neg_mean_squared_error')
rlasso.fit(trainX, trainy)
print(rlasso.best_params_)

Used RandomizedsearchCV for parameter tuning

In [14]:
model = Lasso(max_iter = 998792, alpha = 7.1)
model.fit(trainX, trainy)
predy = model.predict(testX)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 5846.090736451662
r2 score: 0.7521729617687796


## Lasso
RMSE: 5846.090736451662
r2 score: 0.7521729617687796

In [None]:
model = ElasticNet()
alpha = np.linspace(.1, 10, 20)
max_iter = np.array(range(100000, 1000000))
l1_ratio = np.linspace(0, 1, 10)
param = {'alpha': alpha, 'max_iter': max_iter, 'l1_ratio': l1_ratio}
relastic = RandomizedSearchCV(model, param, cv = 50, scoring = 'neg_mean_squared_error')
relastic.fit(trainX, trainy)
print(relastic.best_params_)



Used RandomizedsearchCV for parameter tuning

In [15]:
model = ElasticNet(alpha = 9.478947368421053, max_iter = 658577, l1_ratio = 1)
model.fit(trainX, trainy)
predy = model.predict(testX)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))


RMSE: 5845.968159961697
r2 score: 0.7521833541664913


## ElasticNet
RMSE: 5845.968159961697
r2 score: 0.7521833541664913

In [None]:
model = DecisionTreeRegressor()
min_samples_split = np.array(range(1, 100))
max_depth = np.array(range(1,100))
criterion = ['squared_error', 'friedman_mse', 'poisson', 'absolute_error']
param = {'min_samples_split': min_samples_split, 'max_depth': max_depth, 'criterion': criterion}
rdecision = RandomizedSearchCV(model, param, cv = 50, scoring = 'neg_mean_squared_error')
rdecision.fit(trainX, trainy)
print(rdecision.best_params_)

Used RandomizedsearchCV for parameter tuning

In [16]:
model = DecisionTreeRegressor(min_samples_split = 52, max_depth = 4, criterion = 'squared_error', random_state = 100)
model.fit(trainX, trainy)
predy = model.predict(testX)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 4444.189548581638
r2 score: 0.8567803095639313


## Decision Tree
RMSE: 4444.189548581638
r2 score: 0.8567803095639313

In [None]:
model = RandomForestRegressor()
max_depth = np.array(range(1, 100))
criterion = ['squared_error', 'friedman_mse','poisson', 'absolute_error']
min_samples_split = np.array(range(1, 100))
n_estimators = np.array(range(1,100))
param = {'max_depth': max_depth, 'criterion': criterion, 'min_samples_split': min_samples_split, 'n_estimators': n_estimators}
rforest = RandomizedSearchCV(model, param, cv=50, scoring = 'neg_mean_squared_error')
rforest.fit(trainX, trainy.values.ravel())
print(rforest.best_params_)


Used RandomizedsearchCV for parameter tuning

In [17]:
model = RandomForestRegressor(criterion = 'squared_error', min_samples_split = 51, max_depth = 13, n_estimators = 76 , random_state = 100)
model.fit(trainX, trainy.values.ravel())
predy = model.predict(testX)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 4463.021739340591
r2 score: 0.8555639550188641


## Random Forest
RMSE: 4463.021739340591
r2 score: 0.8555639550188641

In [None]:
model = AdaBoostRegressor()
learning_rate = np.linspace(0, 1, 100)
n_estimators = np.array(range(1, 100))
param = {'learning_rate': learning_rate, 'n_estimators': n_estimators}
rada = RandomizedSearchCV(model, param, cv=5, scoring = 'neg_mean_squared_error')
rada.fit(trainX, trainy.values.ravel())
print(rada.best_params_)

Used RandomizedsearchCV for parameter tuning

In [18]:
model = AdaBoostRegressor(n_estimators = 46, learning_rate = 0.07070707070707072)
model.fit(trainX, trainy.values.ravel())
predy = model.predict(testX)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 4699.363114210492
r2 score: 0.839861561007456


## AdaBoost
RMSE: 4699.363114210492
r2 score: 0.839861561007456

In [None]:
model = BaggingRegressor()
n_estimators = np.array(range(1, 100))
max_features = np.array(range(1, 15))
param = {'n_estimators': n_estimators, 'max_features': max_features}
rbagging = RandomizedSearchCV(model, param, cv=50, scoring = 'neg_mean_squared_error')
rbagging.fit(trainX, trainy.values.ravel())
print(rbagging.best_params_)

Used RandomizedsearchCV for parameter tuning

In [19]:
model = BaggingRegressor(max_features = 12, n_estimators = 57)
model.fit(trainX, trainy.values.ravel())
predy = model.predict(testX)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 4776.086665408034
r2 score: 0.834589916528252


## Bagging Regressor
RMSE: 4776.086665408034
r2 score: 0.834589916528252


In [None]:
model = ExtraTreesRegressor()
criterion = ['squared_error', 'friedman_mse', 'poisson', 'absolute_error']
min_samples_split = np.array(range(1, 100))
max_depth = np.array(range(1, 100))
n_estimators = np.array(range(1, 100))
param = {'criterion': criterion, 'min_samples_split': min_samples_split, 'max_depth': max_depth, 'n_estimators': n_estimators}
rextra = RandomizedSearchCV(model, param, cv=10, scoring = 'neg_mean_squared_error')
rextra.fit(trainx, trainy.values.ravel())
print(rextra.best_params_)

Used RandomizedsearchCV for parameter tuning

In [20]:
model = ExtraTreesRegressor(n_estimators = 95, min_samples_split = 41, max_depth = 77, criterion = 'absolute_error')
model.fit(trainX, trainy.values.ravel())
predy = model.predict(testX)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 4366.381805748672
r2 score: 0.8617513176874498


# The best model
## Extra Trees
RMSE: 4366.381805748672
r2 score: 0.8617513176874498

In [None]:
model = SVR()
kernel = ['rbf', 'sigmoid', 'linear']
C = np.linspace(.001, 100, 20)
gamma = np.linspace(0.001, 100, 20)
param = {'kernel': kernel, 'C': C, 'gamma': gamma}
rsvr = RandomizedSearchCV(model, param, cv=50, scoring = 'neg_mean_squared_error')
rsvr.fit(trainX, trainy.values.ravel())
print(rsvr.best_params_)

Used RandomizedsearchCV for parameter tuning

In [22]:
model = SVR(kernel = 'linear', gamma = 94.7368947368421, C = 36.84273684210525)
model.fit(trainX, trainy.values.ravel())
predy = model.predict(testX)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 8530.837772477636
r2 score: 0.472283164203825


## SVR
RMSE: 8530.837772477636
r2 score: 0.472283164203825