In [48]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

data = pd.read_csv('insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [49]:
labelencoder = preprocessing.LabelEncoder()
data['smoker'] = labelencoder.fit_transform(data['smoker'])
data['sex'] = labelencoder.fit_transform(data['sex'])
data['region'] = labelencoder.fit_transform(data['region'])
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [50]:
x = data[["age", "sex", "bmi", "children", "smoker", "region"]]
y = data["charges"]

In [51]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.30)

In [52]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_test = scaler.fit_transform(x_test)

In [53]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
def modelresults(prediction):
    print("Mean absolute error on model is {}".format(mean_absolute_error(y_test,prediction)))
    print("Root mean squared error on model is {}".format(mean_squared_error(y_test, prediction)))

In [54]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(scaled_x_train, y_train)

In [55]:
predslr = lr.predict(scaled_x_test)
modelresults(predslr)

Mean absolute error on model is 4220.917840297269
Root mean squared error on model is 39299221.74782188


In [56]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
svrmodel = SVR()
param_gridsvr = {'C':[0.001, 0.01, 0.1, 0.5, 1], 'kernel':['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto'], 'degree':[2, 3, 4, 5]}
gridsvr = GridSearchCV(svrmodel, param_gridsvr)
gridsvr.fit(scaled_x_train, y_train)
print('Best parameter for this model {}'.format(gridsvr.best_params_))

Best parameter for this model {'C': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}


In [57]:
predgridsvr = gridsvr.predict(scaled_x_test)
modelresults(predgridsvr)

Mean absolute error on model is 7872.790461068223
Root mean squared error on model is 152987042.85681224


In [61]:
from sklearn.ensemble import RandomForestRegressor
rfmodel = RandomForestRegressor()
param_gridrf = {
    'bootstrap': [True],
    'max_depth': [5, 10, 15],
    'max_features': ['sqrt', 'log2', None], 
    'n_estimators': [2, 3, 4, 5, 6, 7, 8, 9, 10]
}
gridrf = GridSearchCV(rfmodel, param_gridrf)
gridrf.fit(scaled_x_train,y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [64]:
predgridrf = gridrf.predict(scaled_x_test)
modelresults(predgridrf)

Mean absolute error on model is 2502.3768100163725
Root mean squared error on model is 20016748.424532942


In [68]:
x.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')

In [70]:
columniterate = 1
for index in x.columns:
    mean = data[index].mean()
    print("Mean of the column {} is {}".format(columniterate, mean))
    columniterate += 1

Mean of the column 1 is 39.20702541106129
Mean of the column 2 is 0.5052316890881914
Mean of the column 3 is 30.66339686098655
Mean of the column 4 is 1.0949177877429
Mean of the column 5 is 0.20478325859491778
Mean of the column 6 is 1.515695067264574


In [71]:
new_customer = np.array([39,0,30,1,0,1])

In [72]:
gridrf.predict(new_customer.reshape(1,-1))

array([17726.81279871])

In [73]:
print('The insurance cost of new customer is: {}'.format(gridrf.predict(new_customer.reshape(1,-1))))

The insurance cost of new customer is: [17726.81279871]
