In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor

In [20]:
# Read the insurance dataset from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="insurance.csv")
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [21]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [22]:
# Column dtypes, non-null counts and memory usage; used here to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [23]:
# Work on an independent copy so the raw frame `df` is not modified by later transformations.
dff = df.copy(deep=True)

In [24]:
# Integer-encode the categorical columns of the working copy.
# Every column gets its own fitted encoder, stored in `encoders`, so that the
# category <-> integer mapping of each column stays available afterwards
# (`encoders[col].classes_`, `encoders[col].inverse_transform(...)`).
# Refitting one shared encoder would keep only the mapping of the last column.
encoders = {}
for column in ['smoker', 'sex', 'region']:
    le = LabelEncoder()
    dff[column] = le.fit_transform(dff[column])
    encoders[column] = le
dff.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [35]:
#sns.boxplot(data=dff,y='charges')

In [34]:
#q1 = np.quantile(dff['charges'],0.25)
#q3 = np.quantile(dff['charges'],0.75)
#iqr = q3-q1
#high = q3 + 1.5*iqr
#low = q1 - 1.5*iqr
#print(high)
#print(low)

In [36]:
# Separate the prediction target (`charges`) from the predictors, then hold out
# 20% of the rows for evaluation. The fixed seed keeps the split reproducible.
target = dff['charges']
features = dff.drop(columns='charges')
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=123,
)

In [46]:
# Seed shared by every stochastic estimator so the comparison table can be
# reproduced after a kernel restart; 123 is the same value the train/test
# split is seeded with.
RANDOM_STATE = 123

# Candidate regressors, keyed by the label used for their row in the results table.
# LinearRegression and Ridge are left at their defaults and take no seed here.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'DecisionTree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'RandomForest': RandomForestRegressor(random_state=RANDOM_STATE),
    'GradientBoosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'XGBRegressor': XGBRegressor(random_state=RANDOM_STATE),
}
# Most recent comparison table; rebound by every call to Regressor().
result = pd.DataFrame(columns=['SCORE','MAE','MSE',"RMSE"])
def Regressor(models,x_train,x_test,y_train,y_test):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    models : dict
        Maps a display label to an estimator exposing ``fit(X, y)`` and
        ``predict(X)``. Estimators are fitted in place.
    x_train, y_train
        Training predictors and target, passed to ``fit``.
    x_test, y_test
        Held-out predictors and target; predictions on ``x_test`` are
        compared against ``y_test``.

    Returns
    -------
    pandas.DataFrame
        One row per model label (in dict order) with float columns
        ``SCORE`` (R^2), ``MAE``, ``MSE`` and ``RMSE`` (square root of MSE).
        Empty, with the same columns, when ``models`` is empty.

    Notes
    -----
    The table is built from scratch on every call, so rows from an earlier
    call with different models never linger. The module-level ``result`` is
    rebound to the returned table so cells that read ``result`` directly keep
    seeing the latest scores.
    """
    global result
    metric_names = ['SCORE', 'MAE', 'MSE', 'RMSE']
    labels = []
    rows = []
    for key, model in models.items():
        model.fit(x_train, y_train)
        prediction = model.predict(x_test)
        mse = mean_squared_error(y_test, prediction)
        labels.append(key)
        rows.append([
            r2_score(y_test, prediction),
            mean_absolute_error(y_test, prediction),
            mse,
            np.sqrt(mse),
        ])
    # Build the frame once from the collected records instead of enlarging an
    # empty frame row by row; dtype=float keeps the metric columns numeric.
    result = pd.DataFrame(rows, index=labels, columns=metric_names, dtype=float)
    return result

In [47]:
# Fit each candidate model on the training split and score it on the test split;
# the returned metrics table (one row per model) is displayed as the cell output.
Regressor(models,x_train,x_test,y_train,y_test)

Unnamed: 0,SCORE,MAE,MSE,RMSE
LinearRegression,0.799859,4009.551705,30600637.578781,5531.784303
Ridge,0.799338,4021.408996,30680354.044084,5538.984929
DecisionTree,0.751117,2895.31406,38053068.73778,6168.716944
RandomForest,0.887482,2410.088203,17203475.953688,4147.707313
GradientBoosting,0.904935,2206.778451,14535028.887004,3812.483297
XGBRegressor,0.867087,2691.431326,20321833.865133,4507.974475
