In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,BayesianRidge
from sklearn import metrics


In [10]:
df=pd.read_csv('D:/datasets/Py_DS/insurance.csv')
print(df.shape)
print(df.dtypes)
df.head

(1338, 7)
age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object


<bound method NDFrame.head of       age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 7 columns]>

In [11]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [13]:
df['sex'].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [14]:
df['smoker'].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [15]:
df['region'].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [16]:
df.replace({'sex':{'male':0,'female':1}},inplace=True)
df.replace({'smoker':{'no':0,'yes':1}},inplace=True)
df.replace({'region':{'southeast':0,'northwest':1,'southwest':2,'northeast':3}},inplace=True)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,2,16884.924
1,18,0,33.77,1,0,0,1725.5523
2,28,0,33.0,3,0,0,4449.462
3,33,0,22.705,0,0,1,21984.47061
4,32,0,28.88,0,0,1,3866.8552


In [17]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Split data input & output

In [18]:
X=df.drop(['charges'],axis=1)
Y=df['charges']
print(X.isnull().sum())
print(Y.isnull().sum())
print(Y.head())

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64
0
0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64


# ML Model

In [19]:
x_train,x_test, y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=102)
print(x_train,x_test)
print(y_train,y_test)

      age  sex     bmi  children  smoker  region
300    36    0  27.550         3       0       3
1136   44    1  25.000         1       0       2
464    19    0  25.175         0       0       1
212    24    0  28.500         2       0       1
11     62    1  26.290         0       1       0
...   ...  ...     ...       ...     ...     ...
626    36    0  28.880         3       0       3
242    55    1  26.800         1       0       2
590    58    1  29.000         0       0       2
755    31    0  27.645         2       0       3
256    56    0  33.630         0       1       1

[1003 rows x 6 columns]       age  sex     bmi  children  smoker  region
722    62    0  37.400         0       0       2
687    40    0  41.690         0       0       0
723    19    0  35.400         0       0       2
1253   40    1  29.300         4       0       2
1169   37    1  34.105         1       0       1
...   ...  ...     ...       ...     ...     ...
489    53    0  31.160         1       0    

In [20]:
#LM
lm_model=LinearRegression()
bays_model=BayesianRidge()
las_model=Lasso()

lm_model.fit(x_train,y_train)
bays_model.fit(x_train,y_train)
las_model.fit(x_train,y_train)

Lasso()

# Prediction apply on Train/Test on all models

In [21]:
lm_train_prediction=lm_model.predict(x_train)
lm_r2_train=metrics.r2_score(y_train,lm_train_prediction)
print('LM Train:',lm_r2_train)

bays_train_prediction=bays_model.predict(x_train)
bays_r2_train=metrics.r2_score(y_train,bays_train_prediction)
print('BayesianRidge Train: ',bays_r2_train)

las_train_prediction=las_model.predict(x_train)
las_r2_train=metrics.r2_score(y_train,las_train_prediction)
print('Lasso Train:',las_r2_train)


lm_test_prediction=lm_model.predict(x_test)
lm_r2_test=metrics.r2_score(y_test,lm_test_prediction)
print('LM Test:',lm_r2_test)

bays_test_prediction=bays_model.predict(x_test)
bays_r2_test=metrics.r2_score(y_test,bays_test_prediction)
print('BayesianRidge Test: ',bays_r2_train)

las_test_prediction=las_model.predict(x_test)
las_r2_test=metrics.r2_score(y_test,las_test_prediction)
print('Lasso Test:',las_r2_train)

LM Train: 0.7573843113075567
BayesianRidge Train:  0.7573810357542312
Lasso Train: 0.7573842270205171
LM Test: 0.725441365547174
BayesianRidge Test:  0.7573810357542312
Lasso Test: 0.7573842270205171


# Input Data

In [22]:
print(x_train)
# df.replace({'sex':{'male':0,'female':1}},inplace=True)
# df.replace({'smoker':{'no':0,'yes':1}},inplace=True)
# df.replace({'region':{'southeast':0,'northwest':1,'southwest':2,'northeast':3}},inplace=True)
# df.head()

      age  sex     bmi  children  smoker  region
300    36    0  27.550         3       0       3
1136   44    1  25.000         1       0       2
464    19    0  25.175         0       0       1
212    24    0  28.500         2       0       1
11     62    1  26.290         0       1       0
...   ...  ...     ...       ...     ...     ...
626    36    0  28.880         3       0       3
242    55    1  26.800         1       0       2
590    58    1  29.000         0       0       2
755    31    0  27.645         2       0       3
256    56    0  33.630         0       1       1

[1003 rows x 6 columns]


In [23]:
#print(x_train)
input_data=(32,0,28.88,0,0,1)
input_data_np=np.asarray(input_data)
input_data_reshape=input_data_np.reshape(1,-1)

input_lm_predict=lm_model.predict(input_data_reshape)
print('Linear Prediction : ', input_lm_predict)

input_bays_predict=bays_model.predict(input_data_reshape)
print('BayesianRidge Prediction : ', input_bays_predict)

input_las_predict=las_model.predict(input_data_reshape)
print('Lasso Prediction :', input_las_predict)

Linear Prediction :  [5196.21384291]
BayesianRidge Prediction :  [5210.23317929]
Lasso Prediction : [5201.09189862]


In [24]:
df_out=df
p=df_out[['age','sex','bmi','children','smoker','region']]
df_out['LM_predict_value']=lm_model.predict(p)
df_out['Bayesridge_predict_value']=bays_model.predict(p)
df_out['Lasso_predict_value']=las_model.predict(p)
print(df_out.head(5))
df_out.to_csv("D:/datasets/Py_DS/Prediction_insuracecharges.txt",sep='|',index=False)

   age  sex     bmi  children  smoker  region      charges  LM_predict_value  \
0   19    1  27.900         0       1       2  16884.92400      26501.338608   
1   18    0  33.770         1       0       0   1725.55230       3441.938085   
2   28    0  33.000         3       0       0   4449.46200       6414.835217   
3   33    0  22.705         0       0       1  21984.47061       3307.947362   
4   32    0  28.880         0       0       1   3866.85520       5196.213843   

   Bayesridge_predict_value  Lasso_predict_value  
0              26458.290593         26494.704948  
1               3457.599827          3446.745497  
2               6429.289763          6418.088532  
3               3321.651053          3313.208285  
4               5210.233179          5201.091899  
