In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df=pd.read_excel('insurance.xlsx')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [3]:
df.shape


(1338, 7)

In [4]:
categorical=[]
continuous=[]
check=[]

d_types=dict(df.dtypes)
for name, type in d_types.items():
    if str(type)=='object':
        categorical.append(name)
    elif str(type)=='float64':
        continuous.append(name)
    else:
        check.append(name)
        
print('categorical values:',categorical)
print('continuous values:',continuous) 
print('features to be checked',check)

categorical values: ['sex', 'smoker', 'region']
continuous values: ['bmi', 'expenses']
features to be checked ['age', 'children']


In [5]:
df['sex'].replace({'female':0,'male':1},inplace=True)
df['smoker'].replace({'no':0,'yes':1},inplace=True)

df.drop('region',axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,expenses
0,19,0,27.9,0,1,16884.92
1,18,1,33.8,1,0,1725.55
2,28,1,33.0,3,0,4449.46
3,33,1,22.7,0,0,21984.47
4,32,1,28.9,0,0,3866.86


In [7]:
x=df.drop('expenses',axis=1)
y=df['expenses']

In [8]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

# Lasso Regression

In [9]:
from sklearn.linear_model import Lasso

model1=Lasso()
model1.fit(x_train,y_train)

Lasso()

In [10]:
train_predictions=model1.predict(x_train)
test_predictions=model1.predict(x_test)

In [11]:
model1.score(x_train,y_train)

0.7411141811626791

In [12]:
model1.score(x_test,y_test)

0.7810864578730293

In [13]:
from sklearn.model_selection import cross_val_score
print('cross validation score:',cross_val_score(model1,x,y,cv=5).mean())

cross validation score: 0.7466664879428851


In [14]:
from sklearn.model_selection import GridSearchCV
estimator=Lasso()

param_grid={'alpha':[1,2,3,4,5,6,7,8,9,10]}

model1.hp=GridSearchCV(estimator,param_grid,cv=5,scoring='neg_mean_squared_error')
model1.hp.fit(x_train,y_train)
model1.hp.best_params_

{'alpha': 10}

In [15]:
from sklearn.linear_model import Lasso

model1_best=Lasso(alpha=10)
model1_best.fit(x_train,y_train)

train_predictions=model1_best.predict(x_train)
test_predictions=model1_best.predict(x_test)

print(model1_best.score(x_train,y_train))
print(model1_best.score(x_test,y_test))

from sklearn.model_selection import cross_val_score
print('cross validation score:',cross_val_score(model1_best,x,y,cv=5).mean())

0.7411094487260133
0.7809614490519589
cross validation score: 0.7467299170217538


In [16]:
model1.intercept_


-12117.468526477174

In [17]:
model1.coef_

array([ 2.57042736e+02, -3.46456634e+00,  3.27665563e+02,  4.26944413e+02,
        2.36465356e+04])

In [18]:
model1_best.intercept_

-12090.596482929921

In [19]:
model1.coef_

array([ 2.57042736e+02, -3.46456634e+00,  3.27665563e+02,  4.26944413e+02,
        2.36465356e+04])

## Final Model Of Lasso regression

In [20]:
# FINAL MODEL
x=x.drop(x.columns[[1]],axis=1)

y=df['expenses']

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
from sklearn.linear_model import Lasso

model1_best=Lasso(alpha=10)
model1_best.fit(x_train,y_train)

train_predictions=model1_best.predict(x_train)
test_predictions=model1_best.predict(x_test)

print(model1_best.score(x_train,y_train))
print(model1_best.score(x_test,y_test))

from sklearn.model_selection import cross_val_score
print('cross validation score:',cross_val_score(model1_best,x,y,cv=5).mean())


0.74110944870169
0.7809614477274642
cross validation score: 0.7469456440629066


In [21]:
input_data={'age':31,
           'sex':'female',
           'bmi':25.74,
           'children':0,
           'smoker':'no',
           'region':'northeast'}

In [22]:
df_test=pd.DataFrame(input_data,index=[0])

df_test.drop('region',axis=1,inplace=True)
df_test['sex'].replace({'female':0,'male':1},inplace=True)
df_test['smoker'].replace({'no':0,'yes':1},inplace=True)

transformed_data=df_test.drop(df_test.columns[[1]],axis=1)

In [23]:
model1_best.predict(transformed_data)

array([4302.89154655])

In [24]:
predicted_data={'age':18,
           'sex':'male',
           'bmi':33.8,
           'children':1,
           'smoker':'no',
           'region':'southeast'}

In [25]:
test_data=pd.DataFrame(predicted_data,index=[0])

test_data.drop('region',axis=1,inplace=True)
test_data['sex'].replace({'female':0,'male':1},inplace=True)
test_data['smoker'].replace({'no':0,'yes':1},inplace=True)

transformed_test_data=test_data.drop(test_data.columns[[1]],axis=1)

In [26]:
model1_best.predict(transformed_test_data)

array([4022.57422757])

In [27]:
test_pred=y_test-test_predictions

In [28]:
test_pred.head()

764      529.292648
887    -1710.309981
890    -7422.549158
1293    -105.693818
259     6922.784253
Name: expenses, dtype: float64

In [29]:
model1_best.score(x_test,y_test)

0.7809614477274642

In [30]:
cross_val_score(model1_best,x,y,cv=5).mean()

0.7469456440629066

# RIDGE Regression

In [36]:
df=pd.read_excel('insurance.xlsx')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [37]:
categorical=[]
continuous=[]
check=[]

d_types=dict(df.dtypes)
for name, type in d_types.items():
    if str(type)=='object':
        categorical.append(name)
    elif str(type)=='float64':
        continuous.append(name)
    else:
        check.append(name)
        
print('categorical values:',categorical)
print('continuous values:',continuous) 
print('features to be checked',check)

categorical values: ['sex', 'smoker', 'region']
continuous values: ['bmi', 'expenses']
features to be checked ['age', 'children']


In [38]:
df['sex'].replace({'female':0,'male':1},inplace=True)
df['smoker'].replace({'no':0,'yes':1},inplace=True)

df.drop('region',axis=1,inplace=True)

In [39]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,expenses
0,19,0,27.9,0,1,16884.92
1,18,1,33.8,1,0,1725.55
2,28,1,33.0,3,0,4449.46
3,33,1,22.7,0,0,21984.47
4,32,1,28.9,0,0,3866.86


In [40]:
x=df.drop('expenses',axis=1)
y=df['expenses']

In [41]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [42]:
from sklearn.linear_model import Ridge

model=Ridge()
model.fit(x_train,y_train)

Ridge()

In [43]:
train_predictions=model.predict(x_train)
test_predictions=model.predict(x_test)

In [44]:
model.score(x_train,y_train)
model.score(x_test,y_test)

0.7808503355095352

In [45]:
from sklearn.model_selection import cross_val_score
print('cross validation score:',cross_val_score(model,x,y,cv=5).mean())

cross validation score: 0.7466523551462286


In [46]:
from sklearn.model_selection import GridSearchCV
estimator=Ridge()

param_grid={'alpha':[1,2,3,4,5,6,7,8,9,10]}

model.hp=GridSearchCV(estimator,param_grid,cv=5,scoring='neg_mean_squared_error')
model.hp.fit(x_train,y_train)
model.hp.best_params_

{'alpha': 1}

In [47]:
ridge_best=Ridge(alpha=1)
ridge_best.fit(x_train,y_train)

print('Intercept:',ridge_best.intercept_)
print('coefficient:',ridge_best.coef_)

train_predictios=ridge_best.predict(x_train)
test_predictions=ridge_best.predict(x_test)

print('train_score:',ridge_best.score(x_train,y_train))
print('test_score:',ridge_best.score(x_test,y_test))
print('cross_val_score:',cross_val_score(ridge_best,x,y,cv=5).mean())


Intercept: -12087.404764651963
coefficient: [ 2.56846489e+02 -1.64046217e-01  3.27707313e+02  4.28067878e+02
  2.35174031e+04]
train_score: 0.7410936091188807
test_score: 0.7808503355095352
cross_val_score: 0.7466523551462286


In [48]:
predicted_data={'age':28,
           'sex':'male',
           'bmi':33.8,
           'children':3,
           'smoker':'no',
           'region':'southeast'}

In [49]:
test_data=pd.DataFrame(predicted_data,index=[0])

test_data.drop('region',axis=1,inplace=True)
test_data['sex'].replace({'female':0,'male':1},inplace=True)
test_data['smoker'].replace({'no':0,'yes':1},inplace=True)



In [50]:
test_data

Unnamed: 0,age,sex,bmi,children,smoker
0,28,1,33.8,3,0


In [51]:
ridge_best.predict(test_data)

array([7464.84369154])

| MODEL                | Train Score | Test Score | CV  Score |
| ---------------------| ------------| -----------|-----------|
| LASSO REGRESSION     |       0.741 |       0.78 |   0.7469  |
| RIDGE REGRESSION     |    0.7410   |    0.7808  |   0.7466  |

# BEST MODEL = LASSO REGRESSION