In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import pickle

In [2]:
# Read the insurance records from the CSV file into a pandas DataFrame.
csv_path = 'datasets/insurance.csv'
insurance_dataset = pd.read_csv(csv_path)

In [3]:
# Preview the top five rows as a quick sanity check of the loaded table.
insurance_dataset.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Data Pre-Processing

Encoding the categorical features

In [4]:
# Integer codes for every categorical column, kept in one place so the
# encoding is easy to audit and to look up when building prediction inputs.
# Note the smoker coding: 'yes' -> 0 and 'no' -> 1.
category_codes = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'yes': 0, 'no': 1},
    'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3},
}

# Encode one column at a time with an element-wise lookup instead of
# DataFrame.replace(..., inplace=True). replace() on string columns relies on
# pandas silently downcasting the object column to integers, which pandas has
# deprecated. Values that are not keys of the mapping (including codes
# produced by a previous run of this cell) pass through unchanged, so the cell
# can be re-run safely.
for column, codes in category_codes.items():
    insurance_dataset[column] = insurance_dataset[column].map(
        lambda value, codes=codes: codes.get(value, value)
    )


Splitting the Features and Target

In [5]:
# Target: the 'charges' column. Features: every remaining column.
Y = insurance_dataset['charges']
X = insurance_dataset.drop(columns='charges')

Splitting the data into Training data & Testing Data

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [7]:
# Shapes of the full, training and testing feature tables, in that order.
print(*(features.shape for features in (X, X_train, X_test)))

(1338, 6) (1070, 6) (268, 6)


Model Training

Linear Regression

In [8]:
# create an (as yet unfitted) Linear Regression model; no arguments are
# passed, so the estimator's default settings are used
regressor = LinearRegression()

In [9]:
# Fit the regression on the training split only; the test split stays unseen.
regressor.fit(X=X_train, y=Y_train)

Model Evaluation

In [10]:
# Predicted charges for the same rows the model was fitted on.
training_data_prediction = regressor.predict(X=X_train)

In [11]:
# Coefficient of determination (R^2) of the predictions on the training data.
r2_train = metrics.r2_score(y_true=Y_train, y_pred=training_data_prediction)
print('R squared vale : ', r2_train)

R squared vale :  0.751505643411174


In [12]:
# Predicted charges for the held-out test rows.
test_data_prediction = regressor.predict(X=X_test)

In [13]:
# Coefficient of determination (R^2) of the predictions on the test data.
r2_test = metrics.r2_score(y_true=Y_test, y_pred=test_data_prediction)
print('R squared vale : ', r2_test)

R squared vale :  0.7447273869684077


Building a Predictive System

In [14]:
# One record to score, in the column order of the training features:
# age, sex, bmi, children, smoker, region (using the integer codes assigned
# in the encoding cell, e.g. sex 1 = female, smoker 1 = no, region 0 = southeast).
input_data = (21,1,25.74,0,1,0)

# changing input_data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array to a single row: (1, n_features)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The model was fitted on a DataFrame with named columns. Label the row with
# the same names so that predict() can match features by name instead of
# relying on bare positional order (a plain array makes scikit-learn warn
# that the feature names are missing).
input_frame = pd.DataFrame(input_data_reshaped, columns=regressor.feature_names_in_)

prediction = regressor.predict(input_frame)
print(prediction)

print('The insurance cost is USD ', prediction[0])

[1246.0293569]
The insurance cost is USD  1246.029356904317




In [15]:
# Serialize the fitted regressor to 'insurance.pkl' in the working directory.
with open('insurance.pkl', mode='wb') as model_file:
    pickle.dump(regressor, model_file)

Loading the model and checking

In [16]:
# Read the pickled model back from disk. The context manager closes the file
# handle as soon as loading finishes instead of leaving it open.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source (here: the file written by this notebook).
with open('insurance.pkl', 'rb') as model_file:
    testing_load = pickle.load(model_file)

In [17]:
# One record to score with the reloaded model, in the column order of the
# training features: age, sex, bmi, children, smoker, region (using the
# integer codes assigned in the encoding cell).
input_data = (31,1,25.74,0,1,0)

# changing input_data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array to a single row: (1, n_features)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# Label the row with the feature names stored on the loaded model itself, so
# this check does not depend on the training DataFrame and predict() can
# match features by name (a plain array makes scikit-learn warn that the
# feature names are missing).
input_frame = pd.DataFrame(input_data_reshaped, columns=testing_load.feature_names_in_)

prediction = testing_load.predict(input_frame)
print(prediction)

print('The insurance cost is USD ', prediction[0])

[3760.0805765]
The insurance cost is USD  3760.0805764960496


