# 📚Imports

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# 🧾Data Loading

In [2]:
# Load the insurance dataset from insurance.csv in the current working directory.
insurance=pd.read_csv("insurance.csv")

In [3]:
# Preview the first five rows to check the columns and value formats.
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# 🔍Exploratory Data Analysis

## 🧽Data Cleaning

In [4]:
# Dataset dimensions as a (rows, columns) tuple.

insurance.shape

(1338, 7)

In [5]:
# Count the missing values in each column (all zeros means nothing to impute).

insurance.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
# Count rows that are exact repeats of an earlier row.

insurance.duplicated().sum()

1

In [7]:
# Remove repeated rows, keeping the first occurrence of each.
# The original row labels are kept (the index is not reset).

insurance=insurance.drop_duplicates(keep='first')

In [8]:
# Re-count duplicates; this should now be 0.

insurance.duplicated().sum()

0

In [9]:
# Summary of column dtypes, non-null counts and memory usage.

insurance.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


### The inference drawn from the summarized information indicates that we have both numerical and categorical columns in our dataset and also, no column has null values

In [10]:
# List the column names; this order is also the feature order used for modelling later.

insurance.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

### Columns Explanation

#### Age: tells of the age of a particular individual
#### Sex: tells if an individual is male or female
#### BMI: shows the body mass index of an individual. People with a BMI below 18.5 are considered underweight, those within the range of 18.5 to 24.9 are considered normal weight, those from 25 to 29.9 are considered overweight, and those at 30 and above are considered obese
#### Children: the number of children included in the individual's insurance scheme
#### Smoker: indicates if the individual smokes
#### Region: indicate which of the four region is the person located in.
#### Charges(in US dollars): used to indicate the estimated price paid by an individual.

## Statistical information

In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numerical columns.

insurance.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663452,1.095737,13279.121487
std,14.044333,6.100468,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


# 📊Visualization

## Univariate and Multivariate Analysis

In [None]:
# Pairwise scatter plots and per-column distributions of the numerical columns.
# pairplot is a seaborn function, not a DataFrame method.
sns.pairplot(insurance)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Kernel density estimate of the age column.
# displot is figure-level and creates its own figure, so the size is set through
# height/aspect (8in tall, 10in wide). A preceding plt.figure() would only leave
# an empty figure behind.
sns.set()
sns.displot(x=insurance['age'],kind='kde',height=8,aspect=10/8)
plt.title('Age Distribution')
plt.show()

##### The univariate analysis of the age distribution for our dataset reveals a clear peak at approximately age 22, which means that this is the most common age among individuals in our dataset. The second most common age appears to be close to 49

In [None]:
# Histogram of age, coloured by smoking status.
# displot is figure-level and creates its own figure, so the size is set through
# height/aspect (8in tall, 10in wide) rather than a separate plt.figure().
sns.set()
sns.displot(x=insurance['age'],hue=insurance['smoker'],height=8,aspect=10/8)
plt.title('Ages by smoking status')
plt.show()

##### The analysis of the age distribution by smoking status shows that many younger adults in their twenties are smokers.

In [None]:
# Number of rows per value of the sex column.

insurance['sex'].value_counts()

In [None]:
# Count of each sex, split into smokers and non-smokers.
sns.set()
fig,ax=plt.subplots(figsize=(8,6))
sns.countplot(x=insurance['sex'],hue=insurance['smoker'],ax=ax)
ax.set_title('Sex by smoker status')
plt.show()

##### This analysis of the sex column indicates that the numbers of females and males in our dataset differ only slightly, but there are more male smokers than female smokers, i.e. a female is less likely to be a smoker than a male.

In [None]:
# Kernel density estimate of the bmi column.
# displot is figure-level and creates its own figure, so the size is set through
# height/aspect (6in tall, 8in wide) rather than a separate plt.figure().
sns.set()
sns.displot(x=insurance['bmi'],kind='kde',height=6,aspect=8/6)
plt.title('BMI Distribution')
plt.show()

##### From the plot above, we can infer that the BMI of individuals in the dataset is approximately normally distributed, i.e. the mean, median and mode are all centered. Also, the most common BMI in the dataset is around 30. This shows that most people are overweight and close to being obese.

In [None]:
# Histogram of the number of children per individual.
# displot is figure-level and creates its own figure, so the size is set through
# height/aspect (6in tall, 8in wide) rather than a separate plt.figure().
sns.set()
sns.displot(x=insurance['children'],height=6,aspect=8/6)
plt.title('Children Distribution')
plt.show()

##### This univariate analysis shows that there are many more people with no children than people with children, and as such the plot is right-skewed

In [None]:
# Number of smokers and non-smokers.

insurance['smoker'].value_counts()

In [None]:
# Count of smokers versus non-smokers.
# displot is figure-level and creates its own figure, so the size is set through
# height/aspect (6in tall, 8in wide) rather than a separate plt.figure().
sns.set()
sns.displot(x=insurance['smoker'],height=6,aspect=8/6)
plt.title('Smokers Distribution')
plt.show()

##### The inference drawn from the plot above shows that we have very few smokers compared to non-smokers which is a good thing for the health of an individual

In [None]:
# Keep only the rows whose smoker flag is 'yes'.
smokers=insurance.loc[insurance['smoker']=='yes']

In [None]:
# Display the smokers subset.
smokers

In [None]:
# Number of smokers for each value of the children column.
smokers['children'].value_counts()

In [None]:
# Number of smokers in each region.
smokers['region'].value_counts()

In [None]:
# Bar chart of the number of smokers in each region.
sns.set()
plt.figure(figsize=(8,6))
# Count once and pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
region_counts=smokers['region'].value_counts()
sns.barplot(x=region_counts.keys(),y=region_counts)
plt.title('Smokers by region')
plt.show()

##### From our plot, there are more smokers in the Southeast than in any other region

#### So far, it is evident enough that we have both numerical and categorical columns. The categorical columns will not be accepted into the model if not converted to numbers. Thus, we can achieve that by encoding each categorical column.

#### The sex, region and smoker can be encoded

In [None]:
# Integer-encode the three categorical columns so the models can consume them.
# The encoded columns are assigned back to the frame instead of calling
# replace(..., inplace=True) on insurance[col]: under pandas Copy-on-Write that
# call acts on a temporary and leaves the frame unencoded. astype('int64') pins
# the dtype and keeps the cell safe to re-run.
# NOTE(review): region is nominal; these integer codes impose an arbitrary order
# that linear and distance-based models will treat as meaningful. Consider one-hot encoding.
insurance=insurance.assign(
    #Label encoding of the sex column
    sex=insurance['sex'].replace({"female":1,"male":0}).astype('int64'),
    #label encoding of the smokers column
    smoker=insurance['smoker'].replace({"yes":1,"no":0}).astype('int64'),
    #label encoding of the region column
    region=insurance['region'].replace({"southeast":0,"southwest":1,"northeast":2,"northwest":3}).astype('int64'),
)

In [None]:
# Preview the frame again to confirm that sex, smoker and region are now numeric codes.
insurance.head()

In [None]:
# Annotated heatmap of the pairwise correlations between all (now numeric) columns.

plt.figure(figsize=(10,10))
sns.heatmap(insurance.corr(),annot=True)
plt.title('Correlation Matrix')
# Render the figure explicitly, as the other plotting cells do, so the Text(...) repr is not echoed.
plt.show()

##### The correlation analysis shows that the number of children included in the scheme and the smoking status of an individual are positively correlated with the charges incurred for medical insurance. Do note that correlation, be it positive or negative, doesn't signify causation, but it can show us which features could have an impact on our model

# 🔧Feature Engineering

In [None]:
# Feature matrix: every column except the target.
# The target is dropped by name rather than by position, so a change in column
# order cannot silently move 'charges' into the features.

X=insurance.drop(columns='charges')

In [None]:
# Preview the feature matrix with rich display rather than printing the whole frame as text.
X.head()

In [None]:
# (rows, feature columns) of the feature matrix.
print(X.shape)

In [None]:
# Target vector: the insurance charges, selected by name rather than by column position.
y=insurance['charges']

In [None]:
# Preview the target values rather than printing the whole series as text.
y.head()

In [None]:
# Number of target values; should equal the number of rows in X.
print(y.shape)

## Split the dataset into training and test data

In [None]:
#import the train_test_split function

from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; random_state=1 makes the split reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

In [None]:
# Shapes of the full, training and test feature matrices.
print(X.shape,X_train.shape,X_test.shape)

In [None]:
# Shapes of the full, training and test target vectors.
print(y.shape,y_train.shape,y_test.shape)

# 🤖Machine Learning

## 🏋️‍Building our model

In [None]:
#fitting the models
# Five regressors are fitted on the same training split so their test scores
# can be compared. The tree-based models are seeded: without a random_state
# their fitted trees, and therefore the reported metrics, change on every run.

#import the linear regression model
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(X_train,y_train)

#import the decision tree regressor
from sklearn.tree import DecisionTreeRegressor
dtr=DecisionTreeRegressor(random_state=2)
dtr.fit(X_train,y_train)

#import the random forest regressor
from sklearn.ensemble import RandomForestRegressor
rfr=RandomForestRegressor(random_state=2)
rfr.fit(X_train,y_train)

#import the support vector regressor
# NOTE(review): SVR and k-NN are distance/kernel based and the features are not
# scaled here, which may depress their scores relative to the other models.
from sklearn.svm import SVR
svr=SVR()
svr.fit(X_train,y_train)

#import the Knearest neighbors regressor
from sklearn.neighbors import KNeighborsRegressor
knr=KNeighborsRegressor()
knr.fit(X_train,y_train)

## Making predictions with the trained models

In [None]:
# Test-set predictions from each fitted model, in the order:
# linear regression, decision tree, random forest, SVR, k-nearest neighbours.
pred1,pred2,pred3,pred4,pred5=(
    fitted.predict(X_test) for fitted in (lr,dtr,rfr,svr,knr)
)

## Evaluation metrics of model's performance

### Using the R-Squared metric

In [None]:
from sklearn.metrics import r2_score

# Test-set R² for each model's predictions, rounded to three decimals
# (r1..r5 follow the same order as pred1..pred5).
r1,r2,r3,r4,r5=(
    round(r2_score(y_test,predicted),3)
    for predicted in (pred1,pred2,pred3,pred4,pred5)
)

In [None]:
# Report the test-set R² of each model.
# r1 belongs to the LinearRegression model; the label previously said "logistic regression".
print(f"The R-square value of linear regression is: {r1}")

print(f"The R-square value of decision tree regressor is: {r2}")

print(f"The R-square value of random forest regressor is: {r3}")

print(f"The R-square value of support vector regressor is: {r4}")

print(f"The R-square value of k-nearest neighbors is: {r5}")

## 📈Visualization of the metrics

In [None]:
# Collect the model labels and their test-set R² scores for tabulation and plotting.
model_data=dict(
    model=['LR','DTR','RF','SVR','KNN'],
    r_square=[r1,r2,r3,r4,r5],
)

In [None]:
# Show the scores as a table, one row per model.
pd.DataFrame(model_data)

In [None]:
# Bar chart comparing the test-set R² of the five models.
plt.figure(figsize=(8,6))
sns.barplot(x=model_data['model'],y=model_data['r_square'])
plt.title('R-Square Metrics')
# Render the figure explicitly, as the other plotting cells do, so the Text(...) repr is not echoed.
plt.show()

##### Visualization of the metrics shows that the Random Forest model has a high R-squared value and hence can be used to build a predictive model. Before we conclude on this, let's check the R-squared of the training dataset and compare it with that of the test dataset

In [None]:
# Predict on the training rows with the random forest (rfr) to measure its fit on data it has already seen.

train_data_prediction=rfr.predict(X_train)

In [None]:
# R² of the random forest on the training set, rounded to three decimals.

r2_train_data=round(r2_score(y_train,train_data_prediction),3)

In [None]:
# Display the training-set R².
r2_train_data

In [None]:
# Train-versus-test R² gap for the random forest.
# r2_train_data was computed from rfr's training predictions, so it must be
# compared with r3 (rfr's test score). r4 is the SVR's test score.
difference=r2_train_data - r3

print(format(difference,'.3f'))

##### The difference between the training and test R-squared values is quite small, and hence we can conclude that there is no overfitting. We can go ahead and make use of the model.

In [None]:
# Fit the final random forest. n_estimators=1000 sets the number of trees in
# the ensemble (it is not an iteration count, and no hyperparameter search is
# performed here). random_state=2 fixes the seed so the fit is reproducible.

regressor=RandomForestRegressor(random_state=2,n_estimators=1000)
regressor.fit(X_train,y_train)

## Building a predictive system

In [None]:
# One sample in training-column order: age, sex, bmi, children, smoker, region.
# Encoded values: sex 0=male, smoker 1=yes, region 0=southeast.
input_data=[27,0,42.13,0,1,0]

input_data_to_array=np.asarray(input_data)

# Wrap the sample in a one-row DataFrame carrying the training column names.
# The model was fitted on a DataFrame, so scikit-learn can then validate the
# feature names instead of trusting the positional order of a bare array.
input_data_reshaped=pd.DataFrame(input_data_to_array.reshape(1,-1),columns=X.columns)

In [None]:
# Predict the charge for the single prepared sample; the result is a length-1 array.
prediction=regressor.predict(input_data_reshaped)

In [None]:
# Report the predicted charge in dollars, to two decimal places.
print(f"The insurance cost is ${prediction[0]:.2f}")

## ✔Saving the model

In [None]:
import joblib

# Serialise the fitted regressor to 'regressor.joblib' in the current working directory.
joblib.dump(regressor,'regressor.joblib')

print("Model has been saved")

In [None]:
# Reload the saved model and repeat the prediction as a round-trip check.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
model=joblib.load('regressor.joblib')

# Sample in training-column order: age, sex (0=male), bmi, children, smoker (1=yes), region (0=southeast).
input_data_to_array=np.asarray([27,0,42.13,0,1,0])

# reshape(1,-1) turns the flat vector into a single-row 2-D array before it is passed to predict.
input_data_reshaped=input_data_to_array.reshape(1,-1)
model.predict(input_data_reshaped)