# **Medical Cost Prediction**


Project that uses different ML to evaluate the insurance cost using regressions. The projects covers:


In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

ModuleNotFoundError: No module named 'numpy'

# Loading dataset

These are the columns in the data set
* age: age of primary beneficiary
* sex: insurance contractor gender, female, male
* bmi: Body mass index, providing an understanding of body, weights that are relatively high or low relative to height,objective index of body weight (kg / m ^ 2) using the ratio of height to weight, ideally 18.5 to 24.9
* children: Number of children covered by health insurance / Number of dependents
* smoker: Smoking
* region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.
* charges: Individual medical costs billed by health insurance

In [None]:
dataset = pd.read_csv('../input/insurance/insurance.csv')
df = dataset.copy()

# Check for the first 5 data 

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Looking into type of data and if it is nullable

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
# Checking data missing values

df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [None]:
# Look at the data shape (rows and columns)

df.shape

(1338, 7)

In [None]:
# Outputing statstical summary of numeric data (age,bmi,children,charges)

df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


# Preprocessing

In [None]:
# Looking into the data of the categorical features

df['sex'].unique()
# data['region']=data['region'].map({'southwest':1,'southeast':2,
#                    'northwest':3,'northeast':4})

array(['female', 'male'], dtype=object)

In [None]:
df['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [None]:
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

# **Ordinal Encoding**
Since (smoker,sex) only have 2 values each we manually map them to 0 and 1 

In [None]:
df['sex']= df['sex'].map({'female':0,'male':1})
df['smoker']= df['smoker'].map({'yes':1,'no':0})

# **One Hot Encoding**

For regions we creates new columns indicating the presence (or absence) of each possible value in the original regions.

In [None]:
# Transforming categorical features (sex,smokers,regions) into numeric
# Using One-Hot Encoding

one_hot_region = pd.get_dummies(df['region'])
one_hot_region.head()

Unnamed: 0,northeast,northwest,southeast,southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0


In [None]:
# Add in the new One-Hot Encoding
df = pd.concat([df, one_hot_region], axis = 1)

# Drop the original region column and replace them with the One-Hot Encoding
df.drop('region',axis = 1, inplace = True)

In [None]:
# Check to see if the data is all numerical
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,northeast,northwest,southeast,southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


# Spliting Data # 
Remove our target away from our features

* X will be our feature prediction
* y wil be our target

In [None]:
X = df.drop('charges',axis = 1)
y = df['charges']

# Train Test Spliting

In [None]:
# Importing the train_test_split package
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.2,
                                                    random_state = 42)

In [None]:
# importing different models

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV

In [None]:
# define a function to evaluate the models

def score_eval(scores):
    scores = np.sqrt(-scores)
    print(f"""
          RMSE Scores: {scores}
          Mean: {scores.mean()}
          Standard Deviation : {scores.std()}
          """)

In [None]:
linreg_scores = cross_val_score (LinearRegression(), X_train, y_train,
                                 scoring = 'neg_mean_squared_error', cv=10)

In [None]:
# Display all scores 
score_eval(linreg_scores)


          RMSE Scores: [6073.99997254 6580.87166754 5233.82683473 6045.25239632 5836.4647871
 6127.43525895 7281.08373608 6321.52706355 6193.3145481  5637.06299714]
          Mean: 6133.0839262055
          Standard Deviation : 520.7183652979136
          


In [None]:
ranForest_scores = cross_val_score (RandomForestRegressor(), X_train, y_train,
                                 scoring = 'neg_mean_squared_error', cv=10)

In [None]:
# Display all scores 
score_eval(ranForest_scores)


          RMSE Scores: [4787.92538437 5264.23247723 3921.91142335 4391.38307997 5257.07845114
 4944.21314058 5593.84539368 5389.06060756 5205.71593215 4679.09173734]
          Mean: 4943.44576273742
          Standard Deviation : 483.117307804376
          


In [None]:
svm_scores = cross_val_score (SVR(), X_train, y_train,
                                 scoring = 'neg_mean_squared_error', cv=10)

In [None]:
# Display all scores 
score_eval(svm_scores)


          RMSE Scores: [12545.69050299 12492.39078172 12837.35996149 14521.35165917
 13015.92317698 10432.22441546 12341.21621862 12172.2310244
 12952.29052569 12270.57033622]
          Mean: 12558.124860275553
          Standard Deviation : 953.9264581446631
          


In [None]:
decisionForest = cross_val_score (DecisionTreeRegressor(), X_train, y_train,
                                 scoring = 'neg_mean_squared_error', cv=10)

In [None]:
# Display all scores 
score_eval(decisionForest)


          RMSE Scores: [5822.60326009 6661.92693551 7062.01558147 6823.11996235 7232.64282777
 6159.43791389 7616.23917093 6668.51086082 6580.39410245 5840.8507411 ]
          Mean: 6646.774135637717
          Standard Deviation : 552.992136685402
          


In [None]:
gradientBoost = cross_val_score (GradientBoostingRegressor(), X_train, y_train,
                                 scoring = 'neg_mean_squared_error', cv=10)

In [None]:
# Display all scores 
score_eval(gradientBoost)


          RMSE Scores: [4603.82502154 4938.60773456 3367.58152847 3840.41278537 4761.83970827
 4672.67492152 5570.99211731 5022.74831213 4772.34996475 4716.36508299]
          Mean: 4626.7397176914965
          Standard Deviation : 583.061629233928
          


With RandomForestRegession has the lowest scores, we will continue using the model

# **Parameter Tuning**
Tuning parameters to find the best parameters for the model

In [None]:
params = {'n_estimators': [3, 5, 10, 20, 50, 70, 100],
          'n_jobs': [2, 3, 5, 8, 10, 12, 15]}

In [None]:
grid_s = GridSearchCV(RandomForestRegressor(), params, cv = 5,
                      scoring = 'neg_mean_squared_error')

In [None]:
grid_s.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'n_estimators': [3, 5, 10, 20, 50, 70, 100],
                         'n_jobs': [2, 3, 5, 8, 10, 12, 15]},
             scoring='neg_mean_squared_error')

In [None]:
grid_s.best_params_

{'n_estimators': 70, 'n_jobs': 8}

In [None]:
# Printing all RMSE and the parameters

for mean_score,params in zip((grid_s.cv_results_['mean_test_score']),
                            (grid_s.cv_results_['params'])):
    print(np.sqrt(-mean_score),'    ',params)

5537.389608293812      {'n_estimators': 3, 'n_jobs': 2}
5696.046817386897      {'n_estimators': 3, 'n_jobs': 3}
5674.766959766677      {'n_estimators': 3, 'n_jobs': 5}
5790.276563259737      {'n_estimators': 3, 'n_jobs': 8}
5659.182678149859      {'n_estimators': 3, 'n_jobs': 10}
5488.812745008263      {'n_estimators': 3, 'n_jobs': 12}
5359.5162957384355      {'n_estimators': 3, 'n_jobs': 15}
5305.08465056891      {'n_estimators': 5, 'n_jobs': 2}
5256.238242548056      {'n_estimators': 5, 'n_jobs': 3}
5400.330336645441      {'n_estimators': 5, 'n_jobs': 5}
5284.634037599105      {'n_estimators': 5, 'n_jobs': 8}
5383.132859270321      {'n_estimators': 5, 'n_jobs': 10}
5254.714079144711      {'n_estimators': 5, 'n_jobs': 12}
5388.745967522668      {'n_estimators': 5, 'n_jobs': 15}
5012.902172873897      {'n_estimators': 10, 'n_jobs': 2}
5140.820935710741      {'n_estimators': 10, 'n_jobs': 3}
5108.858555437097      {'n_estimators': 10, 'n_jobs': 5}
5121.0179904498345      {'n_estimators'

# **Predicting**
Predicting the target after we preprocessed the data and choose an appropriate model :)

In [None]:
predictions = grid_s.best_estimator_.predict(X_test)

In [None]:
y_test[0:10].values

array([ 9095.06825,  5272.1758 , 29330.98315,  9301.89355, 33750.2918 ,
        4536.259  ,  2117.33885, 14210.53595,  3732.6251 , 10264.4421 ])

In [None]:
comparison = pd.DataFrame({'Y Test': y_test[0:10].values,
                           'Predictions': predictions[0:10]})
comparison

Unnamed: 0,Y Test,Predictions
0,9095.06825,10162.978032
1,5272.1758,5643.367122
2,29330.98315,28047.231102
3,9301.89355,11384.651592
4,33750.2918,34606.424351
5,4536.259,8803.980034
6,2117.33885,2084.803017
7,14210.53595,14242.508763
8,3732.6251,6094.430697
9,10264.4421,11275.376706


Some are still off compared to the actual value. Now we evaluate our model

# **Evaluation**

We use different evaluation methods such as:
- R-squared
- Mean Squared Error (MSE)
- Mean Absolute Error (MAE)

In [None]:
# Importing the errors

from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [None]:
# Building a regresssion evaluations

def regression_eval(preds):
    mse = mean_squared_error(y_test,preds)
    rmse = np.sqrt(mse)
    r_squared = r2_score(y_test,preds)
    mae = mean_absolute_error(y_test,preds)
    
    print(f"""Mean Absolute Error: {mae}
            Mean Squared Error: {mse}
            Root Mean Squared Error: {rmse}
            Squared Value: {r_squared}""")

In [None]:
regression_eval(predictions)

Mean Absolute Error: 2552.9342673818414
            Mean Squared Error: 21742433.0444859
            Root Mean Squared Error: 4662.87819318561
            Squared Value: 0.8599509777918024


Adding a clone data 

In [None]:
data = {'age' : 40,
        'sex' : 1,
        'bmi' : 40.30,
        'children' : 4,
        'smoker' : 1,
        'northeast' : 0,
        'northwest' : 1,
        'southeast' : 0,
        'southwest' : 0}

In [None]:
df = pd.DataFrame(data,index=[0])
df

Unnamed: 0,age,sex,bmi,children,smoker,northeast,northwest,southeast,southwest
0,40,1,40.3,4,1,0,1,0,0


In [None]:
rf = RandomForestRegressor()
rf.fit(X,y)

RandomForestRegressor()

In [None]:
new_pred = rf.predict(df)
print("Medical Insurance cost for New Customer is : ",new_pred[0])

Medical Insurance cost for New Customer is :  44101.250279


In [None]:
import joblib

In [None]:
joblib.dump(rf,'model_joblib_rf')

In [None]:
model = joblib.load('model_joblib_rf')

In [None]:
model.predict(df)

# **GUI**

Implementing a GUI so that users can use

In [None]:
from tkinter import *
import joblib

In [None]:
def show_entry():
    p1 = float(e1,get())
    p2 = float(e2,get())
    p3 = float(e3,get())
    p4 = float(e4,get())
    p5 = float(e5,get())
    p6 = float(e6,get())

    model = joblib.load('model_joblib_rf')
    result = model.predict([[p1,p2,p3,p4,p5,p6]])

    Label(master, text = "Insurance Cost").grid(row=7)
    Label(master, test=result).grid

In [None]:
master = Tk()
master.title("Insurance Cost Prediction")
label = Label(master,text = "Insurance Cost Prediction",
              bg = "black",
              fg = "white").grid(row = 0 , columnspan = 2)

Label(master,text = "Enter Your Age").grid(row=1)
Label(master,text = "Enter Your Sex (Male/Female) [1/0]").grid(row=2)
Label(master,text = "Enter Your BMI Score").grid(row=3)
Label(master,text = "Enter Your Number of Children").grid(row=4)
Label(master,text = "Do you smoke? Yes/No [1/0]").grid(row=5)
Label(master,text = "Do you live in Northeast? [1/0]").grid(row=6)
Label(master,text = "Do you live in Northest? [1/0]").grid(row=7)
Label(master,text = "Do you live in Southeast? [1/0]").grid(row=8)
Label(master,text = "Do you live in Southwest? [1/0]").grid(row=9)

e1 = Entry(master)
e2 = Entry(master)
e3 = Entry(master)
e4 = Entry(master)
e5 = Entry(master)
e6 = Entry(master)
e7 = Entry(master)
e8 = Entry(master)
e9 = Entry(master)


e1.grid(row=1,column=1)
e2.grid(row=2,column=1)
e3.grid(row=3,column=1)
e4.grid(row=4,column=1)
e5.grid(row=5,column=1)
e6.grid(row=6,column=1)
e7.grid(row=7,column=1)
e8.grid(row=8,column=1)
e9.grid(row=9,column=1)

Button(master,text="Predict", command=show_entry.grid())



mainloop()

ModuleNotFoundError: No module named 'joblib'