In [1]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the insurance dataset from the CSV file into a DataFrame.
dts = pd.read_csv(filepath_or_buffer="insurance_pre.csv")

In [3]:
# Display the DataFrame as loaded from insurance_pre.csv.
dts

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
# List the column labels of the loaded DataFrame (names, not a count).
dts.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges'], dtype='object')

In [5]:
# One-hot encode the categorical columns so that every feature is numeric.
# dtype=int pins the indicator columns to 0/1 integers: pandas >= 2.0 defaults
# to bool and earlier versions to uint8, so without it the encoded values
# depend on the installed pandas version.
dts = pd.get_dummies(dts, dtype=int)

In [6]:
# Display the DataFrame after the categorical columns were one-hot encoded.
dts

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes
0,19,27.900,0,16884.92400,1,0,0,1
1,18,33.770,1,1725.55230,0,1,1,0
2,28,33.000,3,4449.46200,0,1,1,0
3,33,22.705,0,21984.47061,0,1,1,0
4,32,28.880,0,3866.85520,0,1,1,0
...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,0,1,1,0
1334,18,31.920,0,2205.98080,1,0,1,0
1335,18,36.850,0,1629.83350,1,0,1,0
1336,21,25.800,0,2007.94500,1,0,1,0


In [7]:
# Split inputs from output: pick the seven model input columns in a fixed order.
feature = dts.loc[
    :,
    [
        "age",
        "bmi",
        "children",
        "sex_female",
        "sex_male",
        "smoker_yes",
        "smoker_no",
    ],
]

In [8]:
# The value to predict: the "charges" column.
target = dts.loc[:, "charges"]

In [9]:
# Show the input columns and the target column together as a tuple.
feature,target

(      age     bmi  children  sex_female  sex_male  smoker_yes  smoker_no
 0      19  27.900         0           1         0           1          0
 1      18  33.770         1           0         1           0          1
 2      28  33.000         3           0         1           0          1
 3      33  22.705         0           0         1           0          1
 4      32  28.880         0           0         1           0          1
 ...   ...     ...       ...         ...       ...         ...        ...
 1333   50  30.970         3           0         1           0          1
 1334   18  31.920         0           1         0           0          1
 1335   18  36.850         0           1         0           0          1
 1336   21  25.800         0           1         0           0          1
 1337   61  29.070         0           1         0           1          0
 
 [1338 rows x 7 columns],
 0       16884.92400
 1        1725.55230
 2        4449.46200
 3       21984.47061


In [10]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    feature,
    target,
    test_size=0.33,
    random_state=0,
)

In [11]:
# Inspect the training and test partitions of the input columns.
X_train,X_test

(      age     bmi  children  sex_female  sex_male  smoker_yes  smoker_no
 1271   25  34.485         0           1         0           0          1
 1313   19  34.700         2           1         0           1          0
 2      28  33.000         3           0         1           0          1
 405    52  38.380         2           1         0           0          1
 482    18  31.350         0           1         0           0          1
 ...   ...     ...       ...         ...       ...         ...        ...
 763    27  26.030         0           0         1           0          1
 835    42  35.970         2           0         1           0          1
 1216   40  25.080         0           0         1           0          1
 559    19  35.530         0           0         1           0          1
 684    33  18.500         1           1         0           0          1
 
 [896 rows x 7 columns],
       age     bmi  children  sex_female  sex_male  smoker_yes  smoker_no
 578    52 

In [12]:
# Inspect the training and test partitions of the target column.
Y_train,Y_test

(1271     3021.80915
 1313    36397.57600
 2        4449.46200
 405     11396.90020
 482      1622.18850
            ...     
 763      3070.80870
 835      7160.33030
 1216     5415.66120
 559      1646.42970
 684      4766.02200
 Name: charges, Length: 896, dtype: float64,
 578      9724.53000
 610      8547.69130
 569     45702.02235
 1034    12950.07120
 198      9644.25250
            ...     
 117     19107.77960
 520     25656.57526
 422     39125.33225
 294      3906.12700
 261     17085.26760
 Name: charges, Length: 442, dtype: float64)

In [13]:
#Creating & Training the model.
from sklearn.ensemble import RandomForestRegressor

# Bootstrap sampling and per-split feature sub-sampling are random, so the
# forest is seeded; without random_state every run grows different trees and
# reports different predictions and scores.
model_create = RandomForestRegressor(
    criterion="absolute_error",
    n_estimators=1000,
    max_features="log2",
    random_state=0,
)
model_create = model_create.fit(X_train, Y_train)

In [14]:
# Predict charges for the held-out test inputs and keep the result.
Y_pred = model_create.predict(X=X_test)

In [15]:
# Display the predictions made for the test partition.
Y_pred

array([10651.06353829,  9602.23371645, 43580.31640328, 14250.47863682,
        9619.41429692, 11671.53840348,  2871.45251122, 11589.53549408,
        7717.13776482,  5776.32776905,  6072.78030014, 11169.76607089,
        8195.48756785,  5512.69482153, 23992.64780476, 11155.59745919,
       13260.8521761 ,  4250.68034851,  7165.31636653, 32032.49179209,
       25582.0140648 , 14992.94780729, 11001.65604691, 26836.71857108,
        2457.77008118,  6906.76403091,  4036.81939741,  8541.99235534,
        3885.56839195, 12107.26392156,  8669.63942593, 44846.64256682,
       14422.07340675, 13550.45961035, 18488.90868768,  4368.03243946,
        8767.91845464, 37074.17046738, 38384.68551569,  2438.34300793,
        5440.4825687 ,  4472.40303175, 22113.4213855 , 45492.00069626,
       37292.26262075,  3869.86843416, 11281.86259898,  6490.76464606,
        6299.91703483, 13151.85685226,  4628.72967259,  5340.31547075,
       27392.52645331, 43952.40991155, 11697.78467323, 10882.43184138,
      

In [16]:
# Evaluate the predictions against the true test targets using the R² score.
from sklearn.metrics import r2_score

rscore = r2_score(y_true=Y_test, y_pred=Y_pred)

In [17]:
# Display the R² score computed on the test partition.
rscore

0.8736764438863796

In [18]:
#Saving model phase
import pickle

filename = "bestmodel_RF.sav"
# The context manager flushes and closes the file even if pickling fails; a
# bare open() inside the call leaves the handle open until garbage collection.
with open(filename, "wb") as model_file:
    pickle.dump(model_create, model_file)

In [24]:
#Loading the model for testing purpose
#Giving Static input
# NOTE: pickle.load can execute arbitrary code; only load model files you
# created yourself. The context manager makes sure the file is closed.
with open("bestmodel_RF.sav", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# The model was fitted on a DataFrame, so the sample is passed with the same
# column labels in the same order. A bare list of numbers makes scikit-learn
# warn about missing feature names and hides which value is which feature.
static_sample = pd.DataFrame(
    [[33, 18.500, 1, 1, 0, 0, 1]],
    columns=[
        "age",
        "bmi",
        "children",
        "sex_female",
        "sex_male",
        "smoker_yes",
        "smoker_no",
    ],
)
# Despite its name, static_input holds the prediction for the sample above.
static_input = loaded_model.predict(static_sample)



In [25]:
# Actual output 4766.02200, Predicted output 5115.8112
# NOTE(review): the static sample matches row 684 of the training partition
# (see X_train / Y_train), so this compares against a row the model was
# fitted on rather than unseen data.
static_input

array([5115.81128384])

In [26]:
#Giving Dynamic input
def _read_flag(prompt):
    """Prompt for a one-hot indicator and return it as 0 or 1.

    Raises:
        ValueError: if the entry is not an integer, or is not 0 or 1.
    """
    value = int(input(prompt))
    if value not in (0, 1):
        raise ValueError(f"expected 0 or 1, got {value}")
    return value


dynm_age = int(input("Enter the Age:"))
dynm_bmi = float(input("Enter the BMI:"))
dynm_child = int(input("Enter no of Children:"))
dynm_s_f = _read_flag("Enter the s/f 0 or 1:")
dynm_s_m = _read_flag("Enter the s/m 0 or 1:")
dynm_sm_y = _read_flag("Enter the sm/y 0 or 1:")
dynm_sm_n = _read_flag("Enter the sm/n 0 or 1:")

# Each pair of flags is a one-hot encoding, so exactly one flag per pair must
# be set; any other combination is contradictory and would yield a
# meaningless prediction without any error.
if dynm_s_f + dynm_s_m != 1:
    raise ValueError("exactly one of s/f and s/m must be 1")
if dynm_sm_y + dynm_sm_n != 1:
    raise ValueError("exactly one of sm/y and sm/n must be 1")

Enter the Age:33
Enter the BMI:18.500
Enter no of Children:1
Enter the s/f 0 or 1:1
Enter the s/m 0 or 1:0
Enter the sm/y 0 or 1:0
Enter the sm/n 0 or 1:1


In [27]:
# Pass the entered values as a one-row DataFrame whose columns carry the same
# labels, in the same order, as the features the model was fitted on. A bare
# list makes scikit-learn warn about missing feature names and relies on the
# reader remembering the positional order.
dynamic_sample = pd.DataFrame(
    [[dynm_age, dynm_bmi, dynm_child, dynm_s_f, dynm_s_m, dynm_sm_y, dynm_sm_n]],
    columns=[
        "age",
        "bmi",
        "children",
        "sex_female",
        "sex_male",
        "smoker_yes",
        "smoker_no",
    ],
)
# Despite its name, dynamic_input holds the prediction for the entered sample.
dynamic_input = loaded_model.predict(dynamic_sample)



In [28]:
# Actual output 4766.02200, Predicted output 5115.8112
# NOTE(review): the values entered in the recorded run match row 684 of the
# training partition (see X_train / Y_train), so this is not an out-of-sample check.
dynamic_input

array([5115.81128384])