In [1]:
# pandas handles the tabular data; pyplot is imported for plotting.
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Silences every warning for the rest of the session, including library
# deprecation and data-conversion warnings.
warnings.filterwarnings('ignore')

In [3]:
# Load the insurance dataset from its CSV file into the `data` DataFrame
csv_path = "insurance_pre.csv"
data = pd.read_csv(csv_path)

In [9]:
# Preview the first five rows (head() returns 5 rows by default)
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


* **Identifying the problem statement:**
  - Stage 1:
    * The input data is numeric, so the domain is Machine Learning.
  - Stage 2:
    * Supervised Learning - the requirement is clear and we have both input and output data.
  - Stage 3:
    * The target feature is a numerical value, so we can go with Regression.

* **Basic info about the dataset:**
  * Total rows: 1338
  * Total columns: 6
  * Dataset contains both numerical and categorical values
  * Column names: AGE, SEX, BMI, CHILDREN, SMOKER, CHARGES (target variable)
  * age and children are integer type.
  * bmi and charges are float type.
  * sex and smoker are object type.
  * There are no null values in the dataset.
  * Requirement is clear.

In [7]:
# (rows, columns) of the raw dataset
data.shape

(1338, 6)

In [24]:
# Data type of each column
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
charges     float64
dtype: object

In [32]:
# Youngest and oldest age in the dataset, printed one value per line
age_column = data['age']
print(age_column.min(), age_column.max(), sep="\n")

18
64


In [34]:
# Number of records per category in the `sex` column
data['sex'].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [52]:
# Number of records per category in the `smoker` column
data['smoker'].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

### **Pre-Processing Method**

In [78]:
# One-hot encode the categorical columns as 0/1 integers. drop_first=True
# drops the first level of each category, leaving a single indicator
# column per two-level feature (sex_male, smoker_yes).
df = pd.get_dummies(
    data,
    drop_first=True,
    dtype=int,
)

In [80]:
# Preview the first rows of the encoded frame
df.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.9,0,16884.924,0,1
1,18,33.77,1,1725.5523,1,0
2,28,33.0,3,4449.462,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.88,0,3866.8552,1,0


**Splitting the independent variables and the dependent variable**

In [91]:
# Column names after encoding, used below to pick features and target
df.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [93]:
# Separate the model inputs from the target. Both selections use a list of
# column names, so `dependent` stays a single-column DataFrame.
feature_columns = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes']
target_columns = ['charges']

independent = df[feature_columns]
dependent = df[target_columns]

In [99]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=32,
)

In [113]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [125]:
# Inspect the scaled training features
X_train

array([[-1.45539989,  0.61957641, -0.91671781,  0.98304962, -0.51465024],
       [-1.52772942,  0.08852876, -0.91671781, -1.01724265, -0.51465024],
       [ 1.7994288 ,  0.87454423, -0.91671781, -1.01724265, -0.51465024],
       ...,
       [ 1.07613353, -1.06369847, -0.91671781,  0.98304962, -0.51465024],
       [-1.52772942,  0.21682468, -0.91671781, -1.01724265, -0.51465024],
       [-0.73210463, -1.43397022, -0.09561937, -1.01724265,  1.94306722]])

## **Model 1 - MLR**

In [115]:
from sklearn.linear_model import LinearRegression

# Fit a multiple linear regression on the scaled training data; fit()
# returns the estimator itself, which is then displayed as the cell output.
regressor = LinearRegression().fit(X_train, y_train)
regressor

In [117]:
# Learned coefficients (weights) of the fitted linear model
wt = regressor.coef_
wt

array([[3543.52178492, 2114.05811025,  606.22568166,  -73.92937558,
        9590.5194808 ]])

In [119]:
# Learned intercept (bias term) of the fitted linear model
bias = regressor.intercept_
bias

array([13362.05764083])

In [121]:
# Predict the target for the held-out test features with the linear model
y_pred = regressor.predict(X_test)

In [123]:
from sklearn.metrics import r2_score

# R² of the linear model's predictions against the test-split targets
r2score = r2_score(y_true=y_test, y_pred=y_pred)
r2score

0.7723923235599369

## **Model 2 - SVR**

In [157]:
from sklearn.svm import SVR

# Settings to compare: regularisation strength C and kernel type
hp = [10,100,500,1000,2000,3000]
ker = ['linear','rbf','poly','sigmoid']

# SVR expects a 1-D target. y_train is a single-column DataFrame, so flatten
# it once here instead of relying on scikit-learn's implicit conversion,
# whose warning is hidden by the global warnings filter.
svr_target = y_train.to_numpy().ravel()

# Each entry is (kernel, C, R² on the test split).
# NOTE(review): the configurations are compared on the test split itself, so
# the best score is an optimistic estimate; consider a validation split or
# cross-validation for model selection.
svr_r2score = []
for kernel_name in ker:
    for c_value in hp:
        svr_model = SVR(kernel=kernel_name, C=c_value)
        svr_model.fit(X_train, svr_target)
        y_pred = svr_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        svr_r2score.append((kernel_name, c_value, r2))

In [183]:
# Report the test-split R² for every (kernel, C) combination that was tried
for kernel_name, c_value, r2_value in svr_r2score:
    print("Kernel:", kernel_name, "- C:", c_value, "- R² Score", r2_value)

Kernel: linear - C: 10 - R² Score 0.4411154085859137
Kernel: linear - C: 100 - R² Score 0.6205250918367728
Kernel: linear - C: 500 - R² Score 0.7486844603269274
Kernel: linear - C: 1000 - R² Score 0.7187303718390283
Kernel: linear - C: 2000 - R² Score 0.717185774596472
Kernel: linear - C: 3000 - R² Score 0.7161570985448058
Kernel: rbf - C: 10 - R² Score -0.044010385361080706
Kernel: rbf - C: 100 - R² Score 0.3229298217686596
Kernel: rbf - C: 500 - R² Score 0.6597447201916936
Kernel: rbf - C: 1000 - R² Score 0.7923789012887561
Kernel: rbf - C: 2000 - R² Score 0.8332041538541531
Kernel: rbf - C: 3000 - R² Score 0.8422384746495604
Kernel: poly - C: 10 - R² Score 0.024580030588300605
Kernel: poly - C: 100 - R² Score 0.5897995020893756
Kernel: poly - C: 500 - R² Score 0.8038815140652048
Kernel: poly - C: 1000 - R² Score 0.8338303273683121
Kernel: poly - C: 2000 - R² Score 0.8403113295667679
Kernel: poly - C: 3000 - R² Score 0.8390286005966449
Kernel: sigmoid - C: 10 - R² Score 0.02297826583

## **Model 3 - Decision Tree Regressor**

In [195]:
from sklearn.tree import DecisionTreeRegressor

# Hyperparameter values to compare
criterion1 = ['squared_error','friedman_mse','absolute_error','poisson']
max_ft1 = ['sqrt','log2']
spliter1 = ['best','random']

# Each entry is (criterion, max_features, splitter, R² on the test split)
DT_r2 = []

for cri in criterion1:
    for mx_ft in max_ft1:
        for split in spliter1:
            # Feature sub-sampling (max_features) and splitter='random' both
            # draw random numbers. Seed the tree so the scores are the same on
            # every run, using the same seed as the train/test split.
            DT_model = DecisionTreeRegressor(
                criterion=cri,
                max_features=mx_ft,
                splitter=split,
                random_state=32,
            )
            DT_model.fit(X_train,y_train)
            y_pred = DT_model.predict(X_test)
            dt_r2 = r2_score(y_test,y_pred)
            DT_r2.append((cri,mx_ft,split,dt_r2))


In [199]:
# Report the test-split R² for every decision-tree configuration tried
for dt_criterion, dt_max_features, dt_splitter, dt_score in DT_r2:
    print("Criterion:", dt_criterion, "max_features:", dt_max_features, "splitter:", dt_splitter, "R2_Score:", dt_score)

Criterion: squared_error max_features: sqrt splitter: best R2_Score: 0.6905149345691342
Criterion: squared_error max_features: sqrt splitter: random R2_Score: 0.6089158084430758
Criterion: squared_error max_features: log2 splitter: best R2_Score: 0.7208084204160041
Criterion: squared_error max_features: log2 splitter: random R2_Score: 0.5885137699183175
Criterion: friedman_mse max_features: sqrt splitter: best R2_Score: 0.741628247754792
Criterion: friedman_mse max_features: sqrt splitter: random R2_Score: 0.632694706926831
Criterion: friedman_mse max_features: log2 splitter: best R2_Score: 0.6353895211454534
Criterion: friedman_mse max_features: log2 splitter: random R2_Score: 0.6909830368433382
Criterion: absolute_error max_features: sqrt splitter: best R2_Score: 0.6465435342030181
Criterion: absolute_error max_features: sqrt splitter: random R2_Score: 0.7744421292327105
Criterion: absolute_error max_features: log2 splitter: best R2_Score: 0.712613522330376
Criterion: absolute_error 

## **Model 4 - Random Forest Regressor**

In [208]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameter values to compare
n_estimators = [50,100,200,250,300,350,400,500]
criterion2 = ['squared_error','friedman_mse','poisson']
# NOTE(review): the recorded scores for 'sqrt' and 'log2' are identical, so
# with the 5 features used here both presumably select the same number of
# features per split - confirm, and drop one to halve the run time.
max_ft2 = ['sqrt','log2']

# The forest expects a 1-D target. y_train is a single-column DataFrame, so
# flatten it once here instead of relying on scikit-learn's implicit
# conversion, whose warning is hidden by the global warnings filter.
rf_target = y_train.to_numpy().ravel()

# Each entry is (n_estimators, criterion, max_features, R² on the test split)
randomforest_r2score= []

for estimators in n_estimators:
    for crt in criterion2:
        for max_frt in max_ft2:
            rff = RandomForestRegressor(n_estimators=estimators ,criterion=crt ,max_features=max_frt ,random_state=32)
            rff.fit(X_train, rf_target)
            y_pred = rff.predict(X_test)
            rf_r2score = r2_score(y_test,y_pred)
            randomforest_r2score.append((estimators,crt,max_frt,rf_r2score))


In [209]:
# Report the test-split R² for every random-forest configuration tried.
# The loop variables are named differently from the `n_estimators` list so
# that iterating here does not overwrite that list with a single integer.
for n_trees, rf_criterion, rf_max_features, rf_score in randomforest_r2score:
    print("n_estimators:", n_trees, "Criterion:", rf_criterion, "max_features:", rf_max_features, "R2_Score:", rf_score)

n_estimators: 50 Criterion: squared_error max_features: sqrt R2_Score: 0.8502984279389878
n_estimators: 50 Criterion: squared_error max_features: log2 R2_Score: 0.8502984279389878
n_estimators: 50 Criterion: friedman_mse max_features: sqrt R2_Score: 0.8509127600875582
n_estimators: 50 Criterion: friedman_mse max_features: log2 R2_Score: 0.8509127600875582
n_estimators: 50 Criterion: poisson max_features: sqrt R2_Score: 0.8468833336548729
n_estimators: 50 Criterion: poisson max_features: log2 R2_Score: 0.8468833336548729
n_estimators: 100 Criterion: squared_error max_features: sqrt R2_Score: 0.8492395564499937
n_estimators: 100 Criterion: squared_error max_features: log2 R2_Score: 0.8492395564499937
n_estimators: 100 Criterion: friedman_mse max_features: sqrt R2_Score: 0.8492782583516395
n_estimators: 100 Criterion: friedman_mse max_features: log2 R2_Score: 0.8492782583516395
n_estimators: 100 Criterion: poisson max_features: sqrt R2_Score: 0.8474801406941771
n_estimators: 100 Criterion

## **Saving the best model for Deployment** 

In [248]:
from sklearn.ensemble import RandomForestRegressor

# Refit the chosen random-forest configuration on the scaled training split.
rf_Reg = RandomForestRegressor(n_estimators=500 ,criterion='poisson' ,max_features='log2' ,random_state=32)
# The forest expects a 1-D target. y_train is a single-column DataFrame, so
# flatten it explicitly instead of relying on the implicit conversion, whose
# warning is hidden by the global warnings filter.
rf_Reg.fit(X_train, y_train.to_numpy().ravel())

# Score the refitted model on the held-out test split
y_predict = rf_Reg.predict(X_test)
r2_Score = r2_score(y_test,y_predict)
r2_Score

0.8518107418752442

In [249]:
import pickle

In [252]:
# Persist the fitted random forest for deployment. The context manager
# closes the file (flushing all bytes to disk) even if pickling fails.
filename = "rf_reg_finalized.sav"
with open(filename, 'wb') as model_file:
    pickle.dump(rf_Reg, model_file)

# The model was trained on features transformed by the fitted scaler `sc`,
# so the scaler is saved too: at deployment, raw inputs must go through the
# same transformation before being passed to the model's predict().
scaler_filename = "scaler_finalized.sav"
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)