## MEDICAL COST PREDICTION:
The dataset contains medical information and costs billed by health insurance companies. It contains 1338 rows of data and the following columns: age, gender, BMI, children, smoker, region and insurance charges.
THe model predicts insurance costs according to factors like age, BMI, etc.

### IMPORTING LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

### IMPORTING DATASET

In [None]:
dataset = pd.read_csv("/home/kashish/insurance.csv")
X = dataset.iloc[:,:-1].values
Y = dataset.iloc[:,-1].values
Y = Y.reshape(len(Y),1)

In [None]:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
dataset.corr()

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


In [None]:
dataset['children'].unique()

array([0, 1, 3, 2, 5, 4])

In [None]:
dataset['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [None]:
dataset['sex'].unique()

array(['female', 'male'], dtype=object)

In [None]:
dataset['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

### ENCODING CATEGORICAL DATA

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ct = ColumnTransformer(transformers = [('encoder',ohe,[1,-2,-1])], remainder = 'passthrough')
X = np.array(ct.fit_transform(X))

In [None]:
X[0]

array([1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 19, 27.9, 0], dtype=object)

### SPLITTING INTO TRAIN AND TEST SET

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=1)

In [None]:
X_train

array([[1.0, 0.0, 1.0, ..., 53, 26.6, 0],
       [0.0, 1.0, 1.0, ..., 53, 21.4, 1],
       [0.0, 1.0, 1.0, ..., 18, 37.29, 0],
       ...,
       [1.0, 0.0, 0.0, ..., 51, 34.96, 2],
       [1.0, 0.0, 0.0, ..., 40, 22.22, 2],
       [0.0, 1.0, 1.0, ..., 57, 27.94, 1]], dtype=object)

# Multiple Regression

In [None]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

LinearRegression()

In [None]:
Y_pred = regressor.predict(X_test)

In [None]:
np.set_printoptions(precision=5)
print(np.concatenate((Y_pred.reshape(len(Y_pred),1),Y_test.reshape(len(Y_test),1)),1))

[[ 4383.6809   1646.4297 ]
 [12885.03892 11353.2276 ]
 [12589.21653  8798.593  ]
 [13286.22919 10381.4787 ]
 [  544.72833  2103.08   ]
 [32117.58401 38746.3551 ]
 [12919.04237  9304.7019 ]
 [12318.62183 11658.11505]
 [ 3784.29146  3070.8087 ]
 [29468.45725 19539.243  ]
 [11002.81394 12629.8967 ]
 [17539.69474 11538.421  ]
 [ 8681.35472  6338.0756 ]
 [ 8349.04326  7050.642  ]
 [ 3130.12726  1137.4697 ]
 [10445.83896  8968.33   ]
 [ 3863.74358 21984.47061]
 [ 6944.62511  6414.178  ]
 [15009.63121 28287.89766]
 [14441.59912 13462.52   ]
 [12543.65769  9722.7695 ]
 [32958.72553 40932.4295 ]
 [ 9072.63608  8026.6666 ]
 [ 8986.8586   8444.474  ]
 [ 3022.85773  2203.47185]
 [ 8164.97136  6664.68595]
 [ 9556.07558  8606.2174 ]
 [10743.20364  8283.6807 ]
 [ 7694.01744  5375.038  ]
 [ 4373.43772  3645.0894 ]
 [14140.93558 11674.13   ]
 [ 5811.78545 11737.84884]
 [34631.91317 24873.3849 ]
 [27009.11191 33750.2918 ]
 [33348.14099 24180.9335 ]
 [ 9532.96787  9863.4718 ]
 [30421.65018 36837.467  ]
 

In [None]:
from sklearn.metrics import r2_score
print("Accuracy = ",r2_score(Y_test,Y_pred)*100,"%")

Accuracy =  76.23311844057112 %


# Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
for i in range(2,5):
    poly_reg = PolynomialFeatures(degree=i)
    X_poly = poly_reg.fit_transform(X)
    X_polytrain, X_polytest, Y_polytrain, Y_polytest = train_test_split(X_poly, Y, test_size = 0.2, random_state=1)
    polynom = LinearRegression()
    polynom.fit(X_polytrain, Y_polytrain)
    Y_polypred = polynom.predict(X_polytest)
    print("For degree = ",i, ": Accuracy = ",r2_score(Y_polytest,Y_polypred)*100,"%")

For degree =  2 : Accuracy =  86.21935299452076 %
For degree =  3 : Accuracy =  84.30427588082942 %
For degree =  4 : Accuracy =  78.01754510097439 %


The best accuracy is given by polynomial regression with degree=2.

### STANDARDIZATION

In [None]:
from sklearn.preprocessing import StandardScaler
sc1 = StandardScaler()
X_train1 = X_train
Y_train1 = Y_train
X_test1 = X_test
Y_test1 = Y_test
X_train1[:,8:] = sc1.fit_transform(X_train1[:,8:])
X_test1[:,8:] = sc1.transform(X_test1[:,8:])
sc2 = StandardScaler()
Y_train1 = sc2.fit_transform(Y_train1)

# Support Vector Regression

In [None]:
from sklearn.svm import SVR
svr_regressor = SVR(kernel = 'rbf')
svr_regressor.fit(X_train1, Y_train1)
Y_svrpred = sc2.inverse_transform(svr_regressor.predict(X_test1))

  return f(*args, **kwargs)


In [None]:
np.set_printoptions(precision=5)
print(np.concatenate((Y_svrpred.reshape(len(Y_svrpred),1),sc2.inverse_transform(Y_test1).reshape(len(Y_test1),1)),1))

[[2.83469e+03 1.98968e+07]
 [1.24909e+04 1.37124e+08]
 [9.95406e+03 1.06272e+08]
 [1.20722e+04 1.25388e+08]
 [1.87893e+03 2.54117e+07]
 [3.62156e+04 4.67945e+08]
 [9.97554e+03 1.12384e+08]
 [1.28025e+04 1.40806e+08]
 [3.52559e+03 3.70988e+07]
 [2.00529e+04 2.35985e+08]
 [1.37488e+04 1.52542e+08]
 [1.27968e+04 1.39360e+08]
 [7.05029e+03 7.65569e+07]
 [8.05519e+03 8.51624e+07]
 [2.36095e+03 1.37502e+07]
 [9.62294e+03 1.08322e+08]
 [3.93502e+03 2.65515e+08]
 [7.54277e+03 7.74759e+07]
 [1.38474e+04 3.41640e+08]
 [1.47037e+04 1.62597e+08]
 [1.09470e+04 1.17433e+08]
 [3.78273e+04 4.94346e+08]
 [9.07164e+03 9.69496e+07]
 [9.62640e+03 1.01995e+08]
 [3.44546e+03 2.66241e+07]
 [7.26307e+03 8.05013e+07]
 [9.29460e+03 1.03949e+08]
 [9.27776e+03 1.00054e+08]
 [6.66860e+03 6.49265e+07]
 [4.09470e+03 4.40342e+07]
 [1.28456e+04 1.40999e+08]
 [5.52162e+03 1.41769e+08]
 [2.17895e+04 3.00404e+08]
 [2.63778e+04 4.07609e+08]
 [2.38430e+04 2.92041e+08]
 [1.06984e+04 1.19132e+08]
 [3.86852e+04 4.44892e+08]
 

In [None]:
print("Accuracy = ",r2_score(Y_test1,Y_svrpred)*100,"%")

Accuracy =  86.28653783011315 %


# Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
dec_regressor = DecisionTreeRegressor(random_state=0)
dec_regressor.fit(X_train, Y_train)

DecisionTreeRegressor(random_state=0)

In [None]:
Y_decpred = dec_regressor.predict(X_test)

In [None]:
np.set_printoptions(precision=5)
print(np.concatenate((Y_decpred.reshape(len(Y_test),1),Y_test.reshape(len(Y_test),1)),1))

[[ 1263.249    1646.4297 ]
 [11842.62375 11353.2276 ]
 [10848.1343   8798.593  ]
 [10702.6424  10381.4787 ]
 [ 1964.78     2103.08   ]
 [39983.42595 38746.3551 ]
 [ 9048.0273   9304.7019 ]
 [11658.37915 11658.11505]
 [ 3558.62025  3070.8087 ]
 [20296.86345 19539.243  ]
 [24513.09126 12629.8967 ]
 [12105.32    11538.421  ]
 [ 6600.361    6338.0756 ]
 [ 6770.1925   7050.642  ]
 [ 1137.011    1137.4697 ]
 [ 9620.3307   8968.33   ]
 [ 4185.0979  21984.47061]
 [ 5926.846    6414.178  ]
 [14474.675   28287.89766]
 [13844.7972  13462.52   ]
 [ 9140.951    9722.7695 ]
 [39241.442   40932.4295 ]
 [28340.18885  8026.6666 ]
 [ 8782.469    8444.474  ]
 [21344.8467   2203.47185]
 [ 6393.60345  6664.68595]
 [ 6406.4107   8606.2174 ]
 [ 8280.6227   8283.6807 ]
 [ 5836.5204   5375.038  ]
 [ 3558.62025  3645.0894 ]
 [10601.412   11674.13   ]
 [ 5124.1887  11737.84884]
 [35069.37452 24873.3849 ]
 [36898.73308 33750.2918 ]
 [21978.6769  24180.9335 ]
 [10197.7722   9863.4718 ]
 [44585.45587 36837.467  ]
 

In [None]:
print("Accuracy = ",r2_score(Y_test, Y_decpred)*100,"%")

Accuracy =  74.34988360300278 %


# Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
rand_regressor = RandomForestRegressor(n_estimators=300, random_state=42)
rand_regressor.fit(X_train, Y_train)
y_randpred = rand_regressor.predict(X_test)

  rand_regressor.fit(X_train, Y_train)


In [None]:
np.set_printoptions(precision=5)
print(np.concatenate((y_randpred.reshape(len(Y_test),1),Y_test.reshape(len(Y_test),1)),1))

[[ 1675.09074  1646.4297 ]
 [12422.9304  11353.2276 ]
 [ 9195.16001  8798.593  ]
 [11292.99311 10381.4787 ]
 [ 2265.19813  2103.08   ]
 [39704.38279 38746.3551 ]
 [10526.9973   9304.7019 ]
 [11694.93671 11658.11505]
 [ 4350.79724  3070.8087 ]
 [19851.84167 19539.243  ]
 [16525.60701 12629.8967 ]
 [12567.7392  11538.421  ]
 [ 6761.96665  6338.0756 ]
 [ 7017.03226  7050.642  ]
 [ 1654.81465  1137.4697 ]
 [11263.66598  8968.33   ]
 [ 5609.92638 21984.47061]
 [ 7171.2249   6414.178  ]
 [16997.24013 28287.89766]
 [13527.87507 13462.52   ]
 [12317.37114  9722.7695 ]
 [41543.28119 40932.4295 ]
 [12064.67219  8026.6666 ]
 [11094.44179  8444.474  ]
 [13913.95979  2203.47185]
 [ 6585.75207  6664.68595]
 [11663.68073  8606.2174 ]
 [ 9920.46169  8283.6807 ]
 [ 7409.84489  5375.038  ]
 [ 4055.14532  3645.0894 ]
 [12226.68141 11674.13   ]
 [ 6340.48756 11737.84884]
 [27043.28358 24873.3849 ]
 [34793.24681 33750.2918 ]
 [25111.21854 24180.9335 ]
 [12279.40621  9863.4718 ]
 [40002.24692 36837.467  ]
 

In [None]:
print("Accuracy = ",r2_score(Y_test, y_randpred)*100,"%")

Accuracy =  85.76333578894378 %


# ANN


In [None]:
ann = tf.keras.models.Sequential()

In [None]:
ann.add(tf.keras.layers.Dense(units = 18, activation = 'relu'))
ann.add(tf.keras.layers.Dense(units = 24, activation = 'relu'))

In [None]:
ann.add(tf.keras.layers.Dense(units = 1))

In [None]:
ann.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [None]:
X_train1 = np.asarray(X_train1).astype(np.float32)
Y_train1 = np.asarray(Y_train1).astype(np.float32)
ann.fit(X_train1, Y_train1, batch_size = 32, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f06dfff4d60>

In [None]:
X_test1 = np.asarray(X_test1).astype(np.float32)
Y_test1 = np.asarray(Y_test1).astype(np.float32)
y_ann_pred = ann.predict(X_test1)



In [None]:
np.set_printoptions(precision=5)
print(np.concatenate((y_ann_pred.reshape(len(Y_test1),1),Y_test1.reshape(len(Y_test1),1)),1))

[[-8.66478e-01  1.64643e+03]
 [-7.15884e-02  1.13532e+04]
 [-3.11592e-01  8.79859e+03]
 [-2.05214e-02  1.03815e+04]
 [-9.02973e-01  2.10308e+03]
 [ 2.23587e+00  3.87464e+04]
 [-2.05254e-01  9.30470e+03]
 [ 4.46145e-02  1.16581e+04]
 [-8.01619e-01  3.07081e+03]
 [ 6.53625e-01  1.95392e+04]
 [ 2.86621e-01  1.26299e+04]
 [-9.96419e-02  1.15384e+04]
 [-6.31457e-01  6.33808e+03]
 [-4.23201e-01  7.05064e+03]
 [-8.60023e-01  1.13747e+03]
 [-3.08625e-01  8.96833e+03]
 [-6.77133e-01  2.19845e+04]
 [-5.61895e-01  6.41418e+03]
 [ 1.04487e-01  2.82879e+04]
 [ 8.57526e-02  1.34625e+04]
 [-2.39824e-01  9.72277e+03]
 [ 2.37039e+00  4.09324e+04]
 [-3.38986e-01  8.02667e+03]
 [-2.18174e-01  8.44447e+03]
 [-7.52545e-01  2.20347e+03]
 [-4.34404e-01  6.66469e+03]
 [-1.63724e-01  8.60622e+03]
 [-1.64908e-01  8.28368e+03]
 [-5.75885e-01  5.37504e+03]
 [-7.48978e-01  3.64509e+03]
 [ 1.01810e-01  1.16741e+04]
 [-6.73212e-01  1.17378e+04]
 [ 7.57941e-01  2.48734e+04]
 [ 1.23353e+00  3.37503e+04]
 [ 1.03220e+00

In [None]:
print("Accuracy = ",r2_score(Y_test1, y_ann_pred)*100,"%")

Accuracy =  -120.82900556877978 %


In [None]:
from xgboost import XGBRegressor
xgregress = XGBRegressor()
xgregress.fit(X_train, Y_train)
Y_xgpred = xgregress.predict(X_test)
print("Accuracy = ",r2_score(Y_test, Y_xgpred)*100,"%")

Accuracy =  81.12245283973093 %


In [None]:
from catboost import CatBoostRegressor
cbreg = CatBoostRegressor()
cbreg.fit(X_train, Y_train)
Y_cbpred = cbreg.predict(X_test)
print("Accuracy = ",r2_score(Y_test, Y_xgpred)*100,"%")

Learning rate set to 0.041383
0:	learn: 11722.4641441	total: 807us	remaining: 806ms
1:	learn: 11345.0600281	total: 1.32ms	remaining: 658ms
2:	learn: 11011.8807698	total: 2.07ms	remaining: 688ms
3:	learn: 10673.5550482	total: 2.83ms	remaining: 705ms
4:	learn: 10396.4635167	total: 3.19ms	remaining: 635ms
5:	learn: 10122.0715191	total: 3.92ms	remaining: 650ms
6:	learn: 9840.8959518	total: 4.61ms	remaining: 654ms
7:	learn: 9592.0176326	total: 5.31ms	remaining: 658ms
8:	learn: 9382.7799889	total: 5.58ms	remaining: 615ms
9:	learn: 9118.5731327	total: 5.98ms	remaining: 592ms
10:	learn: 8879.7368518	total: 6.31ms	remaining: 567ms
11:	learn: 8671.1943823	total: 7ms	remaining: 577ms
12:	learn: 8472.4312448	total: 7.7ms	remaining: 584ms
13:	learn: 8265.4724191	total: 8.39ms	remaining: 591ms
14:	learn: 8061.3487597	total: 9.16ms	remaining: 602ms
15:	learn: 7858.3381318	total: 9.7ms	remaining: 597ms
16:	learn: 7681.6345966	total: 10.1ms	remaining: 586ms
17:	learn: 7509.7651550	total: 10.8ms	remaini

## The best accuracy is given by the SVR model. 