In [19]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl (223.6 MB)
[K     |████████████████████████████████| 223.6 MB 1.4 kB/s eta 0:00:01    |█████████▉                      | 68.7 MB 6.9 MB/s eta 0:00:23     |███████████                     | 77.3 MB 221 kB/s eta 0:11:01     |█████████████▎                  | 92.7 MB 7.0 MB/s eta 0:00:19     |█████████████▍                  | 93.9 MB 7.0 MB/s eta 0:00:19     |███████████████▏                | 106.2 MB 4.7 MB/s eta 0:00:26     |███████████████▎                | 106.9 MB 4.7 MB/s eta 0:00:25     |███████████████▎                | 107.2 MB 4.7 MB/s eta 0:00:25     |████████████████▊               | 117.2 MB 6.9 MB/s eta 0:00:16     |█████████████████▎              | 120.8 MB 6.9 MB/s eta 0:00:15     |███████████████████▍            | 135.4 MB 7.5 MB/s eta 0:00:12     |██████████████████████▍         | 156.4 MB 7.3 MB/s eta 0:00:10     |████████████████████████████▎   | 197.6 MB 5.2 MB/s eta 0:00:06     |██████

In [33]:
#importing libraries
#numpy & pandas dataset handlers & numpy used to parse .csv file.
#matplotlib used to create graphs like visuablisaton.
#xgboost optimized gradient boosting algorithm which allows efficient predictions on various
#classification and regression tasks.
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import xgboost

In [27]:
print(xgboost.__version__)

2.1.4


In [28]:
#dataset reading
dts = pd.read_csv("insurance_pre.csv")

In [29]:
dts

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [30]:
dts.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges'], dtype='object')

In [31]:
dts = pd.get_dummies(dts,drop_first=True)

In [32]:
dts

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [35]:
#Columns rename-ing
dts.rename(columns={"sex_male":"sex","smoker_yes":"smoker"},inplace=True)

In [36]:
dts

Unnamed: 0,age,bmi,children,charges,sex,smoker
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [37]:
#separating the input and output values in datasets.
feature = dts[["age","bmi","children","sex","smoker"]]
target = dts[["charges"]]

In [38]:
#printing the input and output values
feature,target

(      age     bmi  children  sex  smoker
 0      19  27.900         0    0       1
 1      18  33.770         1    1       0
 2      28  33.000         3    1       0
 3      33  22.705         0    1       0
 4      32  28.880         0    1       0
 ...   ...     ...       ...  ...     ...
 1333   50  30.970         3    1       0
 1334   18  31.920         0    0       0
 1335   18  36.850         0    0       0
 1336   21  25.800         0    0       0
 1337   61  29.070         0    0       1
 
 [1338 rows x 5 columns],
           charges
 0     16884.92400
 1      1725.55230
 2      4449.46200
 3     21984.47061
 4      3866.85520
 ...           ...
 1333  10600.54830
 1334   2205.98080
 1335   1629.83350
 1336   2007.94500
 1337  29141.36030
 
 [1338 rows x 1 columns])

In [46]:
#plitting the train & test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(feature,target,test_size=0.33,random_state=0)

In [47]:
X_train, X_test

(      age     bmi  children  sex  smoker
 1271   25  34.485         0    0       0
 1313   19  34.700         2    0       1
 2      28  33.000         3    1       0
 405    52  38.380         2    0       0
 482    18  31.350         0    0       0
 ...   ...     ...       ...  ...     ...
 763    27  26.030         0    1       0
 835    42  35.970         2    1       0
 1216   40  25.080         0    1       0
 559    19  35.530         0    1       0
 684    33  18.500         1    0       0
 
 [896 rows x 5 columns],
       age     bmi  children  sex  smoker
 578    52  30.200         1    1       0
 610    47  29.370         1    0       0
 569    48  40.565         2    1       1
 1034   61  38.380         0    1       0
 198    51  18.050         0    0       0
 ...   ...     ...       ...  ...     ...
 117    29  27.940         1    0       1
 520    50  27.360         0    0       0
 422    40  32.775         1    1       1
 294    25  26.800         3    1       0
 261   

In [48]:
Y_train, Y_test

(          charges
 1271   3021.80915
 1313  36397.57600
 2      4449.46200
 405   11396.90020
 482    1622.18850
 ...           ...
 763    3070.80870
 835    7160.33030
 1216   5415.66120
 559    1646.42970
 684    4766.02200
 
 [896 rows x 1 columns],
           charges
 578    9724.53000
 610    8547.69130
 569   45702.02235
 1034  12950.07120
 198    9644.25250
 ...           ...
 117   19107.77960
 520   25656.57526
 422   39125.33225
 294    3906.12700
 261   17085.26760
 
 [442 rows x 1 columns])

In [201]:
#Standardisation procedure to maintain least different between inputs.
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

In [208]:
X_test,X_train

(array([[ 0.89345423, -0.07906071, -0.07800765,  1.01348375, -0.4996512 ],
        [ 0.53613634, -0.21350965, -0.07800765, -0.98669564, -0.4996512 ],
        [ 0.60759992,  1.59993119,  0.7540739 ,  1.01348375,  2.00139616],
        ...,
        [ 0.03589131,  0.33805498, -0.07800765,  1.01348375,  2.00139616],
        [-1.03606235, -0.62981541,  1.58615545,  1.01348375, -0.4996512 ],
        [-1.39338023, -0.62333594, -0.07800765, -0.98669564,  2.00139616]]),
 array([[-1.03606235,  0.6150522 , -0.91008919, -0.98669564, -0.4996512 ],
        [-1.46484381,  0.64987934,  0.7540739 , -0.98669564,  2.00139616],
        [-0.82167162,  0.37450199,  1.58615545,  1.01348375, -0.4996512 ],
        ...,
        [ 0.03589131, -0.90843249, -0.91008919,  1.01348375, -0.4996512 ],
        [-1.46484381,  0.78432828, -0.91008919,  1.01348375, -0.4996512 ],
        [-0.46435373, -1.97430482, -0.07800765, -0.98669564, -0.4996512 ]]))

In [209]:
#creating & training the model
#hypertunning parameter's
#criterion{‘squared_error’}, default=’friedman_mse’
#loss{‘absolute_error’, ‘huber’, ‘quantile’}, default=’squared_error’
from sklearn.ensemble import GradientBoostingRegressor
model1_create =  GradientBoostingRegressor(n_estimators=500, max_depth=4,min_samples_split=5,learning_rate=0.01,loss="huber")
model1_create = model1_create.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


In [210]:
Y_pred = model1_create.predict(X_test)

In [211]:
Y_pred

array([10467.16471197,  9347.44641068, 45486.34644652, 13431.1527403 ,
        9382.6851907 ,  4991.33939494,  2059.70151614, 10593.24721757,
        6933.4968906 ,  5455.83886798,  6785.55553245, 10636.31271706,
        8589.86809227,  5264.55844757, 18589.30617459, 10876.84978144,
       12653.75976719,  3312.30880237,  6313.96855804, 34364.24117134,
       23332.9096202 , 12696.67825117, 10767.87320337, 24390.97950015,
        1924.62663012,  4431.78878629,  3742.52850298,  7732.85798746,
        3753.39899088,  8595.11655262,  7710.15833411, 47898.72741561,
       13650.75766751, 11100.72619127, 15630.17104344,  3753.39899088,
        8429.75159006, 37598.48977418, 39696.67894162,  2210.90879563,
        5242.3963499 ,  3467.52122013, 20083.69214655, 46561.43108127,
       36674.24346012,  3086.31858582, 10876.84978144,  6315.04034412,
        4609.08679689, 11927.42970927,  2875.36949565,  3250.96681639,
       24395.20788159, 45827.62179147, 11385.15157666,  3231.64263316,
      

In [212]:
from sklearn.metrics import r2_score
rscore = r2_score(Y_test, Y_pred)

In [213]:
rscore

0.8921174804457045