### AdaBoost (Adaptive Boosting) et XGBoost (eXtreme Gradient Boosting) sont deux algorithmes d'ensemble utilisés en apprentissage automatique pour améliorer les performances des modèles de régression et de classification. Bien qu'ils partagent certains principes de base, ils diffèrent dans leurs mécanismes spécifiques et leurs performances.

### Dataset: https://www.kaggle.com/mirichoi0218/insurance

### Import the libraries

In [1]:
# Numerical and tabular data libraries.
import numpy as np
import pandas as pd
# Plotting libraries; sns.set() applies seaborn's default theme to matplotlib.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings
# NOTE(review): this installs a filter that ignores every warning category for
# the rest of the session, so library warnings raised by later cells are
# hidden — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

### Import Data

In [2]:
# Load the insurance dataset from a CSV file located next to the notebook.
DATA_PATH = 'insurance.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
# Feature matrix: every column of `data` except the last one.
# The explicit .copy() makes X an independent frame, so the column assignments
# performed on X in the encoding cell never target a slice of `data`
# (the pattern that triggers pandas' SettingWithCopyWarning).
X = data.iloc[:, :-1].copy()

In [4]:
# Target vector: the last column of `data`, selected by position.
Y = data.iloc[:, -1]

In [5]:
# Preview the first five rows of the raw table.
data.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Label encoding

In [6]:
from sklearn import preprocessing

# One encoder per column: a single LabelEncoder refitted on 'smoker' would
# forget the classes it learned for 'sex', making that mapping unrecoverable.
sex_encoder = preprocessing.LabelEncoder()
smoker_encoder = preprocessing.LabelEncoder()
# Kept for backward compatibility: the original single encoder `le` ended up
# fitted on the 'smoker' column.
le = smoker_encoder

# assign() returns a new frame with both columns replaced in their original
# positions, instead of writing into a frame that was sliced out of `data`.
X = X.assign(
    sex=sex_encoder.fit_transform(X['sex']),
    smoker=smoker_encoder.fit_transform(X['smoker']),
)
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,southwest
1,18,1,33.770,1,0,southeast
2,28,1,33.000,3,0,southeast
3,33,1,22.705,0,0,northwest
4,32,1,28.880,0,0,northwest
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,northwest
1334,18,0,31.920,0,0,northeast
1335,18,0,36.850,0,0,southeast
1336,21,0,25.800,0,0,southwest


## One hot encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Select the categorical column by name rather than by position (it is the
# column at index 5 of the feature frame). Because X is overwritten with an
# array below, a positional index would pick a different column if this cell
# were run a second time; a column name fails loudly on non-DataFrame input.
columnTransformer = ColumnTransformer(
    [('encoder', OneHotEncoder(), ['region'])],
    remainder='passthrough',  # keep every other column unchanged
)
# The one-hot columns come first in the result, followed by the passthrough columns.
X = columnTransformer.fit_transform(X)
X

array([[ 0.  ,  0.  ,  0.  , ..., 27.9 ,  0.  ,  1.  ],
       [ 0.  ,  0.  ,  1.  , ..., 33.77,  1.  ,  0.  ],
       [ 0.  ,  0.  ,  1.  , ..., 33.  ,  3.  ,  0.  ],
       ...,
       [ 0.  ,  0.  ,  1.  , ..., 36.85,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ..., 25.8 ,  0.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  , ..., 29.07,  0.  ,  1.  ]])

### Train test split

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out a fixed fraction of the rows for evaluation; the seed makes the
# shuffle (and therefore the split) repeatable.
TEST_FRACTION = 0.20
SPLIT_SEED = 1

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

### Modèle

In [10]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
Note: you may need to restart the kernel to use updated packages.


In [11]:
import xgboost as xgb

In [12]:
# Hyperparameters of the gradient-boosted regressor, gathered in one place.
xgb_params = {
    'n_estimators': 100,    # number of boosting rounds
    'learning_rate': 0.05,  # shrinkage applied to each new tree
    'max_depth': 3,         # maximum depth of each tree
    'gamma': 0,             # minimum loss reduction required to split
    'reg_lambda': 1,        # L2 regularisation on leaf weights
}
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split; the fitted estimator is the cell's displayed value.
model.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.05, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, objective='reg:squarederror', ...)

### Prédiction

In [13]:
# Predict the target for the held-out test features with the fitted regressor.
y_pred = model.predict(X_test)

In [14]:
# Print the raw prediction array as plain text.
print(y_pred)

[ 3304.1443 12514.461  10408.605  11625.639   3417.367  38262.5
 11503.01   12795.932   4997.6616 20412.957  15379.668  13344.997
  7346.578   7636.451   2836.2173 11383.133   4949.968   7400.867
 14560.098  14426.975  12628.622  40421.875   9256.913  10276.228
  4321.9536  7580.112   9705.184  10569.746   6860.841   5265.242
 13025.118   6884.3984 24277.637  35128.12   23872.088  12031.293
 39451.39   17573.52   12616.009  41909.684   6696.717  12514.461
 11773.36   14649.611   6581.9976 14143.31    3725.8357 37667.508
 11374.5    16278.38   14214.384  13972.4795  4013.4312  9306.799
 17392.057   4688.4004 41694.28   14181.785   7225.0146  2573.1428
  4756.3506 14388.499  20239.63    4678.555  14143.31   11047.519
 11753.967  12485.557   4129.8066 16373.734  43169.195  40712.656
  3477.8398 15066.263  14298.268  43310.582   5064.1836  4539.1987
 12674.331  11728.738   3910.9377 13958.838  13825.464   3997.766
 39020.75   38810.484   7486.0713 45156.457  12390.646   8178.3115
 21506.57

### Comparaison

In [15]:
# Side-by-side table of the true target values and the model's predictions.
# The row index is taken from y_test; y_pred is matched to it by position.
comparaison = pd.DataFrame({'Actual': y_test, 'predicted': y_pred})
comparaison

Unnamed: 0,Actual,predicted
559,1646.42970,3304.144287
1087,11353.22760,12514.460938
1020,8798.59300,10408.605469
460,10381.47870,11625.638672
802,2103.08000,3417.366943
...,...,...
682,40103.89000,40164.039062
629,42983.45850,44035.132812
893,44202.65360,43768.785156
807,2136.88225,3758.213379


### Evaluation

In [16]:
from sklearn.metrics import r2_score

In [18]:
# Coefficient of determination (R²) of the predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.8814897702620178