# XGBoost

In [33]:
#import libraries
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
import xgboost as xb
warnings.filterwarnings('ignore')

In [23]:
#load dataset
df=sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [24]:
df.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [25]:
df.shape

(53940, 10)

In [26]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [28]:
df.describe(exclude=np.number)

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


In [29]:
#How to Build an XGBoost DMatrix
X,y=df.drop('price',axis=1),df['price']

In [31]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()
for col in cats:
    X[col]=X[col].astype('category')


carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object

In [32]:
#Split data in test and train
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.3)

In [34]:
#Create Regression matrices
dtrain_reg=xb.DMatrix(X_train,Y_train,enable_categorical=True)
dtest_reg=xb.DMatrix(X_test,Y_test,enable_categorical=True)


In [40]:
# Define hyperparameters
params = {"objective": "reg:squarederror", "tree_method": "hist"}

In [41]:
n = 100
model = xb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)


In [43]:
from sklearn.metrics import mean_squared_error

preds = model.predict(dtest_reg)

In [44]:
preds

array([1461.9902 ,  969.51935, 3168.8206 , ..., 7964.405  , 4721.824  ,
        876.5113 ], dtype=float32)

In [47]:
rmse = mean_squared_error(Y_test, preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}")


RMSE of the base model: 571.536


In [50]:
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

In [54]:
model = xb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=5000,
   evals=evals,
   verbose_eval=10 # Every ten rounds
)



[0]	train-rmse:3971.07848	validation-rmse:3976.20810
[10]	train-rmse:549.88851	validation-rmse:610.32799
[20]	train-rmse:482.92531	validation-rmse:572.96927
[30]	train-rmse:455.80413	validation-rmse:569.64743
[40]	train-rmse:432.66573	validation-rmse:570.02392
[50]	train-rmse:419.26276	validation-rmse:570.10046
[60]	train-rmse:406.22152	validation-rmse:569.27281
[70]	train-rmse:395.57102	validation-rmse:568.20918
[80]	train-rmse:383.85367	validation-rmse:571.03981
[90]	train-rmse:370.55064	validation-rmse:572.06633
[100]	train-rmse:362.17282	validation-rmse:571.53193
[110]	train-rmse:351.92419	validation-rmse:573.77035
[120]	train-rmse:344.27691	validation-rmse:574.17296
[130]	train-rmse:335.68281	validation-rmse:573.70450
[140]	train-rmse:328.00230	validation-rmse:575.12328
[150]	train-rmse:322.88008	validation-rmse:574.45053
[160]	train-rmse:316.16205	validation-rmse:575.98353
[170]	train-rmse:310.42923	validation-rmse:576.27307
[180]	train-rmse:305.55801	validation-rmse:576.37926
[1

In [57]:
params = {"objective": "reg:squarederror", "tree_method": "hist"}
n = 1000

results = xb.cv(
   params, dtrain_reg,
   num_boost_round=n,
   nfold=5,
   early_stopping_rounds=20
)

In [58]:
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,3971.561248,7.086365,3972.862216,30.56517
1,2839.813205,6.32238,2845.898368,18.656492
2,2053.982495,3.794979,2062.502143,15.490805
3,1514.466817,2.921985,1529.525052,12.912728
4,1150.729924,1.978405,1170.81747,12.635362
