In [None]:
# install xgboost
!pip install xgboost

In [44]:
# import necessary packages
import pandas as pd
import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [45]:
ford_df = pd.read_csv("ford.csv")

In [46]:
ford_df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0


In [47]:
ford_df.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [48]:
# checking the data type of the features
ford_df.dtypes

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax               int64
mpg             float64
engineSize      float64
dtype: object

#### Dealing with categorical data without manually encoding them (feature available only in XGBoost versions starting from 1.5)

In [49]:
ford_df_cat = ford_df

In [50]:
ford_df_cat.dtypes

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax               int64
mpg             float64
engineSize      float64
dtype: object

In [53]:
# mentioning the features that are categorical
ford_df_cat[["model","transmission","fuelType"]] = ford_df_cat[["model","transmission","fuelType"]].astype("category")

In [54]:
# checking the data type of features again to see if they are recognized as categories
ford_df_cat.dtypes

model           category
year               int64
price              int64
transmission    category
mileage            int64
fuelType        category
tax                int64
mpg              float64
engineSize       float64
dtype: object

In [55]:
# separating features and labels and splitting dataset into train and test set
X = ford_df_cat.drop('price',axis=1)
y = ford_df_cat['price']
X_train, X_test,y_train, y_test = train_test_split(X,y ,
                                   random_state=100, 
                                   test_size=0.20, 
                                   shuffle=True)

In [56]:
# Defining the XGBoost Regressor
# Adding enable_categorical=True to existing parameters
# Supported tree_method for categorical data are `gpu_hist`, `approx`, and `hist`
xgb = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 12, seed = 56, tree_method='hist',
                      enable_categorical=True)
# Training the model using the training dataset
xgb.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=True, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=12, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [57]:
# predicting using trained model and finding the r2 score
y_pred = xgb.predict(X_test)
r2 = r2_score(y_test,y_pred)
print(r2)

0.911171089852688


In [58]:
# saving the trained model
xgb.save_model("model.json")

##### Predicting using the saved model

In [60]:
import xgboost as xg
xgb_saved = xg.Booster()
# loading saved model
xgb_saved.load_model("model.json")

In [66]:
#predicting using trained model
y_pred_saved = xgb_saved.predict(xg.DMatrix(X_test,enable_categorical=True))

In [67]:
y_pred_saved[:10]

array([ 6874.7046, 10417.231 , 16825.986 ,  8153.799 ,  6967.2495,
       10899.846 , 10692.221 ,  9445.076 , 11041.24  , 12960.756 ],
      dtype=float32)

In [68]:
r2 = r2_score(y_test,y_pred_saved)
print(r2)

0.911171089852688


#### Grid search CV for finding the best hyperparameters for XGBoost

In [33]:
# possible value for each of the parameters while performing GridSearchCV
parameters = {
    'learning_rate' : [0.2,0.3,0.4],
    'gamma' : [1,2,3,4],
    'max_depth': [4,6,7],
    'tree_method': ['auto','exact','hist']
}

In [41]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold

grid_search = GridSearchCV(xgb, parameters,n_jobs=4,cv=5, verbose=3)
grid_search.fit(X, y)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None, gpu_id=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, m...
                                    max_cat_to_onehot=None, max_delta_step=None,
                                    max_depth=None, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estim

In [42]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.9262097629484091
{'gamma': 1, 'learning_rate': 0.4, 'max_depth': 7, 'tree_method': 'auto'}


In [3]:
!pip freeze

alabaster @ file:///home/ktietz/src/ci/alabaster_1611921544520/work
anaconda-client @ file:///opt/concourse/worker/volumes/live/866d4dd0-ff5b-4d0b-718d-0267a3b10e06/volume/anaconda-client_1635342573767/work
anaconda-navigator==2.1.1
anaconda-project @ file:///tmp/build/80754af9/anaconda-project_1626085644852/work
anyio @ file:///opt/concourse/worker/volumes/live/96440bbe-d2f1-4a9e-5edf-600248ff38bd/volume/anyio_1617783321037/work/dist
appdirs==1.4.4
applaunchservices @ file:///Users/ktietz/demo/mc3/conda-bld/applaunchservices_1630511705208/work
appnope @ file:///opt/concourse/worker/volumes/live/6ca6f098-d773-4461-5c91-a24a17435bda/volume/appnope_1606859448531/work
appscript @ file:///opt/concourse/worker/volumes/live/00049ed6-6263-4a6e-72b9-9d990f6e2f07/volume/appscript_1611427000595/work
argh==0.26.2
argon2-cffi @ file:///opt/concourse/worker/volumes/live/38e8fb2b-1295-4bdf-4adf-b20acbe4d91b/volume/argon2-cffi_1607022498041/work
arrow @ file:///opt/concourse/worker/volumes