## Gradient boosting for classification

In [27]:
# --- Standard library ---
import warnings

# --- Third-party: data handling and gradient boosting ---
import numpy as np
import pandas as pd
import xgboost as xgb

# --- Third-party: scikit-learn helpers for splitting, tuning and scoring ---
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices from
# xgboost / scikit-learn; consider narrowing it to specific categories.
warnings.simplefilter(action='ignore')

In [28]:
# Read the training table; the first CSV column is used as the row index.
df_train = pd.read_csv(filepath_or_buffer="train.csv", index_col=0)

# Preview the first five rows to sanity-check the parsed columns.
df_train.head(n=5)

Unnamed: 0_level_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,...,x_292,x_293,x_294,x_295,x_296,x_297,x_298,x_299,x_300,Category
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.03,-1.0,-0.02,-0.8,-1.37,0.62,-0.09,-0.87,-0.02,0.63,...,-0.16,-0.02,-0.15,-0.0,1.07,-0.06,-0.8,0.39,-0.06,-1.0
1,-0.18,1.71,0.0,-1.09,-1.13,-0.08,2.2,-0.32,0.0,0.81,...,-0.63,0.08,-1.83,0.04,1.49,0.1,0.78,-1.56,0.08,1.0
2,-1.41,0.01,0.0,-0.48,0.31,1.02,0.9,0.41,0.04,-0.0,...,-1.25,0.06,-0.2,0.04,-0.77,0.04,-1.39,0.87,0.02,1.0
3,-0.41,-0.54,-0.02,-0.53,0.52,-1.23,0.62,0.02,-0.03,0.06,...,0.49,-0.0,-0.4,-0.02,-1.02,-0.0,-0.16,0.41,-0.07,-1.0
4,-1.58,0.25,0.02,1.15,-1.95,-0.43,0.4,-0.33,0.02,0.72,...,-0.14,0.04,-0.85,0.0,0.02,0.14,0.68,-1.42,0.02,-1.0


In [29]:
# Feature matrix: every column except the "Category" target.
X = df_train.drop(columns=["Category"])

# Target vector: the "Category" column (the preview above shows -1.0 / 1.0).
# NOTE(review): newer XGBoost releases may require class labels encoded as
# 0..n_classes-1 -- confirm against the installed version before upgrading.
y = df_train.loc[:, "Category"]

In [30]:
# Hold out 30% of the rows for evaluation.
# A fixed seed makes the split -- and therefore every downstream score --
# reproducible under Restart Kernel -> Run All; without it each run shuffles
# the rows differently.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

In [31]:
# DMatrix wrappers around the train / test partitions.
# NOTE(review): neither object is used by the cells visible here (the
# scikit-learn wrapper below takes the DataFrames directly); kept in case
# later cells rely on the native xgboost API.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Baseline: XGBClassifier with library-default hyper-parameters.
model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')
model.fit(X_train, y_train)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy itself is
# symmetric, so the score is unchanged, but the documented order keeps this
# call consistent with the classification_report calls further down.
accuracy_score(y_test, model.predict(X_test))

0.9076296296296297

In [32]:
# Candidate values per hyper-parameter: 3 * 2 * 3 = 18 combinations.
search_space = {
    "eta": [0.2, 0.5, 0.8],
    "max_depth": [4, 5],
    "reg_lambda": [2, 5, 8],
}

# Estimator template for the search; tree_method='hist' selects the
# histogram-based tree builder.
base_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
)

# Exhaustive search over the grid using GridSearchCV's default
# cross-validation and scoring settings, fitted on the training partition only.
grid = GridSearchCV(base_classifier, param_grid=search_space)
grid.fit(X_train, y_train)

# Winning combination of hyper-parameters.
grid.best_params_

{'eta': 0.5, 'max_depth': 5, 'reg_lambda': 8}

In [34]:
# Display the best estimator selected by the grid search, including its full
# parameter set (searched values plus defaults).
grid.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eta=0.5, eval_metric='logloss', gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=8,
              scale_pos_weight=1, subsample=1, tree_method='hist',
              validate_parameters=1, verbosity=None)

In [35]:
# Score the tuned model on the held-out partition: per-class precision,
# recall and F1, plus overall accuracy.
grid_test_pred = grid.predict(X_test)
grid_report = classification_report(y_test, grid_test_pred)
print(grid_report)

              precision    recall  f1-score   support

        -1.0       0.91      0.91      0.91     13554
         1.0       0.91      0.91      0.91     13446

    accuracy                           0.91     27000
   macro avg       0.91      0.91      0.91     27000
weighted avg       0.91      0.91      0.91     27000



In [37]:
# Final model: reuse the hyper-parameters chosen by the grid search, but with
# ten times more boosting rounds (n_estimators=1000 instead of the 100 shown
# in the best-estimator repr).
# The tuned values are taken from grid.best_params_ rather than hand-copied
# literals, so this cell cannot drift out of sync with the search result.
# NOTE(review): n_estimators was not part of the searched grid, and it is
# being judged on the same held-out set used for reporting -- consider a
# separate validation split or early stopping.
# This rebinds `model`, replacing the baseline classifier fitted earlier.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    n_estimators=1000,
    **grid.best_params_,
)
model.fit(X_train, y_train)
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

        -1.0       0.94      0.94      0.94     13554
         1.0       0.94      0.94      0.94     13446

    accuracy                           0.94     27000
   macro avg       0.94      0.94      0.94     27000
weighted avg       0.94      0.94      0.94     27000

