In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import time as tm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import xgboost as xgb

In [2]:
train = pd.read_csv(
    '/data/temp/train.csv',
    )

test = pd.read_csv(
    '/data/temp/test.csv',
    )

for c in train.columns:
    if train[c].dtype == "object":
        train[c] = train[c].astype("category")

for c in test.columns:
    if test[c].dtype == "object":
        test[c] = test[c].astype("category")

In [3]:
X = train.drop(columns = ['survived'])
y = train['survived'].to_numpy()

In [4]:
X

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
pos_weight = (y.shape[0]-y.sum())/y.sum()
pos_weight

1.605263157894737

param_dist = {'categorical_feature':True, }
estimator = xgb.XGBClassifier(**param_dist)

In [6]:
estimator = xgb.XGBClassifier(
    tree_method="gpu_hist",
    gpu_id = 0,
    use_label_encoder = False,
    enable_categorical = True,
    objective = 'binary:logistic',
    )

In [7]:
param_grid = [
    {
        'max_depth': [2,10], 
        'n_estimators': [50,100],
        'scale_pos_weight':[1, pos_weight],
        'learning_rate':[0.1,1.0],
        'objective':['binary:logistic','binary:logistic'],
    },
    ]

In [8]:
gbm = GridSearchCV(
    estimator, 
    param_grid, 
    cv = 5,
    scoring = 'roc_auc')

In [9]:
start_time = tm.time()

gbm.fit(X, y,
    verbose = False)

running_time = tm.time() - start_time













In [16]:
print('running time: %f seconds'%(running_time))

running time: 52.231592 seconds


In [10]:
gbm_best = gbm.best_estimator_

In [None]:
xgb.plot_importance(gbm_best)

In [11]:
print(gbm_best.feature_importances_)

[0.11414606 0.         0.44519982 0.05973374 0.05428183 0.02129877
 0.02604642 0.05286869 0.16915359 0.0572711 ]


In [12]:
print(gbm.best_score_)

0.8722929896112632


In [13]:
print(gbm.best_params_)

{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 50, 'objective': 'binary:logistic', 'scale_pos_weight': 1.605263157894737}
