In [1]:
import numpy as np
import pandas as pd

train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)
test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test' , skiprows = 1, header = None)

col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num','marital_status', 'occupation','relationship', 'race', 'sex', 
              'capital_gain','capital_loss', 'hours_per_week', 'native_country', 'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels
data = pd.concat([train_set,test_set],ignore_index = True ,axis = 0)

In [2]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [3]:
## fnlwgt:similarity between people , not used in this assignment 
data.drop(columns=["fnlwgt"],inplace = True)

In [4]:
## checking missing value
print(data.describe())
for col in data.columns:
    print(col)
    print(data[col].value_counts())

                age  education_num  capital_gain  capital_loss  hours_per_week
count  48842.000000   48842.000000  48842.000000  48842.000000    48842.000000
mean      38.643585      10.078089   1079.067626     87.502314       40.422382
std       13.710510       2.570973   7452.019058    403.004552       12.391444
min       17.000000       1.000000      0.000000      0.000000        1.000000
25%       28.000000       9.000000      0.000000      0.000000       40.000000
50%       37.000000      10.000000      0.000000      0.000000       40.000000
75%       48.000000      12.000000      0.000000      0.000000       45.000000
max       90.000000      16.000000  99999.000000   4356.000000       99.000000
age
36    1348
35    1337
33    1335
23    1329
31    1325
      ... 
88       6
85       5
87       3
89       2
86       1
Name: age, Length: 74, dtype: int64
workclass
 Private             33906
 Self-emp-not-inc     3862
 Local-gov            3136
 ?                    2799
 State-gov

In [5]:
## handling unclean data
data['wage_class'] = data['wage_class'].map({' <=50K':0,' <=50K.':0,' >50K':1,' >50K.':1})
data['wage_class'].value_counts()

0    37155
1    11687
Name: wage_class, dtype: int64

In [6]:
def questionMark2nan(str1):
    if str1 == ' ?':
        return np.nan
    return str1

In [7]:
## handling missing value -- workclass/occupation/native_country
data['workclass'] = data['workclass'].apply(questionMark2nan)
data['occupation'] = data['occupation'].apply(questionMark2nan)
data['native_country'] = data['native_country'].apply(questionMark2nan)

In [8]:
## random sample imputation
random_occupation = data['occupation'].dropna().sample(data['occupation'].isnull().sum(),random_state=22)
random_occupation.index = data[data['occupation'].isnull()].index
data['occupation'] = data['occupation'].fillna(random_occupation)

random_workclass = data['workclass'].dropna().sample(data['workclass'].isnull().sum(),random_state=22)
random_workclass.index = data[data['workclass'].isnull()].index
data['workclass'] = data['workclass'].fillna(random_occupation)

random_native_country = data['native_country'].dropna().sample(data['native_country'].isnull().sum(),random_state=22)
random_native_country.index = data[data['native_country'].isnull()].index
data['native_country'] = data['native_country'].fillna(random_occupation)

In [9]:
## handling categroical variabel
'''
for col in data.columns:
    if data[col].dtype == 'O':
        new_col = pd.get_dummies(data[col],prefix = col ,drop_first=True)
        data = pd.concat([data,new_col],axis=1)
        del data[col]
data
'''
## too many columns...

"\nfor col in data.columns:\n    if data[col].dtype == 'O':\n        new_col = pd.get_dummies(data[col],prefix = col ,drop_first=True)\n        data = pd.concat([data,new_col],axis=1)\n        del data[col]\ndata\n"

In [10]:
## handling categroical variabele
for col in data.columns:
    if data[col].dtype == 'O':
        data[col] = data[col].map({className:id for id,className in enumerate(data[col].unique())})
data

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,0,0,13,0,0,0,0,0,2174,0,40,0,0
1,50,1,0,13,1,1,1,0,0,0,0,13,0,0
2,38,2,1,9,2,2,0,0,0,0,0,40,0,0
3,53,2,2,7,1,2,1,1,0,0,0,40,0,0
4,28,2,0,13,1,3,2,1,1,0,0,40,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,2,0,13,2,3,0,0,1,0,0,36,0,0
48838,64,5,1,9,6,1,5,1,0,0,0,40,0,0
48839,38,2,0,13,1,3,1,0,0,0,0,50,0,0
48840,44,2,0,13,2,0,3,2,0,5455,0,40,0,0


In [11]:
X = data.drop(columns=['wage_class'])
y = data['wage_class']

In [12]:
## data scaling 
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

In [13]:
## split data
from sklearn.model_selection import train_test_split,GridSearchCV
train_X,test_X,train_y,test_y=train_test_split(scaled_X,y,test_size=0.3,random_state=42)

In [21]:
len(train_X)

34189

In [14]:
## modeling building 
from xgboost import XGBClassifier
model = XGBClassifier(objective='binary:logistic',use_label_encoder=False)
model.fit(train_X, train_y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [15]:
## modeling evaluation
from sklearn.metrics import accuracy_score
## train score
y_pred = model.predict(train_X)
accuracy_score(train_y,y_pred)

0.8933867618239785

In [16]:
## test score
y_pred = model.predict(test_X)
accuracy_score(test_y,y_pred)

0.8722445915512181

In [26]:
## hyperparameter tunning
param_grid={ 
    'learning_rate':[1,0.5,0.1,0.01,0.001],
    'max_depth': [3,5,10,20],
    'n_estimators':[10,50,100,200] 
}
xgbclf = XGBClassifier(objective='binary:logistic',use_label_encoder=False,n_jobs = -1,tree_method = 'hist')
grid= GridSearchCV(xgbclf ,param_grid, verbose=3)
grid.fit(train_X,train_y)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 1/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.864 total time=   0.0s
[CV 2/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.869 total time=   0.0s
[CV 3/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.861 total time=   0.0s
[CV 4/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.855 total time=   0.0s
[CV 5/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.863 total time=   0.0s
[CV 1/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.869 total time=   0.1s
[CV 2/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.873 total time=   0.1s
[CV 3/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.869 total time=   0.1s
[CV 4/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.866 total time=   0.1s
[CV 5/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.872 total time=   0.1s
[CV 1/5] END learn

[CV 2/5] END learning_rate=1, max_depth=5, n_estimators=10;, score=0.868 total time=   0.0s
[CV 3/5] END learning_rate=1, max_depth=5, n_estimators=10;, score=0.867 total time=   0.0s
[CV 4/5] END learning_rate=1, max_depth=5, n_estimators=10;, score=0.864 total time=   0.0s
[CV 5/5] END learning_rate=1, max_depth=5, n_estimators=10;, score=0.870 total time=   0.0s
[CV 1/5] END learning_rate=1, max_depth=5, n_estimators=50;, score=0.862 total time=   0.2s
[CV 2/5] END learning_rate=1, max_depth=5, n_estimators=50;, score=0.867 total time=   0.2s
[CV 3/5] END learning_rate=1, max_depth=5, n_estimators=50;, score=0.865 total time=   0.2s
[CV 4/5] END learning_rate=1, max_depth=5, n_estimators=50;, score=0.864 total time=   0.2s
[CV 5/5] END learning_rate=1, max_depth=5, n_estimators=50;, score=0.869 total time=   0.2s
[CV 1/5] END learning_rate=1, max_depth=5, n_estimators=100;, score=0.859 total time=   0.4s
[CV 2/5] END learning_rate=1, max_depth=5, n_estimators=100;, score=0.862 total

[CV 4/5] END learning_rate=1, max_depth=10, n_estimators=10;, score=0.863 total time=   0.1s
[CV 5/5] END learning_rate=1, max_depth=10, n_estimators=10;, score=0.867 total time=   0.1s
[CV 1/5] END learning_rate=1, max_depth=10, n_estimators=50;, score=0.847 total time=   0.5s
[CV 2/5] END learning_rate=1, max_depth=10, n_estimators=50;, score=0.852 total time=   0.5s
[CV 3/5] END learning_rate=1, max_depth=10, n_estimators=50;, score=0.852 total time=   0.5s
[CV 4/5] END learning_rate=1, max_depth=10, n_estimators=50;, score=0.844 total time=   0.5s
[CV 5/5] END learning_rate=1, max_depth=10, n_estimators=50;, score=0.857 total time=   0.5s
[CV 1/5] END learning_rate=1, max_depth=10, n_estimators=100;, score=0.843 total time=   1.0s
[CV 2/5] END learning_rate=1, max_depth=10, n_estimators=100;, score=0.850 total time=   0.9s
[CV 3/5] END learning_rate=1, max_depth=10, n_estimators=100;, score=0.847 total time=   0.8s
[CV 4/5] END learning_rate=1, max_depth=10, n_estimators=100;, scor

[CV 5/5] END learning_rate=1, max_depth=20, n_estimators=10;, score=0.856 total time=   0.3s
[CV 1/5] END learning_rate=1, max_depth=20, n_estimators=50;, score=0.842 total time=   1.1s
[CV 2/5] END learning_rate=1, max_depth=20, n_estimators=50;, score=0.847 total time=   1.1s
[CV 3/5] END learning_rate=1, max_depth=20, n_estimators=50;, score=0.845 total time=   1.1s
[CV 4/5] END learning_rate=1, max_depth=20, n_estimators=50;, score=0.843 total time=   1.1s
[CV 5/5] END learning_rate=1, max_depth=20, n_estimators=50;, score=0.849 total time=   1.1s
[CV 1/5] END learning_rate=1, max_depth=20, n_estimators=100;, score=0.842 total time=   1.8s
[CV 2/5] END learning_rate=1, max_depth=20, n_estimators=100;, score=0.845 total time=   1.8s
[CV 3/5] END learning_rate=1, max_depth=20, n_estimators=100;, score=0.846 total time=   1.7s
[CV 4/5] END learning_rate=1, max_depth=20, n_estimators=100;, score=0.842 total time=   1.8s
[CV 5/5] END learning_rate=1, max_depth=20, n_estimators=100;, sco

[CV 1/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.870 total time=   0.1s
[CV 2/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.872 total time=   0.1s
[CV 3/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.867 total time=   0.1s
[CV 4/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.865 total time=   0.1s
[CV 5/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.874 total time=   0.1s
[CV 1/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.870 total time=   0.3s
[CV 2/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.875 total time=   0.3s
[CV 3/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.871 total time=   0.3s
[CV 4/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.869 total time=   0.3s
[CV 5/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.875 total time=   0.3s
[CV 1/5] END learning_rate=0.5, max_depth=3, n_estimato

[CV 2/5] END learning_rate=0.5, max_depth=5, n_estimators=50;, score=0.873 total time=   0.2s
[CV 3/5] END learning_rate=0.5, max_depth=5, n_estimators=50;, score=0.868 total time=   0.2s
[CV 4/5] END learning_rate=0.5, max_depth=5, n_estimators=50;, score=0.868 total time=   0.2s
[CV 5/5] END learning_rate=0.5, max_depth=5, n_estimators=50;, score=0.875 total time=   0.2s
[CV 1/5] END learning_rate=0.5, max_depth=5, n_estimators=100;, score=0.872 total time=   0.4s
[CV 2/5] END learning_rate=0.5, max_depth=5, n_estimators=100;, score=0.872 total time=   0.4s
[CV 3/5] END learning_rate=0.5, max_depth=5, n_estimators=100;, score=0.866 total time=   0.4s
[CV 4/5] END learning_rate=0.5, max_depth=5, n_estimators=100;, score=0.869 total time=   0.4s
[CV 5/5] END learning_rate=0.5, max_depth=5, n_estimators=100;, score=0.875 total time=   0.4s
[CV 1/5] END learning_rate=0.5, max_depth=5, n_estimators=200;, score=0.868 total time=   0.8s
[CV 2/5] END learning_rate=0.5, max_depth=5, n_estimat

[CV 3/5] END learning_rate=0.5, max_depth=10, n_estimators=50;, score=0.863 total time=   0.5s
[CV 4/5] END learning_rate=0.5, max_depth=10, n_estimators=50;, score=0.865 total time=   0.5s
[CV 5/5] END learning_rate=0.5, max_depth=10, n_estimators=50;, score=0.869 total time=   0.5s
[CV 1/5] END learning_rate=0.5, max_depth=10, n_estimators=100;, score=0.855 total time=   0.9s
[CV 2/5] END learning_rate=0.5, max_depth=10, n_estimators=100;, score=0.860 total time=   0.9s
[CV 3/5] END learning_rate=0.5, max_depth=10, n_estimators=100;, score=0.856 total time=   0.9s
[CV 4/5] END learning_rate=0.5, max_depth=10, n_estimators=100;, score=0.855 total time=   0.9s
[CV 5/5] END learning_rate=0.5, max_depth=10, n_estimators=100;, score=0.867 total time=   0.9s
[CV 1/5] END learning_rate=0.5, max_depth=10, n_estimators=200;, score=0.852 total time=   1.8s
[CV 2/5] END learning_rate=0.5, max_depth=10, n_estimators=200;, score=0.854 total time=   1.8s
[CV 3/5] END learning_rate=0.5, max_depth=1

[CV 4/5] END learning_rate=0.5, max_depth=20, n_estimators=50;, score=0.852 total time=   1.0s
[CV 5/5] END learning_rate=0.5, max_depth=20, n_estimators=50;, score=0.860 total time=   0.9s
[CV 1/5] END learning_rate=0.5, max_depth=20, n_estimators=100;, score=0.844 total time=   1.7s
[CV 2/5] END learning_rate=0.5, max_depth=20, n_estimators=100;, score=0.850 total time=   1.8s
[CV 3/5] END learning_rate=0.5, max_depth=20, n_estimators=100;, score=0.849 total time=   1.6s
[CV 4/5] END learning_rate=0.5, max_depth=20, n_estimators=100;, score=0.849 total time=   1.9s
[CV 5/5] END learning_rate=0.5, max_depth=20, n_estimators=100;, score=0.855 total time=   1.7s
[CV 1/5] END learning_rate=0.5, max_depth=20, n_estimators=200;, score=0.844 total time=   3.0s
[CV 2/5] END learning_rate=0.5, max_depth=20, n_estimators=200;, score=0.846 total time=   3.1s
[CV 3/5] END learning_rate=0.5, max_depth=20, n_estimators=200;, score=0.847 total time=   3.0s
[CV 4/5] END learning_rate=0.5, max_depth=

[CV 5/5] END learning_rate=0.1, max_depth=3, n_estimators=50;, score=0.857 total time=   0.1s
[CV 1/5] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.864 total time=   0.3s
[CV 2/5] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.864 total time=   0.3s
[CV 3/5] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.864 total time=   0.2s
[CV 4/5] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.858 total time=   0.2s
[CV 5/5] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=0.865 total time=   0.2s
[CV 1/5] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.870 total time=   0.5s
[CV 2/5] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.871 total time=   0.5s
[CV 3/5] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.868 total time=   0.4s
[CV 4/5] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.867 total time=   0.6s
[CV 5/5] END learning_rate=0.1, max_depth=3, n_esti

[CV 5/5] END learning_rate=0.1, max_depth=5, n_estimators=50;, score=0.865 total time=   0.2s
[CV 1/5] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.872 total time=   0.4s
[CV 2/5] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.874 total time=   0.5s
[CV 3/5] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.872 total time=   0.5s
[CV 4/5] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.867 total time=   0.4s
[CV 5/5] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.874 total time=   0.4s
[CV 1/5] END learning_rate=0.1, max_depth=5, n_estimators=200;, score=0.876 total time=   0.8s
[CV 2/5] END learning_rate=0.1, max_depth=5, n_estimators=200;, score=0.877 total time=   0.7s
[CV 3/5] END learning_rate=0.1, max_depth=5, n_estimators=200;, score=0.871 total time=   0.8s
[CV 4/5] END learning_rate=0.1, max_depth=5, n_estimators=200;, score=0.871 total time=   0.8s
[CV 5/5] END learning_rate=0.1, max_depth=5, n_esti

[CV 1/5] END learning_rate=0.1, max_depth=10, n_estimators=100;, score=0.870 total time=   1.0s
[CV 2/5] END learning_rate=0.1, max_depth=10, n_estimators=100;, score=0.873 total time=   1.0s
[CV 3/5] END learning_rate=0.1, max_depth=10, n_estimators=100;, score=0.867 total time=   1.0s
[CV 4/5] END learning_rate=0.1, max_depth=10, n_estimators=100;, score=0.870 total time=   1.0s
[CV 5/5] END learning_rate=0.1, max_depth=10, n_estimators=100;, score=0.873 total time=   1.0s
[CV 1/5] END learning_rate=0.1, max_depth=10, n_estimators=200;, score=0.867 total time=   1.8s
[CV 2/5] END learning_rate=0.1, max_depth=10, n_estimators=200;, score=0.871 total time=   1.4s
[CV 3/5] END learning_rate=0.1, max_depth=10, n_estimators=200;, score=0.865 total time=   1.7s
[CV 4/5] END learning_rate=0.1, max_depth=10, n_estimators=200;, score=0.869 total time=   1.7s
[CV 5/5] END learning_rate=0.1, max_depth=10, n_estimators=200;, score=0.874 total time=   1.8s
[CV 1/5] END learning_rate=0.1, max_dept

[CV 2/5] END learning_rate=0.1, max_depth=20, n_estimators=100;, score=0.864 total time=   2.2s
[CV 3/5] END learning_rate=0.1, max_depth=20, n_estimators=100;, score=0.857 total time=   1.9s
[CV 4/5] END learning_rate=0.1, max_depth=20, n_estimators=100;, score=0.862 total time=   2.0s
[CV 5/5] END learning_rate=0.1, max_depth=20, n_estimators=100;, score=0.868 total time=   2.1s
[CV 1/5] END learning_rate=0.1, max_depth=20, n_estimators=200;, score=0.853 total time=   4.0s
[CV 2/5] END learning_rate=0.1, max_depth=20, n_estimators=200;, score=0.861 total time=   3.4s
[CV 3/5] END learning_rate=0.1, max_depth=20, n_estimators=200;, score=0.857 total time=   3.6s
[CV 4/5] END learning_rate=0.1, max_depth=20, n_estimators=200;, score=0.856 total time=   3.5s
[CV 5/5] END learning_rate=0.1, max_depth=20, n_estimators=200;, score=0.863 total time=   3.7s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=10;, score=0.807 total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_dept

[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.847 total time=   0.3s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.843 total time=   0.3s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.844 total time=   0.2s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=0.849 total time=   0.5s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=0.850 total time=   0.5s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=0.852 total time=   0.6s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=0.845 total time=   0.6s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=0.848 total time=   0.6s
[CV 1/5] END learning_rate=0.01, max_depth=5, n_estimators=10;, score=0.854 total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_depth=5, n_estimators=10;, score=0.848 total time=   0.0s
[CV 3/5] END learning_rate=0.01, max_depth

[CV 4/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.849 total time=   0.4s
[CV 5/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.848 total time=   0.4s
[CV 1/5] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.857 total time=   0.9s
[CV 2/5] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.856 total time=   0.9s
[CV 3/5] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.858 total time=   0.9s
[CV 4/5] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.852 total time=   0.9s
[CV 5/5] END learning_rate=0.01, max_depth=5, n_estimators=200;, score=0.855 total time=   0.9s
[CV 1/5] END learning_rate=0.01, max_depth=10, n_estimators=10;, score=0.857 total time=   0.1s
[CV 2/5] END learning_rate=0.01, max_depth=10, n_estimators=10;, score=0.862 total time=   0.1s
[CV 3/5] END learning_rate=0.01, max_depth=10, n_estimators=10;, score=0.860 total time=   0.0s
[CV 4/5] END learning_rate=0.01, max_dep

[CV 5/5] END learning_rate=0.01, max_depth=10, n_estimators=100;, score=0.864 total time=   1.2s
[CV 1/5] END learning_rate=0.01, max_depth=10, n_estimators=200;, score=0.864 total time=   2.2s
[CV 2/5] END learning_rate=0.01, max_depth=10, n_estimators=200;, score=0.868 total time=   2.2s
[CV 3/5] END learning_rate=0.01, max_depth=10, n_estimators=200;, score=0.865 total time=   2.4s
[CV 4/5] END learning_rate=0.01, max_depth=10, n_estimators=200;, score=0.861 total time=   2.3s
[CV 5/5] END learning_rate=0.01, max_depth=10, n_estimators=200;, score=0.866 total time=   2.1s
[CV 1/5] END learning_rate=0.01, max_depth=20, n_estimators=10;, score=0.854 total time=   0.3s
[CV 2/5] END learning_rate=0.01, max_depth=20, n_estimators=10;, score=0.853 total time=   0.3s
[CV 3/5] END learning_rate=0.01, max_depth=20, n_estimators=10;, score=0.850 total time=   0.3s
[CV 4/5] END learning_rate=0.01, max_depth=20, n_estimators=10;, score=0.847 total time=   0.3s
[CV 5/5] END learning_rate=0.01, m

[CV 1/5] END learning_rate=0.01, max_depth=20, n_estimators=200;, score=0.857 total time=   7.4s
[CV 2/5] END learning_rate=0.01, max_depth=20, n_estimators=200;, score=0.862 total time=   7.7s
[CV 3/5] END learning_rate=0.01, max_depth=20, n_estimators=200;, score=0.859 total time=   7.6s
[CV 4/5] END learning_rate=0.01, max_depth=20, n_estimators=200;, score=0.861 total time=   7.1s
[CV 5/5] END learning_rate=0.01, max_depth=20, n_estimators=200;, score=0.866 total time=   7.2s
[CV 1/5] END learning_rate=0.001, max_depth=3, n_estimators=10;, score=0.807 total time=   0.0s
[CV 2/5] END learning_rate=0.001, max_depth=3, n_estimators=10;, score=0.808 total time=   0.0s
[CV 3/5] END learning_rate=0.001, max_depth=3, n_estimators=10;, score=0.805 total time=   0.0s
[CV 4/5] END learning_rate=0.001, max_depth=3, n_estimators=10;, score=0.802 total time=   0.0s
[CV 5/5] END learning_rate=0.001, max_depth=3, n_estimators=10;, score=0.807 total time=   0.0s
[CV 1/5] END learning_rate=0.001, m

[CV 2/5] END learning_rate=0.001, max_depth=3, n_estimators=200;, score=0.845 total time=   0.5s
[CV 3/5] END learning_rate=0.001, max_depth=3, n_estimators=200;, score=0.805 total time=   0.5s
[CV 4/5] END learning_rate=0.001, max_depth=3, n_estimators=200;, score=0.842 total time=   0.6s
[CV 5/5] END learning_rate=0.001, max_depth=3, n_estimators=200;, score=0.844 total time=   0.6s
[CV 1/5] END learning_rate=0.001, max_depth=5, n_estimators=10;, score=0.854 total time=   0.0s
[CV 2/5] END learning_rate=0.001, max_depth=5, n_estimators=10;, score=0.848 total time=   0.0s
[CV 3/5] END learning_rate=0.001, max_depth=5, n_estimators=10;, score=0.845 total time=   0.0s
[CV 4/5] END learning_rate=0.001, max_depth=5, n_estimators=10;, score=0.848 total time=   0.0s
[CV 5/5] END learning_rate=0.001, max_depth=5, n_estimators=10;, score=0.843 total time=   0.0s
[CV 1/5] END learning_rate=0.001, max_depth=5, n_estimators=50;, score=0.854 total time=   0.2s
[CV 2/5] END learning_rate=0.001, ma

[CV 3/5] END learning_rate=0.001, max_depth=5, n_estimators=200;, score=0.848 total time=   0.9s
[CV 4/5] END learning_rate=0.001, max_depth=5, n_estimators=200;, score=0.848 total time=   0.7s
[CV 5/5] END learning_rate=0.001, max_depth=5, n_estimators=200;, score=0.845 total time=   0.7s
[CV 1/5] END learning_rate=0.001, max_depth=10, n_estimators=10;, score=0.856 total time=   0.1s
[CV 2/5] END learning_rate=0.001, max_depth=10, n_estimators=10;, score=0.861 total time=   0.1s
[CV 3/5] END learning_rate=0.001, max_depth=10, n_estimators=10;, score=0.859 total time=   0.1s
[CV 4/5] END learning_rate=0.001, max_depth=10, n_estimators=10;, score=0.854 total time=   0.0s
[CV 5/5] END learning_rate=0.001, max_depth=10, n_estimators=10;, score=0.856 total time=   0.1s
[CV 1/5] END learning_rate=0.001, max_depth=10, n_estimators=50;, score=0.857 total time=   0.6s
[CV 2/5] END learning_rate=0.001, max_depth=10, n_estimators=50;, score=0.862 total time=   0.5s
[CV 3/5] END learning_rate=0.0

[CV 4/5] END learning_rate=0.001, max_depth=10, n_estimators=200;, score=0.857 total time=   1.5s
[CV 5/5] END learning_rate=0.001, max_depth=10, n_estimators=200;, score=0.860 total time=   1.4s
[CV 1/5] END learning_rate=0.001, max_depth=20, n_estimators=10;, score=0.851 total time=   0.2s
[CV 2/5] END learning_rate=0.001, max_depth=20, n_estimators=10;, score=0.851 total time=   0.2s
[CV 3/5] END learning_rate=0.001, max_depth=20, n_estimators=10;, score=0.849 total time=   0.4s
[CV 4/5] END learning_rate=0.001, max_depth=20, n_estimators=10;, score=0.846 total time=   0.2s
[CV 5/5] END learning_rate=0.001, max_depth=20, n_estimators=10;, score=0.850 total time=   0.2s
[CV 1/5] END learning_rate=0.001, max_depth=20, n_estimators=50;, score=0.852 total time=   1.7s
[CV 2/5] END learning_rate=0.001, max_depth=20, n_estimators=50;, score=0.852 total time=   1.1s
[CV 3/5] END learning_rate=0.001, max_depth=20, n_estimators=50;, score=0.851 total time=   1.1s
[CV 4/5] END learning_rate=0

[CV 5/5] END learning_rate=0.001, max_depth=20, n_estimators=200;, score=0.856 total time=   7.2s


GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=-1,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method='hist',
                  

In [27]:
grid.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}

In [28]:
## retrain modeling 
new_model = XGBClassifier(objective='binary:logistic',use_label_encoder=False,**grid.best_params_)
new_model.fit(train_X, train_y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [29]:
## test score
y_pred = new_model.predict(test_X)
accuracy_score(test_y,y_pred)

0.8727223094246912