In [1]:
import pandas as pd
df = pd.read_csv('./dataset/cleandata.csv', index_col=[0])
x = df.drop('income', axis=1)
y = df['income']

In [2]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

In [3]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() 
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [4]:
import xgboost

In [5]:
xgmodel = xgboost.XGBClassifier()

In [6]:
xgmodel.fit(x_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [7]:
dtrain = xgboost.DMatrix(x_train, label=y_train)
dtest=xgboost.DMatrix(x_test, label=y_test)

In [8]:
params = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective':'binary:logistic',
}

In [9]:
params['eval_metric'] = "auc"

In [11]:
num_boost_round=500

In [12]:
model = xgboost.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=4
)

[0]	Test-auc:0.90062
[1]	Test-auc:0.90438
[2]	Test-auc:0.90722
[3]	Test-auc:0.90995
[4]	Test-auc:0.91265
[5]	Test-auc:0.91506
[6]	Test-auc:0.91666
[7]	Test-auc:0.91836
[8]	Test-auc:0.91896
[9]	Test-auc:0.91995
[10]	Test-auc:0.92050
[11]	Test-auc:0.92165
[12]	Test-auc:0.92258
[13]	Test-auc:0.92342
[14]	Test-auc:0.92424
[15]	Test-auc:0.92598
[16]	Test-auc:0.92634
[17]	Test-auc:0.92676
[18]	Test-auc:0.92771
[19]	Test-auc:0.92832
[20]	Test-auc:0.92866
[21]	Test-auc:0.92908
[22]	Test-auc:0.92952
[23]	Test-auc:0.93018
[24]	Test-auc:0.93044
[25]	Test-auc:0.93071
[26]	Test-auc:0.93094
[27]	Test-auc:0.93123
[28]	Test-auc:0.93140
[29]	Test-auc:0.93157
[30]	Test-auc:0.93194
[31]	Test-auc:0.93209
[32]	Test-auc:0.93238
[33]	Test-auc:0.93272
[34]	Test-auc:0.93284
[35]	Test-auc:0.93295
[36]	Test-auc:0.93322
[37]	Test-auc:0.93325
[38]	Test-auc:0.93354
[39]	Test-auc:0.93362
[40]	Test-auc:0.93376
[41]	Test-auc:0.93386
[42]	Test-auc:0.93397
[43]	Test-auc:0.93417
[44]	Test-auc:0.93418
[45]	Test-auc:0.9344

In [13]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(5,20)
    for min_child_weight in range(0,5)
]

In [None]:
max_auc = float(0)
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(max_depth,min_child_weight))    

    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight    

    cv_results = xgboost.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=3,
        metrics={'auc'},
        early_stopping_rounds=4
    )    
    
    mean_auc = cv_results['test-auc-mean'].max()
    boost_rounds = cv_results['test-auc-mean'].argmax()
    print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, AUC: {}".format(best_params[0], best_params[1], max_auc))

CV with max_depth=5, min_child_weight=0
	AUC 0.943984 for 349 rounds
CV with max_depth=5, min_child_weight=1
	AUC 0.9426093333333334 for 338 rounds
CV with max_depth=5, min_child_weight=2
	AUC 0.9418190000000001 for 351 rounds
CV with max_depth=5, min_child_weight=3
	AUC 0.9395986666666666 for 255 rounds
CV with max_depth=5, min_child_weight=4
	AUC 0.9382153333333333 for 224 rounds
CV with max_depth=6, min_child_weight=0
	AUC 0.9485783333333333 for 426 rounds
CV with max_depth=6, min_child_weight=1
	AUC 0.9437286666666665 for 261 rounds
CV with max_depth=6, min_child_weight=2
	AUC 0.9421823333333332 for 221 rounds
CV with max_depth=6, min_child_weight=3
	AUC 0.9424513333333334 for 326 rounds
CV with max_depth=6, min_child_weight=4
	AUC 0.9397806666666666 for 200 rounds
CV with max_depth=7, min_child_weight=0


In [None]:
params['max_depth'] = 16
params['min_child_weight'] = 1

In [None]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(5,10)]
    for colsample in [i/10. for i in range(5,10)]
]

In [None]:
max_auc = float(0)
best_params = None
for subsample, colsample in gridsearch_params:
    print("CV with subsample={}, colsample={}".format(max_depth,min_child_weight))    
    
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    
    # Run CV
    cv_results_pca = xgboost.cv(
        params,
        dtrain_pca,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=3,
        metrics={'auc'},
        early_stopping_rounds=4
    )    
    
    mean_auc = cv_results_pca['test-auc-mean'].max()
    boost_rounds = cv_results_pca['test-auc-mean'].argmax()
    print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (subsample,colsample)
print("Best params: {}, {}, AUC: {}".format(best_params[0], best_params[1], max_auc))

In [None]:
params['subsample'] = 1.0
params['colsample_bytree'] = 1.0

In [None]:
max_auc = float(0)
best_params = None
for eta in [1, 0.8,0.6,0.4,.2]:
    print("CV with eta={}".format(eta))  

    params['eta'] = eta  

    cv_results_pca = xgboost.cv(
        params,
        dtrain_pca,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=3,
        metrics={'auc'},
        early_stopping_rounds=4
    )    

    mean_auc = cv_results_pca['test-auc-mean'].max()
    boost_rounds = cv_results_pca['test-auc-mean'].argmax()
    print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = eta
print("Best params: {}, AUC: {}".format(best_params, max_auc))

In [None]:
params["eta"]=0.4

In [None]:
xgmodel = xgboost.XGBClassifier(n_estimators=700, max_depth=16, min_child_weight=1, eta=0.4, subsample=0.5, colsample_bytree=0.7, objective="binary:logistic")

xgmodel.fit(x_train, y_train)
y_pred = xgmodel.predict(x_test)
print(classification_report(y_test, y_pred))
y_pred_prob = xgmodel.predict_proba(x_test)[:,1]
print(roc_auc_score(y_test, y_pred_prob))