In [71]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn import metrics

import random
import pandas as pd 
import numpy as np

In [9]:
data_le = pd.read_csv('Data/encoded_data.csv')
features = data_le.columns.tolist()
features.remove('y')

In [20]:
def CVXgb(df, n_splits=5,  rando_state=2021, features=features, if_print=True, 
                  *args, **kwargs):
    # Prepare KStratifiedKFOLD
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rando_state)
    
    # Make copy of data
    data = df.copy()
    
    # Prepare empty lists
    train_results = []
    test_results = []
    preds = []
    indicies = []
    # Prepare int to count fold s
    fold_number = 1
       
    for train, test in kf.split(data.index.values, data['y']):
        # Prepare KNN model 
        model = XGBClassifier(use_label_encoder=False, *args, **kwargs)
        model.fit(data.loc[train, features], data.loc[train, 'y'])
        
        # Make predictions
        train_preds = model.predict(data.loc[train, features])
        test_preds = model.predict(data.loc[test, features])
        preds.append(test_preds)
        
        # Add indicies of test data
        indicies.append(df.iloc[test].index.tolist().copy())
        
        # Prepare ROC_AUC score
        train_roc = metrics.roc_auc_score(data.loc[train, 'y'], train_preds)
        test_roc = metrics.roc_auc_score(data.loc[test, 'y'], test_preds)
        
        # Add ROC_AUC to lis
        train_results.append(train_roc)
        test_results.append(test_roc)
        
        if if_print:
            print(f'FOLD NUMBER: {fold_number}')
            print(f'ROC_AUC ON TRAIN SCORE {train_roc}')
            print(f'ROC_AUC ON TEST SCORE {test_roc}')
                  
        fold_number += 1 
        
    return train_results, test_results, preds, indicies 

In [21]:
# Test basic model 
train_results, test_results, preds, indicies  = CVXgb(data_le, **{ 
    'max_depth': 8, 
    'subsample': 0.7, 
    'colsample_bytree': 0.7,
})

FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.8773112229617339
ROC_AUC ON TEST SCORE 0.7859788526067999
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.8734216973933209
ROC_AUC ON TEST SCORE 0.7866418670819156
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.8733733809154742
ROC_AUC ON TEST SCORE 0.7968434112448131
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.876954286026812
ROC_AUC ON TEST SCORE 0.7933740087884272
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.8766545120635696
ROC_AUC ON TEST SCORE 0.8006030318467773


In [67]:
def XGB_h_param():
    test_list = []
    for i in range(50):
        
        param_dict = {
        'max_depth': random.randrange(4, 14, 1),
        'subsample': random.choice([x/10 for x in list(range(3,10))]), 
        'colsample_bytree': random.choice([x/10 for x in list(range(3,10))]),
        'reg_lambda': random.choice([(x/10)**2 for x in list(range(2, 8))])
        }

        # Use gamme if lambda is low and in half situations (in order to try models without L2 and gamma)
        if (param_dict['reg_lambda']) and (random.randrange(0,2,1) == 1):
            param_dict['gamma'] = random.choice([0, 0.01, 0.1, 0.5, 1, 2, 5])
            
        train_results, test_results, preds, indicies  = CVXgb(data_le, **param_dict)
        test_list.append([param_dict.items(), np.mean(test_results)])
        
        print([param_dict.items(), np.mean(train_results), np.mean(test_results)])
            
    return test_list 

In [68]:
tune_xgb = XGB_h_param()

FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.8548742055462326
ROC_AUC ON TEST SCORE 0.7794619826613569
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.8555974547971332
ROC_AUC ON TEST SCORE 0.7837485821942827
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.8521692152472983
ROC_AUC ON TEST SCORE 0.7952716554129398
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.8536492054890735
ROC_AUC ON TEST SCORE 0.7908912616525609
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.8512001105356445
ROC_AUC ON TEST SCORE 0.80165913222356
[dict_items([('max_depth', 7), ('subsample', 0.4), ('colsample_bytree', 0.9), ('reg_lambda', 0.48999999999999994)]), 0.8534980383230764, 0.79020652282894]
FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.9516130692814944
ROC_AUC ON TEST SCORE 0.7816172833409253
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.9513077178872956
ROC_AUC ON TEST SCORE 0.778834568017204
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.9509503690638365
ROC_AUC ON TEST SCORE 0.7860445322469501
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.9518624716374603
ROC_AUC

In [73]:
sorted_xgb = sorted(tune_xgb, key=lambda x: x[-1], reverse=True)
sorted_xgb

[[dict_items([('max_depth', 5), ('subsample', 0.9), ('colsample_bytree', 0.4), ('reg_lambda', 0.36)]),
  0.8000688604069088],
 [dict_items([('max_depth', 5), ('subsample', 0.9), ('colsample_bytree', 0.9), ('reg_lambda', 0.04000000000000001), ('gamma', 0.5)]),
  0.7993539195863975],
 [dict_items([('max_depth', 5), ('subsample', 0.7), ('colsample_bytree', 0.4), ('reg_lambda', 0.16000000000000003), ('gamma', 0.01)]),
  0.7982915556139341],
 [dict_items([('max_depth', 4), ('subsample', 0.9), ('colsample_bytree', 0.6), ('reg_lambda', 0.16000000000000003)]),
  0.7981392280607935],
 [dict_items([('max_depth', 6), ('subsample', 0.7), ('colsample_bytree', 0.3), ('reg_lambda', 0.36), ('gamma', 0.01)]),
  0.7976764202317496],
 [dict_items([('max_depth', 4), ('subsample', 0.7), ('colsample_bytree', 0.5), ('reg_lambda', 0.16000000000000003), ('gamma', 0.5)]),
  0.7975235072017147],
 [dict_items([('max_depth', 8), ('subsample', 0.4), ('colsample_bytree', 0.4), ('reg_lambda', 0.36), ('gamma', 5)]),
 

In [120]:
def find_features(data):
    X_train, X_test, y_train, y_test = train_test_split(data[features], data_le['y'], test_size=0.25)
    model = XGBClassifier(max_depth=5, subsample=0.9, colsample_bytree=0.4, reg_lambda=0.36, use_label_encoder=False)
    model.fit(X_test, y_test)
    feature_importances = []
    importance_values = model.feature_importances_
    importance_names = X_train.columns.tolist()
    for i in range(len(importance_values)):
        feature_importances.append([importance_names[i], importance_values[i]])
    sorted_importances = sorted(feature_importances, key=lambda x: x[-1], reverse=True)
    return sorted_importances

In [121]:
find_features(data_le)



[['martial_status', 0.3233115],
 ['capital_gain', 0.13365893],
 ['education_num', 0.1179173],
 ['sex', 0.09177802],
 ['relationship', 0.076752335],
 ['capital_loss', 0.05240771],
 ['age', 0.044324327],
 ['occupation', 0.043233737],
 ['hours_per_week', 0.030805232],
 ['fnlwgt', 0.020445198],
 ['workclass', 0.018302206],
 ['education', 0.017441545],
 ['race', 0.015688144],
 ['naive_country', 0.013933842]]

In [111]:
feature_importances = []
importance_values = best_model.feature_importances_
importance_names = X_train.columns.tolist()
for i in range(len(importance_values)):
    feature_importances.append([importance_names[i], importance_values[i]])
sorted_importances = sorted(feature_importances, key=lambda x: x[-1], reverse=True)

In [122]:
sorted_importances

[['martial_status', 0.26695514],
 ['capital_gain', 0.14750516],
 ['relationship', 0.1365918],
 ['sex', 0.10572314],
 ['education_num', 0.09393712],
 ['capital_loss', 0.047303792],
 ['occupation', 0.044974726],
 ['age', 0.044208914],
 ['hours_per_week', 0.024951177],
 ['fnlwgt', 0.021042665],
 ['education', 0.01846338],
 ['workclass', 0.018258749],
 ['race', 0.016831458],
 ['naive_country', 0.013252837]]

In [124]:
import pickle 
with open('xgb_info.pkl', 'wb') as f:
    pickle.dump(sorted_importances, f)