In [1]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
df=pd.read_csv('df_1000.csv')

In [3]:
df.shape

(350576, 1022)

In [4]:
for bin_feature in ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
    df[bin_feature], uniques = pd.factorize(df[bin_feature])

In [5]:
set(df['FLAG_OWN_CAR'])

{0L, 1L}

In [6]:
set(df['FLAG_OWN_REALTY'])

{0L, 1L}

In [7]:
df.shape

(350576, 1022)

In [8]:
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [9]:
df, cat_cols = one_hot_encoder(df, nan_as_category=False)

In [10]:
df.shape

(350576, 1141)

In [11]:
USELESS_COLUMNS = ['FLAG_DOCUMENT_10',
                   'FLAG_DOCUMENT_12',
                   'FLAG_DOCUMENT_13',
                   'FLAG_DOCUMENT_14',
                   'FLAG_DOCUMENT_15',
                   'FLAG_DOCUMENT_16',
                   'FLAG_DOCUMENT_17',
                   'FLAG_DOCUMENT_19',
                   'FLAG_DOCUMENT_2',
                   'FLAG_DOCUMENT_20',
                   'FLAG_DOCUMENT_21']

In [12]:
df= df.drop(USELESS_COLUMNS,axis=1)

In [13]:
df.shape

(350576, 1130)

In [14]:
import xgboost as xgb
from xgboost import plot_importance

In [15]:
train_df = df[df['TARGET'].notnull()]
test_df = df[df['TARGET'].isnull()]
print("Starting xgb. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

Starting xgb. Train shape: (307511, 1130), test shape: (43065, 1130)


In [16]:
del df
gc.collect()

28

In [17]:
# 测试 取100行
train_df=train_df.iloc[:100,:]
test_df=test_df.iloc[:100,:]

In [18]:
feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
X=train_df[feats]
y=train_df['TARGET']

In [19]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [20]:
import pandas as pd
import numpy as np
from __future__ import print_function
from __future__ import division
import xgboost as xgb
from sklearn.cross_validation import cross_val_score
from bayes_opt import bayesian_optimization
import sklearn.cross_validation as cv



In [21]:
def xgboostcv(max_depth,
              gamma,
              min_child_weight,
              colsample_bylevel,
              subsample,
              colsample_bytree,
              reg_lambda,
              reg_alpha,
              silent=True,
              objective='binary:logistic',
              learning_rate=0.02,
              n_estimators=5000):
    return cross_val_score(xgb.XGBClassifier(max_depth=int(max_depth),
                                             learning_rate=learning_rate,
                                             n_estimators=n_estimators,
                                             colsample_bylevel=colsample_bylevel,
                                             silent=silent,
                                            objective=objective,
                                            gamma=gamma,
                                            min_child_weight=min_child_weight,
                                            subsample=subsample,
                                            colsample_bytree=colsample_bytree,
                                            reg_alpha=reg_alpha,
                                            reg_lambda=reg_lambda),
                           X,
                           y,
                           'roc_auc',
                           cv=5).mean()

if __name__ == "__main__":
    
    xgboostBO = bayesian_optimization.BayesianOptimization(xgboostcv,
                                 {'gamma':(0.01,1),
                                 'max_depth' : (4,20),
                                 'min_child_weight' : (2,10),
                                 'subsample' : (0.4,0.9),
                                 'colsample_bytree' : (0.4,0.9),
                                 'colsample_bylevel' : (0.7,1),
                                 'reg_alpha' : (0,0.01),
                                 'reg_lambda' : (0,0.01)})
    xgboostBO.maximize(init_points=2, n_iter = 2)
    print('-'*53)
    print('Final Results')
    print('XGBOOST: %f' % xgboostBO.res['max']['max_val'])

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bylevel |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   reg_alpha |   reg_lambda |   subsample | 
    1 | 00m18s | [35m   0.40643[0m | [32m             0.9056[0m | [32m            0.7253[0m | [32m   0.9209[0m | [32m    13.1775[0m | [32m            4.8323[0m | [32m     0.0047[0m | [32m      0.0078[0m | [32m     0.7938[0m | 
    2 | 00m17s | [35m   0.52632[0m | [32m             0.9037[0m | [32m            0.6062[0m | [32m   0.6080[0m | [32m    16.9370[0m | [32m            5.4918[0m | [32m     0.0002[0m | [32m      0.0095[0m | [32m     0.4870[0m | 
[31mBayesian Optimization[0m
[94m------------------------------------------------------------------------------------------------------------------

In [22]:
folds = KFold(n_splits= 5, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
feature_importance_df = pd.DataFrame()
feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
dtest=xgb.DMatrix(test_df[feats])
    
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
    dtrain = xgb.DMatrix(train_df[feats].iloc[train_idx],train_df['TARGET'].iloc[train_idx])
    dvalid = xgb.DMatrix(train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx])
    valid_y=train_df['TARGET'].iloc[valid_idx]

       # xgb
    params = {'eval_metric': 'auc',
              'objective': 'binary:logistic',
              'booster':'gbtree',
              'tree_method': 'auto',
              'nthread' : 4,
              'eta' : 0.02,
               'max_leaves': 40,
              'max_depth' : 16,
              'max_bin': 255,
              'min_child_weight' : 4,
              'subsample' : 0.5,
              'colsample_bytree' : 0.5,
              'colsample_bylevel' : 1,
              'alpha' : 0.001,
              'lambda' : 0.001,
              'scale_pos_weight': 1}
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        
    model=xgb.train(params, dtrain, 5000, watchlist, maximize=True, early_stopping_rounds = 100, verbose_eval=100)
    oof_preds[valid_idx] = model.predict(dvalid, ntree_limit=model.best_ntree_limit)
    sub_preds += model.predict(dtest,ntree_limit=model.best_ntree_limit) / folds.n_splits

    fold_importance_df = pd.DataFrame()
    fold_importance_df = pd.DataFrame(model.get_fscore().items(), columns=['feature','importance']).sort_values('importance', ascending=False)
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
    del model, dtrain, dvalid, valid_y

print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
test_df['TARGET'] = sub_preds
test_df[['SK_ID_CURR', 'TARGET']].to_csv('xgb_1000feature.csv', index= False)

[12:35:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1


XGBoostError: [12:35:18] src/metric/rank_metric.cc:135: Check failed: !auc_error AUC: the dataset only contains pos or neg samples

In [None]:
cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
plt.figure(figsize=(8, 10))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
plt.title('XGB Features (avg over folds)')
plt.tight_layout
plt.show()

In [None]:
feature_importance_df.to_csv('feature_importance_xgb1000features.csv', index=False)