In [3]:
import numpy as np, pandas as pd, os, gc
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

# 加载数据

In [4]:
X_train = pd.read_csv('./clean_data/xgb_fraud_with_magic_train.csv')
X_test = pd.read_csv('./clean_data/xgb_fraud_with_magic_test.csv')

# 选择训练数据column

In [5]:
train_cols = ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card5',
       'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain',
       'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9',
       'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5',
       'D10', 'D11', 'D15', 'M1', 'M2', 'M3', 'M4', 'M6', 'M7', 'M8',
       'M9', 'V1', 'V3', 'V4', 'V6', 'V8', 'V11', 'V13', 'V14', 'V17',
       'V20', 'V23', 'V26', 'V27', 'V30', 'V36', 'V37', 'V40', 'V41',
       'V44', 'V47', 'V48', 'V54', 'V56', 'V59', 'V62', 'V65', 'V67',
       'V68', 'V70', 'V76', 'V78', 'V80', 'V82', 'V86', 'V88', 'V89',
       'V91', 'V107', 'V108', 'V111', 'V115', 'V117', 'V120', 'V121',
       'V123', 'V124', 'V127', 'V129', 'V130', 'V136', 'V138', 'V139',
       'V142', 'V147', 'V156', 'V160', 'V162', 'V165', 'V166', 'V169',
       'V171', 'V173', 'V175', 'V176', 'V178', 'V180', 'V182', 'V185',
       'V187', 'V188', 'V198', 'V203', 'V205', 'V207', 'V209', 'V210',
       'V215', 'V218', 'V220', 'V221', 'V223', 'V224', 'V226', 'V228',
       'V229', 'V234', 'V235', 'V238', 'V240', 'V250', 'V252', 'V253',
       'V257', 'V258', 'V260', 'V261', 'V264', 'V266', 'V267', 'V271',
       'V274', 'V277', 'V281', 'V283', 'V284', 'V285', 'V286', 'V289',
       'V291', 'V294', 'V296', 'V297', 'V301', 'V303', 'V305', 'V307',
       'V309', 'V310', 'V314', 'V320', 'id_01', 'id_02', 'id_03', 'id_04',
       'id_05', 'id_06', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13',
       'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_28',
       'id_29', 'id_31', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType',
       'DeviceInfo', 'cents', 'addr1_FE', 'card1_FE', 'card2_FE',
       'card3_FE', 'P_emaildomain_FE', 'card1_addr1',
       'card1_addr1_P_emaildomain', 'card1_addr1_FE',
       'card1_addr1_P_emaildomain_FE', 'TransactionAmt_card1_mean',
       'TransactionAmt_card1_std', 'TransactionAmt_card1_addr1_mean',
       'TransactionAmt_card1_addr1_std',
       'TransactionAmt_card1_addr1_P_emaildomain_mean',
       'TransactionAmt_card1_addr1_P_emaildomain_std', 'D9_card1_mean',
       'D9_card1_std', 'D9_card1_addr1_mean', 'D9_card1_addr1_std',
       'D9_card1_addr1_P_emaildomain_mean',
       'D9_card1_addr1_P_emaildomain_std', 'D11_card1_mean',
       'D11_card1_std', 'D11_card1_addr1_mean', 'D11_card1_addr1_std',
       'D11_card1_addr1_P_emaildomain_mean',
       'D11_card1_addr1_P_emaildomain_std']

In [6]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [7]:
import datetime
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')
X_train['DT_M'] = X_train['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
X_train['DT_M'] = (X_train['DT_M'].dt.year-2017)*12 + X_train['DT_M'].dt.month 

y_train = X_train['isFraud'].copy()


X_test['DT_M'] = X_test['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
X_test['DT_M'] = (X_test['DT_M'].dt.year-2017)*12 + X_test['DT_M'].dt.month 

In [8]:
%%time
X_train = reduce_mem_usage(X_train)
y_train = y_train.astype(np.int8)

X_test = reduce_mem_usage(X_test)

Memory usage of dataframe is 1099.33 MB
Memory usage after optimization is: 277.09 MB
Decreased by 74.8%
Memory usage of dataframe is 939.38 MB
Memory usage after optimization is: 240.16 MB
Decreased by 74.4%
CPU times: user 38.2 s, sys: 34.8 s, total: 1min 13s
Wall time: 1min 13s


In [9]:
train_cols = X_train.columns
train_cols = train_cols.drop(['isFraud', 'TransactionDT', 'DT_M'])
train_cols

Index(['TransactionID', 'TransactionAmt', 'ProductCD', 'card1', 'card2',
       'card3', 'card4', 'card5', 'card6', 'addr1',
       ...
       'D9_card1_addr1_mean', 'D9_card1_addr1_std',
       'D9_card1_addr1_P_emaildomain_mean', 'D9_card1_addr1_P_emaildomain_std',
       'D11_card1_mean', 'D11_card1_std', 'D11_card1_addr1_mean',
       'D11_card1_addr1_std', 'D11_card1_addr1_P_emaildomain_mean',
       'D11_card1_addr1_P_emaildomain_std'],
      dtype='object', length=241)

# 模型训练

In [10]:
# TRAIN 75% PREDICT 25%
idxT = X_train.index[:3*len(X_train)//4]
idxV = X_train.index[3*len(X_train)//4:]

In [11]:
# %%time
# import xgboost as xgb
# from sklearn.model_selection import GridSearchCV
# default_params = {'n_estimators':5000, 'cmax_depth':12, 'learning_rate':0.02,
#                   'subsample':0.8, 'colsample_bytree':0.4, 'missing':-1}

# params_select = {'n_estimators':[3000, 5000, 7000], 
#                'max_depth':[8, 12, 16], 
#                'learning_rate':[0.01, 0.03, 0.05, 0.08], 
#                'subsample':[0.4, 0.6, 0.8, 1.0, 1.2], 
#                'colsample_bytree':[0.2, 0.4, 0.6, 0.8, 1.0]}

# clf = xgb.XGBClassifier(
#             missing=-1,
#             eval_metric='auc',
#             # USE CPU
#             nthread=24,
#             tree_method='hist'
#             # USE GPU
# #                 tree_method='gpu_hist' 
#         )
        
# gsearch = GridSearchCV(clf, param_grid=params_select, scoring='roc_auc', cv=[(idxT, idxV)])
# gsearch.fit(X_train, y_train)

In [12]:
# print("Best score: %0.3f" % gsearch.best_score_)
# print("Best parameters set:")
# best_parameters = gsearch.best_estimator_.get_params()
# for param_name in sorted(params_select.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))

# result = pd.DataFrame.from_dict(gsearch.cv_results_)
# result.to_csv('./output/xgb_result.csv')

In [13]:
# %%time
# import xgboost as xgb
# params_select = {'n_estimators':[1000, 3000, 5000, 7000, 9000], 
#                'max_depth':[4, 8, 16, 20], 
#                'learning_rate':[0.01, 0.03, 0.05, 0.08, 0.11], 
#                'subsample':[0.4, 0.6, 1.0, 1.2], 
#                'colsample_bytree':[0.2, 0.6, 0.8, 1.0]}
# result = pd.DataFrame(columns=['n_estimators', 'n_estimators_params', 'max_depth', 'max_depth_params',
#                                'learning_rate', 'learning_rate_params', 'subsample', 'subsample_params',
#                                'colsample_bytree', 'colsample_bytree_params' ])

# for (name, value_list) in params_select.items():
#     for i, value in enumerate(value_list):
#         params = {name: value}
        
#         print("params：%s, value:%d"%(name, value))
#         clf = xgb.XGBClassifier(
#             missing=-1,
#             eval_metric='auc',
#             # USE CPU
#             nthread=24,
#             tree_method='hist'
#             # USE GPU
# #                 tree_method='gpu_hist' 
#         )
#         clf.set_params(**params)
#         clf.fit(X_train[train_cols].iloc[idxT], y_train.iloc[idxT], 
#                 eval_set=[(X_train[train_cols].iloc[idxV],y_train.iloc[idxV])],
#                 verbose=100, early_stopping_rounds=200)

#         result.loc[i, name] = roc_auc_score(y_train.iloc[idxV], clf.predict_proba(X_train[train_cols].iloc[idxV])[:,1])
#         result.loc[i, name+'_params'] = value
#     result.to_csv('./output/xgb_'+name+'_params.csv')

# 模型

In [None]:
%%time
import xgboost as xgb
params = {'n_estimators':5000, 'max_depth':12, 'learning_rate':0.02,
                  'subsample':0.8, 'colsample_bytree':0.4}

skf = GroupKFold(n_splits=6)

y_preds = np.zeros(X_test.shape[0])
y_train_preds = np.zeros(X_train.shape[0])

feature_importances = pd.DataFrame()
feature_importances['feature'] = train_cols

for i, (idxT, idxV) in enumerate( skf.split(X_train, y_train, groups=X_train['DT_M']) ):
    print(' rows of train =',len(idxT),'rows of holdout =',len(idxV))
    clf = xgb.XGBClassifier(missing=-1, eval_metric='auc',
        # USE CPU
        nthread=24, tree_method='hist'
        # USE GPU
#         tree_method='gpu_hist' 
    )
    clf.set_params(**params)
    clf.fit(X_train[train_cols].iloc[idxT], y_train.iloc[idxT], 
            eval_set=[(X_train[train_cols].iloc[idxV],y_train.iloc[idxV])],
            verbose=100, early_stopping_rounds=200)

    feature_importances[f'fold_{i + 1}'] = clf.feature_importances_
    y_train_preds[idxV] += clf.predict_proba(X_train[train_cols].iloc[idxV])[:,1]
    y_preds += clf.predict_proba(X_test[train_cols])[:,1]/skf.n_splits
    del clf
    gc.collect()

 rows of train = 453219 rows of holdout = 137321
[0]	validation_0-auc:0.75801
Will train until validation_0-auc hasn't improved in 200 rounds.
[100]	validation_0-auc:0.87186
[200]	validation_0-auc:0.89072
[300]	validation_0-auc:0.89992
[400]	validation_0-auc:0.90268
[500]	validation_0-auc:0.90363
[600]	validation_0-auc:0.90312
[700]	validation_0-auc:0.90371
[800]	validation_0-auc:0.90325
Stopping. Best iteration:
[663]	validation_0-auc:0.90399

 rows of train = 488908 rows of holdout = 101632
[0]	validation_0-auc:0.82505
Will train until validation_0-auc hasn't improved in 200 rounds.
[100]	validation_0-auc:0.90321
[200]	validation_0-auc:0.92941
[300]	validation_0-auc:0.94144
[400]	validation_0-auc:0.94542
[500]	validation_0-auc:0.94640
[600]	validation_0-auc:0.94680
[700]	validation_0-auc:0.94662
[800]	validation_0-auc:0.94682
[900]	validation_0-auc:0.94658
Stopping. Best iteration:
[772]	validation_0-auc:0.94684

 rows of train = 497955 rows of holdout = 92585
[0]	validation_0-auc:0.

In [17]:
print ('XGBoost CV auc', roc_auc_score(y_train, y_train_preds))

# 存储预测结果
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission.isFraud = y_preds
sample_submission.to_csv('./output/xgb_submission.csv', index=False)

XGBoost CV auc 0.9417165849751634


In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns
plt.figure(figsize=(16, 16))
feature_importances['average'] = feature_importances[[f'fold_{fold_n + 1}' for fold_n in range(skf.n_splits)]].mean(axis=1)
sns.barplot(data=feature_importances.sort_values(by='average', ascending=False).head(50), x='average', y='feature');
plt.title('50 most important feature');