In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from six.moves import cPickle
from sklearn.cross_validation import StratifiedKFold
import graphviz
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
% matplotlib inline
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

plt.style.use('ggplot')

In [3]:
def  auc_me(bst, y_valid, valid):
        y_true = y_valid
        y_pred = bst.predict(valid)
        return roc_auc_score(y_true, y_pred)

In [4]:
def  f_me(bst, y_valid, valid):
        y_true = y_valid
        y_pred = bst.predict(valid)
        y_pred_binary = np.array([binarize(y) for y in y_pred])

        return f1_score(y_true, y_pred_binary)

In [5]:
def run_xgb(X, Y, cust_list, param, save=False, verbose=True, model=True):
    cust_Y = cust_list.SIU_CUST_YN
    # select train, among the cust 
    skf = StratifiedKFold(cust_Y, n_folds = 10, shuffle = True)
    cust_train_idx , cust_valid_idx = next(iter(skf))
    # find the index of train customer
    train_idx = cust_list.iloc[cust_train_idx].CUST_ID.values
    valid_idx = cust_list.iloc[cust_valid_idx].CUST_ID.values
    
    X_train = X.loc[X.CUST_ID.isin(train_idx)]
    X_valid= X.loc[X.CUST_ID.isin(valid_idx)]
    #drop cust id in X
    X_train.drop('CUST_ID', axis = 1, inplace = True)
    X_valid.drop('CUST_ID', axis = 1, inplace = True)
    y_train =  Y.loc[Y.CUST_ID.isin(train_idx), 'SIU_CUST_YN']
    y_valid = Y.loc[Y.CUST_ID.isin(valid_idx), 'SIU_CUST_YN']
      
    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)

    # iteration regularization
    num_round = 200
    early_stopping_rounds = 50

    # To use early stopping, there should be stopping material which is validation set
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    bst = xgb.train(param, dtrain, num_round, 
                    early_stopping_rounds=early_stopping_rounds, 
                    evals = watchlist, verbose_eval = verbose)
    
    if verbose == False:
        print ("best score is : ", bst.best_score)
        print ("f-score(sklearn) valid: ",f_me(bst, dvalid.get_label(), dvalid))
        print ('auc(sklearn) valid:', auc_me(bst, dvalid.get_label(), dvalid))

    if save:
        print('Saving your model SCORE..')
        xgbfir.saveXgbFI(bst, feature_names=(X_train.columns),
                 OutputXlsxFile='./Dump/new_feature_SCORE.xlsx')
        print('Completed!')
        
    if model:
        return bst, dvalid

In [6]:
def binarize(element):
    if element >= 0.5:
        return 1
    else:
        return 0

# feature import

In [83]:
new_features = cPickle.load(open('./Dump/select_logit_feature.pkl', 'rb'))

In [84]:
score_feature = pd.DataFrame(new_features).sort_values(by = 'score', ascending = False)

# Data import

In [85]:
use_feature = score_feature.loc[score_feature.score > 0.65, 'feature']

In [86]:
test = cPickle.load(open('./Dump/test_new_feature.pkl', 'rb'))
train = cPickle.load(open('./Dump/train_new_feature.pkl', 'rb'))


# 1. 배깅

# Data split

나중에 bagging 했을때 테스트를위해 미리 10%를 떼놓는다

In [87]:
Y = train[['SIU_CUST_YN', 'CUST_ID']]
select_f_feature = {'feature' : [], 'score' : [] }

 #to do k-fold in cust_id (~22400)
cust_list = train[['CUST_ID', 'SIU_CUST_YN']]
cust_list = cust_list.groupby('CUST_ID')['SIU_CUST_YN'].sum().to_frame().reset_index()
cust_list.loc[cust_re.SIU_CUST_YN > 0, 'SIU_CUST_YN'] = 1

input_Y = cust_re.SIU_CUST_YN
skf = StratifiedKFold(input_Y, n_folds = 10, shuffle = True)
xgb_idx , bagging_idx  = next(iter(skf))

xgb_cust_list = cust_list.iloc[xgb_idx]

In [110]:
# tau 높아지면 확률이 다비슷해지고,  (std 가 작아짐) 
iter_num = 0
tau = 0.31
model_num = 0
num_feature = 30

Y = train[['SIU_CUST_YN', 'CUST_ID']]
select_f_feature = {'feature' : [], 'score' : [] }
save  =  True

while iter_num<25: 
    # select customer 사기꾼 숫자랑 아닌숫자가 같게 (아닌사람이 16000명인데 그중에 1600명만고른다고생각하면 됨)
    num_SIU_1 = len(xgb_cust_list[xgb_cust_list.SIU_CUST_YN == 1])
    num_SIU_0 = len(xgb_cust_list[xgb_cust_list.SIU_CUST_YN == 0])
    idx_rnd_SIU_0 = np.round(np.random.uniform(0, num_SIU_0-1, num_SIU_1))
    xgb_cust_re_0 = xgb_cust_list[xgb_cust_list.SIU_CUST_YN == 0].iloc[idx_rnd_SIU_0]
    xgb_cust_re_1 = xgb_cust_list.loc[xgb_cust_list.SIU_CUST_YN == 1, ]


    new_xgb_cust_list = xgb_cust_re_0.append(xgb_cust_re_1)
    input_Y = xgb_cust_list.SIU_CUST_YN

    skf = StratifiedKFold(input_Y, n_folds = 4, shuffle = True)
    select_idx , _ = next(iter(skf))

    select_cust_list = xgb_cust_list.iloc[select_idx]
    
    score_feature_select = score_feature.loc[score_feature.score > 0.65, ]

    prob_select_feature = list(score_feature_select.feature.values)


    for obj in ['logitraw', 'logistic']:
        print (obj)
       
        cust = True
        param = {} 

        # Model setting
        param['objective'] = 'binary:' + obj
        param['booster'] = 'gbtree'
        param['eval_metric'] = 'auc'

        # weight regularization
        param['Eta'] = 0.1
        param['Lambda'] = 0.5
        param['alpha'] = 0.5

        # partitioning regularization
        param['Gamma'] = 0.1
        param['Max_depth'] = 25
        param['min_child_weight'] = 5

        # sampling regularization
        param['Subsample'] = 1
        param['Colsample_bytree'] = 1

       

        #X = train[['CUST_ID'] + prob_select_feature]
        X = train.loc[train.CUST_ID.isin(cust_list.iloc[xgb_idx].CUST_ID), ['CUST_ID'] + prob_select_feature]
        (bst, dvalid) = run_xgb(X, Y, select_cust_list, param, verbose=False)
        
        if (f_me(bst, dvalid.get_label(), dvalid) > 0.7) & (save == True):
            print ('save', f_me(bst, dvalid.get_label(), dvalid))
            cPickle.dump(bst, open('./model/xgb_model_{}.pkl'.format(model_num), 'wb'), -1)
            print ('model num is {}'. format(model_num))
            model_num += 1
        iter_num += 1
        print (iter_num)
    print ('----------------------------------------------------------------------------------------------------------------')
    
    
    
print ('end')

logitraw
best score is :  0.934071
f-score(sklearn) valid:  0.721021611002
auc(sklearn) valid: 0.926569719657
save 0.721021611002
model num is 0
1
logistic
best score is :  0.923854
f-score(sklearn) valid:  0.721403948605
auc(sklearn) valid: 0.914089209609
save 0.721403948605
model num is 1
2
----------------------------------------------------------------------------------------------------------------
logitraw
best score is :  0.922123
f-score(sklearn) valid:  0.741911211437
auc(sklearn) valid: 0.917468406363
save 0.741911211437
model num is 2
3
logistic
best score is :  0.923077
f-score(sklearn) valid:  0.752398081535
auc(sklearn) valid: 0.921993026888
save 0.752398081535
model num is 3
4
----------------------------------------------------------------------------------------------------------------
logitraw
best score is :  0.914805
f-score(sklearn) valid:  0.694920955264
auc(sklearn) valid: 0.913641551872
5
logistic
best score is :  0.938476
f-score(sklearn) valid:  0.78561549101


In [111]:
models  = []
for model_num in range(18):
    models.append(cPickle.load(open('./model/xgb_model_{}.pkl'.format(model_num), 'rb')))

In [112]:
# select test set in X
X_test = train.loc[train.CUST_ID.isin(cust_list.iloc[bagging_idx].CUST_ID)]

In [113]:
#select feature used in model
Y_pred = []

for each_model in models:
    X_test_temp = X_test[each_model.feature_names]
    result_each_model = each_model.predict(xgb.DMatrix(X_test_temp))
    result_each_model = result_each_model
    y_pred_binary = np.array([binarize(y) for y in result_each_model])
    Y_pred.append(y_pred_binary)
    
Y_pred_total = np.array(Y_pred).T

In [133]:
Y_bagging_result = []
for i in range(len(Y_pred_total)):
    if np.sum(Y_pred_total[i,:]) > 2:
        Y_bagging_result.append(1)
    else:
        Y_bagging_result.append(0)


y_real_intest = Y.loc[Y.CUST_ID.isin(cust_re.iloc[bagging_idx].CUST_ID), 'SIU_CUST_YN']
print ("fscore :", f1_score(y_real_intest, Y_bagging_result))
print ('AUC :',roc_auc_score(y_real_intest, Y_bagging_result))
print ('recall :',recall_score(y_real_intest, Y_bagging_result))
print ('precision :',precision_score(y_real_intest, Y_bagging_result))

 

fscore : 0.769136307818
AUC : 0.861060573756
recall : 0.79753820034
precision : 0.742687747036


In [134]:
#select feature used in model
Y_pred = []

for each_model in models:
    X_test_temp = test[bst.feature_names]
    result_each_model = each_model.predict(xgb.DMatrix(X_test_temp))
    result_each_model = result_each_model
    y_pred_binary = np.array([binarize(y) for y in result_each_model])
    Y_pred.append(y_pred_binary)
    
Y_pred_test = np.array(Y_pred).T

Y_bagging_result = []
for i in range(len(Y_pred_test)):
    if np.sum(Y_pred_test[i,:]) > 2:
        Y_bagging_result.append(1)
    else:
        Y_bagging_result.append(0)

In [136]:
test_CUST_bagging = test[['CUST_ID', 'SIU_CUST_YN']]

In [137]:
test_CUST_bagging['SIU_CUST_YN'] = y_pred_binary

In [148]:
test_CUST_bagging = (test_CUST_bagging.drop_duplicates('CUST_ID'))

In [None]:
test_CUST_bagging.to_csv()

# 2. 단일모형

# Data split

나중에 bagging 했을때 테스트를위해 미리 10%를 떼놓는다

In [11]:
Y = train[['SIU_CUST_YN', 'CUST_ID']]
select_f_feature = {'feature' : [], 'score' : [] }

 #to do k-fold in cust_id (~22400)
cust_re = train[['CUST_ID', 'SIU_CUST_YN']]
cust_re = cust_re.groupby('CUST_ID')['SIU_CUST_YN'].sum().to_frame().reset_index()
cust_re.loc[cust_re.SIU_CUST_YN > 0, 'SIU_CUST_YN'] = 1

'''input_Y = cust_re.SIU_CUST_YN
skf = StratifiedKFold(input_Y, n_folds = 10, shuffle = True)
xgb_idx , bagging_idx  = next(iter(skf))'''

select_cust_re = cust_re

# XGBOOST 단일모델

In [12]:
result_dump = {'param' : [], 'fscore':[], 'auc_score':[]}

In [27]:

allfe = list(train.columns)[3:]
allfe.remove('DIVIDED_SET')
allfe.remove('SIU_CUST_YN')
for i in range(10):
    for obj in ['logistic']:
        print (obj)

        cust = True
        param = {} 

        # Model setting
        param['objective'] = 'binary:' + obj
        param['booster'] = 'gbtree'
        param['eval_metric'] = 'auc'

        # weight regularization
        param['Eta'] = 0.1
        param['Lambda'] = 0.5
        param['alpha'] = 0.5

        # partitioning regularization
        param['Gamma'] = 0.1
        param['Max_depth'] = 20
        param['min_child_weight'] = 7

        # sampling regularization
        param['Subsample'] = 1
        param['Colsample_bytree'] = 0.8



        X = train[['CUST_ID'] + list(use_feature.values)]
        (bst, dvalid) = run_xgb(X, Y, select_cust_re, param, verbose=False)

        print ('----------------------------------------------------------------------------------------------------------------')



logistic
best score is :  0.92713
f-score(sklearn) valid:  0.713965341488
auc(sklearn) valid: 0.918677230518
----------------------------------------------------------------------------------------------------------------
logistic
best score is :  0.90715
f-score(sklearn) valid:  0.684843280378
auc(sklearn) valid: 0.905055600112
----------------------------------------------------------------------------------------------------------------
logistic
best score is :  0.934463
f-score(sklearn) valid:  0.739092111062
auc(sklearn) valid: 0.930874670479
----------------------------------------------------------------------------------------------------------------
logistic
best score is :  0.929959
f-score(sklearn) valid:  0.767970509932
auc(sklearn) valid: 0.926409592007
----------------------------------------------------------------------------------------------------------------
logistic
best score is :  0.917568
f-score(sklearn) valid:  0.714348206474
auc(sklearn) valid: 0.907393069586


In [29]:
train.head(5)

Unnamed: 0,ACCI_DVSN,ACCI_HOSP_ADDR,ACCI_OCCP_GRP1,ACCI_OCCP_GRP2,AGE,BEFO_JOB,BRCH_CODE,CAUS_CODE,CAUS_CODE_DTAL,CHANG_FP_YN,...,CUST_HOSP,CUST_CHME,CHME_NUM,HOSP_NUM,CUST_HOSP_DVSN,ITEM_COUNT,MONEY_COUNT,DMND_ALL,AVG_MONEY,POLY_NUM
9243,3,1,6,0,57.0,8,113.0,537,14,1,...,4,4,4,23,2,1,7,30000000.0,29090518.0,7
9244,3,1,6,0,57.0,8,113.0,537,14,1,...,4,4,4,23,2,1,7,650000.0,29090518.0,7
9245,3,12,6,0,57.0,8,113.0,712,50,1,...,4,4,3,54,2,1,7,4810224.0,29090518.0,7
9246,3,12,6,0,57.0,8,113.0,145,50,1,...,4,4,7,199,2,1,7,7950000.0,29090518.0,7
9247,3,12,6,0,57.0,8,113.0,145,50,1,...,4,4,7,199,2,1,7,2825000.0,29090518.0,7


In [58]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9243 entries, 0 to 9242
Columns: 108 entries, ACCI_DVSN to POLY_NUM
dtypes: float64(31), int64(77)
memory usage: 7.7 MB


In [32]:
#select feature used in model
X_test = test[bst.feature_names]
y_pred_intest = bst.predict(xgb.DMatrix(X_test))

y_pred_binary = np.array([binarize(y) for y in y_pred_intest])
#print ("fscore :", f1_score(y_real_intest, y_pred_binary))
#print ('AUC :',roc_auc_score(y_real_intest, y_pred_intest))

In [56]:
y_pred_binary

array([0, 0, 0, ..., 0, 0, 0])

In [67]:
test_CUST = test[['CUST_ID', 'SIU_CUST_YN']]

In [68]:
test_CUST['SIU_CUST_YN'] = y_pred_binary

In [72]:
test_CUST.drop_duplicates('CUST_ID')

Unnamed: 0,CUST_ID,SIU_CUST_YN
0,12775,0
6,11418,0
8,4710,0
9,5591,0
12,16327,0
13,22271,0
14,21447,0
26,4545,0
29,1112,0
41,908,0


In [224]:
result_dump['param'].append(param)
result_dump['fscore'].append(f_me(bst, dvalid.get_label(), dvalid))
result_dump['auc_score'].append(bst.best_score)

fscore : 0.70579029734
AUC : 0.911514533648


In [123]:
result_dump

{'auc_score': [0.944672],
 'fscore': [0.79636579103298433],
 'param': [{'Colsample_bytree': 0.8,
   'Eta': 0.1,
   'Gamma': 0.1,
   'Lambda': 0.5,
   'Max_depth': 20,
   'Subsample': 0.7,
   'alpha': 0.5,
   'booster': 'gbtree',
   'eval_metric': 'auc',
   'min_child_weight': 7,
   'objective': 'binary:logistic'}]}