In [3]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from six.moves import cPickle
from sklearn.cross_validation import StratifiedKFold
import graphviz
from sklearn.metrics import f1_score
from sklearn import metrics
% matplotlib inline
plt.style.use('ggplot')

In [42]:
test_all = cPickle.load(open('./Dump/test_all.pkl', 'rb'))
train_all = cPickle.load(open('./Dump/train_all.pkl', 'rb'))

In [43]:
test_all.drop(['SIU_CUST_YN_x', 'SIU_CUST_YN_y'], axis = 1, inplace = True)
test_all.drop(['SELF_CHAM', 'NON_PAY','DCAF_CMPS_XCPA'], axis = 1, inplace = True)

In [44]:
train = test_all.append(train_all)

In [45]:
all_features = list(train.columns)
all_features.remove('CUST_ID')
all_features.remove('DIVIDED_SET')
all_features.remove('SIU_CUST_YN')

In [46]:
def binarize(element):
    if element >= 0.5:
        return 1
    else:
        return 0

In [47]:
def obj(preds, dtrain):
    labels = dtrain.get_label()

    c1 = np.sum(preds + labels)
    c2 = np.dot(labels, preds)
    
    #scaling1 = 10**4
    #scaling2 = 10**20
    
    gradient = -2*(labels / c1 - c2 / c1**2)
    #hessian = -2*gradient/c1
    #print('gradient', gradient)
    #print('hessian', hessian)
    hessian = np.ones(len(preds))
    
    return (100*gradient, hessian)

In [48]:
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    preds = np.array([binarize(y) for  y in preds])
    return 'F-measure', 2 * np.dot(labels, preds) / np.sum(preds + labels)

In [49]:
def  f_me(bst, y_valid, valid):
        y_true = y_valid
        y_pred = bst.predict(valid)
        y_pred_binary = np.array([binarize(y) for y in y_pred])

        return f1_score(y_true, y_pred_binary)

## FP features
- FP_W : FP의 재직기간
- FP_CUST : FP가 몇번 보험금을 지급하였는가
- BRCH_CUST_NUM : BRCH가 몇번 보험금을 지급하였는가

In [50]:
# FP_W
def year_converter(int_val):
    string_val = str(int_val)
    year = string_val[0:4]
    return int(year)

train['FP_W'] = train['FIRE_YM_Y'] - train['ETRS_YM_Y']

# missing values are replace by -999
train.loc[train['FP_W'] > 1000, 'FP_W'] = -999

In [51]:
# FP_CUST
train['FP_CUST'] = train.groupby('CLLT_FP_PRNO')['CUST_ID'].transform('count')

In [52]:
# BRCH_CUST_NUM
train['BRCH_CUST_NUM'] = train.groupby('BRCH_CODE')['CUST_ID'].transform('count')

## HOSP features
- CUST_HOSP : CUST가 몇종류의 병원을 갔는가
- CUST_CHME : CUST가 몇명의 의사로 부터 진단을 받았는가
- CUST_HOSP_DVSN : CUST가 어떤종류들의 병원을 갔는가
- CHME_NUM : CHME가 몇번 보험금을 지급하였는가
- HOSP_NUM : HOSP가 몇번 보험금을 지급하였는가

In [53]:
# CUST_HOSP
def factorizer(input):
    return len(np.unique(input))

CUST_HOSP = train.groupby('CUST_ID')['HOSP_CODE'].apply(factorizer).to_frame().reset_index()
CUST_HOSP.columns = ['CUST_ID', 'CUST_HOSP']
train = pd.merge(train, CUST_HOSP, on='CUST_ID')

In [54]:
# CUST_CHME
CUST_CHME = train.groupby('CUST_ID')['CHME_LICE_NO'].apply(factorizer).to_frame().reset_index()
CUST_CHME.columns = ['CUST_ID', 'CUST_CHME']
train = pd.merge(train, CUST_CHME, on='CUST_ID')

In [55]:
# CHME_NUM
train['CHME_NUM'] = train.groupby('CHME_LICE_NO')['CUST_ID'].transform('count')

In [56]:
# HOSP_NUM
train['HOSP_NUM'] = train.groupby('HOSP_CODE')['CUST_ID'].transform('count')

In [57]:
# CUST_HOSP_DVSN
CUST_HOSP_DVSN = train.groupby('CUST_ID')['HOSP_SPEC_DVSN'].apply(factorizer).to_frame().reset_index()
CUST_HOSP_DVSN.columns = ['CUST_ID', 'CUST_HOSP_DVSN']
train = pd.merge(train, CUST_HOSP_DVSN, on='CUST_ID')

## CUST features
- ITEM_COUNT : CUST가 몇개의 상품에 가입했는가
- MONEY_COUNT : CUST가 몇번 돈을 받았는가
- DMND_ALL : CUST가 여태 얼마 받았는가
- AVG_MONEY : CUST가 POLY에 평균적으로 지불한 금액

In [58]:
# ITEM_COUNT
ITEM_COUNT = train.groupby('CUST_ID')['POLY_NO'].apply(factorizer).to_frame().reset_index()
ITEM_COUNT.columns = ['CUST_ID', 'ITEM_COUNT']
train = pd.merge(train, ITEM_COUNT, on='CUST_ID')

In [59]:
# MONEY_COUNT
train['MONEY_COUNT'] = train.groupby('CUST_ID')['CUST_ID'].transform('count')

In [60]:
# DMND_ALL
train['DMND_ALL'] = train.groupby('CUST_ID')['DMND_AMT'].sum()

In [61]:
# AVG_MONEY
train['AVG_MONEY'] = train['TOTALPREM']/train['ITEM_COUNT']

## POLY features
- POLY_NUM : POLY가 몇번 돈을 지급했는가

In [62]:
# POLY_NUM
train['POLY_NUM'] = train.groupby('POLY_NO')['CUST_ID'].transform('count')

In [63]:
fp_features = ['FP_W', 'FP_CUST', 'BRCH_CUST_NUM']
hosp_features = ['CUST_HOSP', 'CUST_CHME', 'CHME_NUM', 'HOSP_NUM', 'CUST_HOSP_DVSN']
cust_features = ['ITEM_COUNT', 'MONEY_COUNT', 'DMND_ALL', 'AVG_MONEY']
poly_features = ['POLY_NUM']
feature_sets = [fp_features, hosp_features, cust_features, poly_features]
new_features = []
for element in all_features:
    new_features.append(element)
for feature_set in feature_sets:
    for element in feature_set:
        new_features.append(element)

new_features = list(np.unique(np.array(new_features)))

In [72]:
test_add_feature = train[train.SIU_CUST_YN.isnull()]
train_add_feature = train[train.SIU_CUST_YN.isnull() == False]

In [73]:
cPickle.dump(train_add_feature, open('./Dump/train_new_feature.pkl', 'wb'), -1)
cPickle.dump(test_add_feature, open('./Dump/test_new_feature.pkl', 'wb'), -1)

In [76]:
def run_xgb(X, Y, cust_re, param, save=False, verbose=False, model=True):

    cust_Y = cust_re.SIU_CUST_YN
    # select train, among the cust 
    skf = StratifiedKFold(cust_Y, n_folds = 10)
    cust_train_idx , cust_valid_idx = next(iter(skf))
    # find the index of train customer
    train_idx = cust_re.CUST_ID[cust_train_idx].values
    valid_idx = cust_re.CUST_ID[cust_valid_idx].values
    
    X_train = X.loc[X.CUST_ID.isin(train_idx)]
    X_valid= X.loc[X.CUST_ID.isin(valid_idx)]
    #drop cust id in X
    X_train.drop('CUST_ID', axis = 1, inplace = True)
    X_valid.drop('CUST_ID', axis = 1, inplace = True)
    
    y_train =  Y.loc[Y.CUST_ID.isin(train_idx), 'SIU_CUST_YN']
    y_valid = Y.loc[Y.CUST_ID.isin(valid_idx), 'SIU_CUST_YN']

    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)
    
    # iteration regularization
    num_round = 100
    early_stopping_rounds = 20

    # To use early stopping, there should be stopping material which is validation set
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    bst = xgb.train(param, dtrain, num_round, 
                    early_stopping_rounds=early_stopping_rounds, 
                    evals = watchlist,verbose_eval = verbose)
    
    if verbose == False:
        print ("best score is : ", bst.best_score)
        #print ("f-score(sklearn) valid: ",f_me(bst, dvalid.get_label(), dvalid))
        #print ("f-score(sklearn) train: ",f_me(bst, dtrain.get_label(), dtrain))

    if save:
        print('Saving your model SCORE..')
        xgbfir.saveXgbFI(bst, feature_names=(X_train.columns),
                 OutputXlsxFile='./Dump/new_feature_SCORE.xlsx')
        print('Completed!')
        
    if model:
        return bst, dvalid

In [77]:
param = {} 

# Model setting
param['objective'] = 'binary:logistic'
param['booster'] = 'gbtree'
param['eval_metric'] = 'auc'

# weight regularization
param['Eta'] = 0.1
param['Lambda'] = 0.5
param['alpha'] = 0.5

# partitioning regularization
param['Gamma'] = 0.7
param['Max_depth'] = 15
param['min_child_weight'] = 7

# sampling regularization
param['Subsample'] = 0.5
param['Colsample_bytree'] = 1

In [79]:

Y = train_add_feature[['SIU_CUST_YN', 'CUST_ID']]
select_logit_feature = {'feature' : [], 'score' : [] }

# to do k-fold in cust_id (~22400)
cust_re = train[['CUST_ID', 'SIU_CUST_YN']]
cust_re = cust_re.groupby('CUST_ID')['SIU_CUST_YN'].sum().to_frame().reset_index()
cust_re.loc[cust_re.SIU_CUST_YN > 0, 'SIU_CUST_YN'] = 1


for colname in new_features:
    print (colname)
    X = train_add_feature[['CUST_ID', colname]]
    (bst, dvalid) = run_xgb(X, Y, cust_re, param)


    #print (colname ,'(obj f-measure):', obj.best_score)
    #if bst.best_score > 0.5:
    print ('save...')
    select_logit_feature['feature'].append(colname)
    select_logit_feature['score'].append(bst.best_score)
    print ('--------------------------------------------------------------------------------')
print ('Finish!')

ACCI_DVSN


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


best score is :  0.53907
save...
--------------------------------------------------------------------------------
ACCI_HOSP_ADDR
best score is :  0.689266
save...
--------------------------------------------------------------------------------
ACCI_OCCP_GRP1
best score is :  0.568214
save...
--------------------------------------------------------------------------------
ACCI_OCCP_GRP2
best score is :  0.562839
save...
--------------------------------------------------------------------------------
AGE
best score is :  0.580374
save...
--------------------------------------------------------------------------------
AVG_MONEY
best score is :  0.580287
save...
--------------------------------------------------------------------------------
BEFO_JOB
best score is :  0.580821
save...
--------------------------------------------------------------------------------
BRCH_CODE
best score is :  0.725637
save...
--------------------------------------------------------------------------------
BRC

In [80]:
test = pd.DataFrame(select_logit_feature)
test.sort_values(by = 'score', ascending= False)

Unnamed: 0,feature,score
25,CUST_HOSP,0.862516
24,CUST_CHME,0.855370
88,POLY_NUM,0.849886
47,FP_CUST,0.843164
76,MONEY_COUNT,0.838387
26,CUST_HOSP_DVSN,0.835615
51,HOSP_CODE,0.772072
103,VLID_HOSP_OTDA,0.765865
95,RESL_CD1,0.755562
82,PAYM_AMT,0.752250


In [81]:
#cPickle.dump(train, open('./Dump/train_new_feature.pkl', 'wb'), -1)
cPickle.dump(new_features, open('./Dump/bst_features.pkl', 'wb'), -1)
cPickle.dump(select_logit_feature, open('./Dump/select_logit_feature.pkl', 'wb'), -1)
cPickle.dump(bst, open('./Dump/bst_f.pkl', 'wb'), -1)