In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
from lightgbm import plot_tree
from graphviz import Digraph
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import time
import pickle

%matplotlib inline

# Load Original Features

In [36]:
# Feature-set version tag; interpolated into the input pickle path and the
# submission file names further down.
feat_num = 450

In [37]:
# Load the feature table for the selected feature-set version.
# NOTE: pickle.load can execute arbitrary code while deserializing; only open
# pickles that were produced by a trusted source.
# CSV alternative, kept for reference:
#df_total_raw = pd.read_csv('./data/features%s.csv'%(feat_num))
with open('./data/features%s.pickle'%(feat_num), 'rb') as handle:
    df_total_raw = pickle.load(handle)

In [38]:
# Load the collection of column names to remove from the raw feature table.
# NOTE(review): the drop list is pinned to version 437 rather than `feat_num`;
# presumably intentional (list computed once on feat437) -- confirm.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
# Earlier alternatives, kept for reference:
#to_drop = ['TransactionDT']
#to_drop = ['V'+str(i) for i in range(1,340)]
with open('./data/feat%s_rm_pm_importance100.pickle'%(437), 'rb') as handle:
    to_drop = pickle.load(handle)

In [39]:
# Working table = raw table minus the columns named in `to_drop`.
# `df_total_raw` itself is not modified, so this cell can be re-run safely.
# (To keep every column instead: df_total = df_total_raw)
df_total = df_total_raw.drop(columns=list(to_drop))

In [40]:
# Rows with a known `isFraud` value form the training set; rows where the
# label is missing form the test set to be scored.
features_train = df_total.loc[df_total['isFraud'].notna()]
features_test = df_total.loc[df_total['isFraud'].isna()]

In [41]:
# Separate the target from the predictors and remove the row identifier.
# NOTE: this cell is not idempotent -- after it has run once `isFraud` is no
# longer a column of `features_train`, so re-running it raises a KeyError
# unless the split cell above is re-run first.
labels_train = features_train.loc[:, 'isFraud']
features_train = features_train.drop(['isFraud', 'TransactionID'], axis=1)

In [42]:
# Sanity check: (rows, columns) of the training feature matrix.
features_train.shape

(590540, 348)

# Prepare model

In [43]:
# Candidate categorical columns: the numbered identity columns id_12 .. id_38
# are generated, everything else is listed by hand. Names that do not exist in
# the feature table are filtered out in the next cell.
ids = ['id_%s' % (i) for i in range(12, 39)]
categorical = [
    # product / card / address / e-mail
    'ProductCD', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2', 'P_email', 'R_email',
    # match flags
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
    # device and time-of-event columns
    'DeviceType', 'DeviceInfo', 'dow', 'hour',
    'Device_name', 'Device_version', 'screen_width', 'screen_height',
    'P_email_suffix', 'R_email_suffix', 'id_30_OS', 'id_30_version',
    # engineered indicator columns
    'is_card_freq_Device', 'is_wide', 'is_long', 'is_zero', 'is_win8_vista',
    'is_windows_otheros', 'is_card_freq_pdc', 'is_card_freq_addr1',
] + ids

In [44]:
# Keep only the categorical names that are actually columns of `df_total`.
# An order-preserving filter is used instead of a set round-trip: iterating a
# set of strings yields an order that can change between kernel sessions
# (string hash randomization), which made the resulting list non-reproducible.
# dict.fromkeys also removes duplicates, as the set did.
categorical = [col for col in dict.fromkeys(categorical) if col in df_total.columns]

In [45]:
# Test-set predictors: same two non-feature columns removed as for training.
# `features_test` itself is kept because `TransactionID` is read from it when
# the submission file is built.
features_test_new = features_test.drop(columns = ['isFraud', 'TransactionID'])

In [46]:
# LightGBM training parameters, shared by the hold-out run and by
# fold_train_model (which reads this dict as a global).
# NOTE(review): `bagging_fraction` is set but `bagging_freq` is not. Per the
# LightGBM parameter docs bagging is only performed when bagging_freq > 0, so
# `bagging_fraction` / `bagging_seed` presumably have no effect as written.
# Left unchanged because enabling bagging would alter the recorded scores.
params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,  # -1 = no depth limit
          'learning_rate': 0.006883242363721497,
          "boosting_type":"gbdt",#'goss'
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47
          # Options tried earlier and currently disabled:
          #'num_threads':10
          #'device' :'gpu',
          #'is_unbalance':True
          #'scale_pos_weight':9
         }

# Feature Selection

In [47]:
# Peek at the last rows of the training features.
features_train.tail()

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,pdc_hour_Amt_mean,pdc_hour_Amt_ratio,pdc_month_Amt_mean,pdc_month_Amt_ratio,card_id_30_fq,card_id_31_fq,pdc_addr_Amt_ratio,addr_Amt_ratio,dev_Amt_ratio,pemail_Amt_ratio
590535,15811047,49.0,0,6550,,150.0,2,226.0,1,272.0,...,210.348027,0.232947,153.994609,0.318193,,,0.363135,0.37107,,
590536,15811049,39.5,0,10444,225.0,150.0,1,224.0,1,204.0,...,210.348027,0.187784,153.994609,0.256502,,,0.257036,0.259945,,0.309567
590537,15811079,30.95,0,12037,595.0,150.0,1,224.0,1,231.0,...,210.348027,0.147137,153.994609,0.200981,,,0.216667,0.222367,,0.24256
590538,15811088,117.0,0,7826,481.0,150.0,1,224.0,1,387.0,...,210.348027,0.556221,153.994609,0.759767,,,0.719928,0.726904,,0.650385
590539,15811131,279.95,0,15066,170.0,150.0,1,102.0,0,299.0,...,210.348027,1.33089,153.994609,1.817921,,,1.404302,1.50961,,2.194011


In [None]:
# Single hold-out run: fit on the first 80% of the rows (by position) and
# validate on the remaining 20%. The per-iteration AUC history is collected in
# `valid_results` for the plotting cells below.
start = time.time()

# Previously hard-coded as 472432, i.e. 80% of the 590540 training rows.
# Integer arithmetic gives exactly that value without float rounding and keeps
# the split correct if the number of rows changes.
n_fit_rows = len(features_train) * 4 // 5

train_set = lgb.Dataset(features_train.iloc[:n_fit_rows, :],
                        label=labels_train.values[:n_fit_rows],
                        categorical_feature=categorical)
valid_set = lgb.Dataset(features_train.iloc[n_fit_rows:, :],
                        label=labels_train.values[n_fit_rows:],
                        categorical_feature=categorical)
valid_results = {}
model = lgb.train(params,train_set,num_boost_round = 10000, 
                  valid_sets = [train_set, valid_set],
                  verbose_eval=500,
                  early_stopping_rounds = 500,
                  evals_result=valid_results)
# Wall-clock training time in seconds.
print(time.time()-start)

In [None]:
# Best AUC recorded on the second validation set ('valid_1', the hold-out
# rows) during the hold-out run.
max(valid_results['valid_1']['auc'])

In [None]:
# Feature importance (top 128 features) of the hold-out model, followed by the
# AUC learning curves recorded in `valid_results`.
lgb.plot_importance(model, max_num_features=128,figsize=(20,15))
# Optional: distribution of split thresholds for a single feature.
#lgb.plot_split_value_histogram(model, feature='dist1', bins='auto')
lgb.plot_metric(valid_results, metric='auc')

In [None]:
# Draw the first tree of the hold-out model inline, then build a graphviz
# digraph of the same tree and render it to disk.
lgb.plot_tree(model,tree_index=0, figsize=(107,105))
graph = lgb.create_tree_digraph(model, tree_index=0, name='Tree0')
# view=True also asks graphviz to open the rendered file in a viewer.
graph.render(view=True)

# Train Model

In [48]:
def fold_train_model(splits_num,features_train,labels_train,categorical):
    """Train one LightGBM model per K-fold split and average the results.

    Reads two notebook-level globals: ``params`` (LightGBM parameters) and
    ``features_test_new`` (the unlabeled rows to score).

    Parameters
    ----------
    splits_num : int
        Number of consecutive, unshuffled folds.
    features_train : pandas.DataFrame
        Training predictors. May be a column subset of the full feature set;
        the test rows are scored on the same columns.
    labels_train : pandas.Series
        Targets, positionally aligned with ``features_train``.
    categorical : list of str
        Column names passed to LightGBM as ``categorical_feature``.

    Returns
    -------
    tuple (float, numpy.ndarray)
        Mean validation ROC AUC across the folds, and the test predictions
        averaged across the fold models.
    """
    # Folds are consecutive blocks (shuffle stays at its default, False), so no
    # random_state is passed: a seed has no effect without shuffling, and newer
    # scikit-learn releases raise a ValueError for that combination.
    folds = KFold(n_splits=splits_num)

    # Score the test rows on exactly the columns (and column order) the models
    # are trained on. Without this, calling the function with a reduced
    # `features_train` would predict on a frame with a different feature set.
    if features_test_new.columns.equals(features_train.columns):
        test_df = features_test_new  # already aligned; avoid copying
    else:
        test_df = features_test_new[list(features_train.columns)]

    predictions = np.zeros(len(test_df))
    ave_auc = 0

    for fold_num, (trn_idx, val_idx) in enumerate(folds.split(features_train.values,
                                                              labels_train.values)):
        print("Fold {}".format(fold_num))
        train_df, y_train_df = features_train.iloc[trn_idx], labels_train.iloc[trn_idx]
        valid_df, y_valid_df = features_train.iloc[val_idx], labels_train.iloc[val_idx]

        trn_data = lgb.Dataset(train_df, label=y_train_df,categorical_feature=categorical)
        val_data = lgb.Dataset(valid_df, label=y_valid_df,categorical_feature=categorical)

        valid_results = {}
        clf = lgb.train(params,
                        trn_data,
                        10000,
                        valid_sets = [trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=500,
                        evals_result=valid_results)

        # Out-of-fold AUC for this split; each fold contributes 1/splits_num
        # to both the mean score and the averaged test prediction.
        pred = clf.predict(valid_df)
        auc_score = roc_auc_score(y_valid_df, pred)
        ave_auc += auc_score / splits_num
        predictions += clf.predict(test_df) / splits_num
    return ave_auc,predictions

In [33]:
# 5-fold run on the full selected feature set: `ave_auc` is the mean fold AUC,
# `predictions` the fold-averaged scores for the test rows.
ave_auc,predictions = fold_train_model(5,features_train,labels_train,categorical)

Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.986534	valid_1's auc: 0.91054
[1000]	training's auc: 0.997422	valid_1's auc: 0.921
[1500]	training's auc: 0.999571	valid_1's auc: 0.923571
[2000]	training's auc: 0.999936	valid_1's auc: 0.924334
[2500]	training's auc: 0.999994	valid_1's auc: 0.924744
[3000]	training's auc: 1	valid_1's auc: 0.925414
[3500]	training's auc: 1	valid_1's auc: 0.925751
[4000]	training's auc: 1	valid_1's auc: 0.926001
[4500]	training's auc: 1	valid_1's auc: 0.926118
Early stopping, best iteration is:
[4152]	training's auc: 1	valid_1's auc: 0.926084
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.987091	valid_1's auc: 0.933222
[1000]	training's auc: 0.997844	valid_1's auc: 0.939879
[1500]	training's auc: 0.999686	valid_1's auc: 0.94048
Early stopping, best iteration is:
[1293]	training's auc: 0.999293	valid_1's auc: 0.940705
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.987743	valid_1's auc: 0.932237
[1000]	training's auc: 0.997947	valid_1's auc: 0.937358
[1500]	training's auc: 0.99969	valid_1's auc: 0.937055
Early stopping, best iteration is:
[1077]	training's auc: 0.998453	valid_1's auc: 0.937481
Fold 3




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.987017	valid_1's auc: 0.947599
[1000]	training's auc: 0.99788	valid_1's auc: 0.954633
[1500]	training's auc: 0.999709	valid_1's auc: 0.954929
Early stopping, best iteration is:
[1184]	training's auc: 0.99895	valid_1's auc: 0.955058
Fold 4




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.987207	valid_1's auc: 0.924143
[1000]	training's auc: 0.997826	valid_1's auc: 0.931628
[1500]	training's auc: 0.999676	valid_1's auc: 0.93208
[2000]	training's auc: 0.999964	valid_1's auc: 0.932085
Early stopping, best iteration is:
[1583]	training's auc: 0.999771	valid_1's auc: 0.932207


In [34]:
# Experiment log (mean fold AUC) -- feat440: added card_DeviceType_fq
ave_auc

0.9383068452162177

In [18]:
# Experiment log (mean fold AUC) -- feat439: NaN -> -999 (presumably missing
# values were filled with -999 for this run)
ave_auc

0.9381576015559274

In [25]:
# Experiment log (mean fold AUC) -- feat439: num_leaves = 300
ave_auc

0.9368800258774106

In [16]:
# Experiment log (mean fold AUC) -- feat439: learning_rate = 0.005
ave_auc

0.9378605282673502

In [206]:
# Experiment log (mean fold AUC) -- feat439: 6 folds
ave_auc

0.9423037793396379

In [174]:
# Experiment log (mean fold AUC) -- feat440: added id_31_fq_enc
ave_auc

0.9381709259349416

In [159]:
# Experiment log (mean fold AUC) -- feat440: added id_30_fq_enc
ave_auc

0.9380367451399003

In [128]:
# Experiment log (mean fold AUC) -- feat439: changed the device-info feature
ave_auc

0.9380337132311257

In [112]:
# Experiment log (mean fold AUC) -- feat439: added card_mv_day_fq
ave_auc

0.9382558553226678

In [97]:
# Experiment log (mean fold AUC) -- feat441: dropped card4_fraud_rate and the
# pemail fraud rate
ave_auc

0.9378070966039451

In [73]:
# Experiment log (mean fold AUC) -- feat441: dropped the
# feat437_rm_pm_importance100 list, plus hour amt and card4_fraud_rate
ave_auc

0.9375985902362556

In [61]:
# Experiment log (mean fold AUC) -- feat441: dropped the
# feat437_rm_pm_importance100 list, plus hour amt
ave_auc

0.9375681374967423

In [46]:
# Experiment log (mean fold AUC) -- feat441: dropped the
# feat437_rm_pm_importance100 list
ave_auc

0.9375681381106405

In [17]:
# Experiment log (mean fold AUC) -- feat438: added addr1 count; dropped the
# feat437_rm_pm_importance100 list
ave_auc

0.9378057475398072

In [34]:
# Experiment log (mean fold AUC) -- feat437: added card1 count; dropped the
# feat437_rm_pm_importance100 list
ave_auc

0.9372995256863785

In [17]:
# Experiment log (mean fold AUC) -- feat437: added card1 count; dropped
# TransactionDT
ave_auc

0.9347757078114228

In [68]:
# Experiment log (mean fold AUC) -- feat437: added card1 count
ave_auc

0.9371691266159269

In [33]:
# Experiment log (mean fold AUC) -- feat436: added pdc amt ratio
ave_auc

0.9360835858343227

In [31]:
# Experiment log (mean fold AUC) -- feat457: dropped the V features
ave_auc

0.9320555176749314

In [16]:
# Experiment log (mean fold AUC) -- feat457: dropped c8
ave_auc

0.9354828335082059

In [19]:
# Experiment log (mean fold AUC) -- feat457: all features (baseline)
ave_auc

0.9352800271384787

In [35]:
# Build the submission table: one row per test transaction, carrying its ID and
# the fold-averaged fraud score, then write it out without the index column.
id_test = features_test['TransactionID']
submission = pd.DataFrame({'TransactionID': id_test}).assign(isFraud=predictions)
submission.to_csv('./data/sub_feat%s_drop100_%s.csv'%(feat_num,'DeviceType_fq'), index = False)
# Alternative file name without the experiment suffix:
#submission.to_csv('./data/sub_feat%s_drop100.csv'%(feat_num), index = False)


# Recursive Test

In [50]:
# Preview the ten columns just before the last one; the columns tested in the
# loop below are taken from this group.
features_train.iloc[:,-11:-1].head()

Unnamed: 0,card_DeviceType_fq,pdc_hour_Amt_mean,pdc_hour_Amt_ratio,pdc_month_Amt_mean,pdc_month_Amt_ratio,card_id_30_fq,card_id_31_fq,pdc_addr_Amt_ratio,addr_Amt_ratio,dev_Amt_ratio
0,,248.961084,0.275143,151.395914,0.452456,,,0.499882,0.505295,
1,,248.961084,0.116484,151.395914,0.191551,,,0.175652,0.182692,
2,,248.961084,0.236985,151.395914,0.389707,,,0.393093,0.446788,
3,,248.961084,0.200835,151.395914,0.33026,,,0.35522,0.363424,
4,1.0,75.84,0.659283,69.407541,0.720383,1.0,1.0,0.737484,0.299817,0.737964


In [None]:
# Test the candidate features one at a time: each iteration keeps exactly one
# candidate (removing all the others), runs the 5-fold training, writes a
# submission file tagged with the kept column and records the mean fold AUC.
#
# The candidates live in their own list instead of rebinding `to_drop`:
# `to_drop` is the drop list that `df_total` is built from, and overwriting it
# here would silently change `df_total` if that cell were re-run. A list (rather
# than a set) also makes the experiment order the same on every run.
candidate_cols = ['pdc_hour_Amt_mean','pdc_hour_Amt_ratio',
                  'pdc_month_Amt_mean','pdc_month_Amt_ratio','card_id_30_fq',
                  'card_id_31_fq','pdc_addr_Amt_ratio','addr_Amt_ratio','dev_Amt_ratio']

# Identical for every iteration, so it is read once up front.
id_test = features_test['TransactionID']

result = []
for col in candidate_cols:
    print(col)
    to_drop_temp = [other for other in candidate_cols if other != col]
    features_train_temp = features_train.drop(to_drop_temp,axis=1)
    print(features_train_temp.shape)
    categorical_temp = list(set(categorical).intersection(features_train_temp.columns))
    # NOTE(review): fold_train_model scores the global `features_test_new`;
    # confirm its columns line up with `features_train_temp` for these runs.
    ave_auc,predictions = fold_train_model(5,features_train_temp,labels_train,categorical_temp)
    print(ave_auc)
    submission = pd.DataFrame({'TransactionID': id_test, 
                            'isFraud': predictions})
    submission.to_csv('./data/sub_feat%s_drop100_%s.csv'%(feat_num,col), index = False)
    result.append([col,ave_auc])

pdc_hour_Amt_mean
(590540, 340)
Fold 0




Training until validation scores don't improve for 500 rounds.


In [17]:
# [kept column, mean fold AUC] pairs collected by the loop above.
result

[['card_DeviceInfo_fq', 0.938143843412518],
 ['card_DeviceType_fq', 0.9383068452162177],
 ['card_screen_width_fq', 0.9381135438426036]]

In [191]:
# Results recorded for an earlier candidate set (presumably these columns):
#'DT_hour_Amt_ratio','DT_day_Amt_ratio','DT_month_Amt_ratio',
#           'DT_year_Amt_ratio','card2_Amt_ratio',
#          'card3_Amt_ratio','card4_Amt_ratio','card5_Amt_ratio','card6_Amt_ratio'
result

[['card5_Amt_ratio', 0.9378145865335745],
 ['DT_day_Amt_ratio', 0.9378640126104663],
 ['card4_Amt_ratio', 0.937709725015683],
 ['card2_Amt_ratio', 0.937458223863788],
 ['DT_month_Amt_ratio', 0.9377889434647904],
 ['card6_Amt_ratio', 0.937863676290689],
 ['DT_hour_Amt_ratio', 0.9377024436406989],
 ['card3_Amt_ratio', 0.9376937326146639],
 ['DT_year_Amt_ratio', 0.9379433974493159]]