In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
from lightgbm import plot_tree
from graphviz import Digraph
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import time
import pickle

%matplotlib inline

# Load Original Features

In [192]:
feat_num = 439

In [193]:
#df_total_raw = pd.read_csv('./data/features%s.csv'%(feat_num))
with open('./data/features%s.pickle'%(feat_num), 'rb') as handle:
    df_total_raw = pickle.load(handle)

In [194]:
#to_drop = ['TransactionDT']
#to_drop = ['V'+str(i) for i in range(1,340)]
with open('./data/feat%s_rm_pm_importance100.pickle'%(437), 'rb') as handle:
    to_drop = pickle.load(handle)

In [195]:
df_total = df_total_raw.drop(list(to_drop),axis=1)
#df_total = df_total_raw

In [196]:
features_train = df_total[df_total['isFraud'].notnull()]
features_test = df_total[df_total['isFraud'].isnull()]

In [197]:
labels_train = features_train['isFraud']
features_train = features_train.drop(columns = ['isFraud', 'TransactionID'])

In [198]:
features_train.shape

(590540, 337)

# Prepare model

In [199]:
categorical = ['ProductCD', 'card2', 'card3', 'card4', 'card5','card6',
              'addr1','addr2','P_email','R_email','M1','M2','M3',
              'M4','M5','M6','M7','M8','M9','DeviceType','DeviceInfo','dow','hour',
              'Device_name','Device_version','screen_width','screen_height',
               'P_email_suffix','R_email_suffix','id_30_OS','id_30_version',
              'is_card_freq_Device','is_wide','is_long','is_zero','is_win8_vista',
              'is_windows_otheros','is_card_freq_pdc','is_card_freq_addr1'] 
ids = [ 'id_%s'%(i) for i in range(12,39)]
categorical = categorical + ids

In [200]:
categorical = list(set(categorical).intersection(df_total.columns))

In [201]:
features_test_new = features_test.drop(columns = ['isFraud', 'TransactionID'])

In [202]:
params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type":"gbdt",#'goss'
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47
          #'num_threads':10
          #'device' :'gpu',
          #'is_unbalance':True
          #'scale_pos_weight':9
         }

# Feature Selection

In [203]:
features_train.head()

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_38,DeviceType,DeviceInfo,P_email,R_email,screen_width,pdc_amt_ratio,card1_fq_enc,addr1_fq_enc,card_mv_day_fq
0,86400,68.5,0,13926,,150.0,0,142.0,0,315.0,...,-1,-1,-1,-1,-1,-1,0.445929,56,43035.0,1
1,86401,29.0,0,2755,404.0,150.0,1,102.0,0,325.0,...,-1,-1,-1,0,-1,-1,0.188788,1338,76902.0,1
2,86469,59.0,0,4663,490.0,150.0,2,166.0,1,330.0,...,-1,-1,-1,1,-1,-1,0.384085,1794,48387.0,1
3,86499,50.0,0,18132,567.0,150.0,1,117.0,1,476.0,...,-1,-1,-1,2,-1,-1,0.325496,7635,17455.0,11
4,86506,50.0,1,4497,514.0,150.0,1,102.0,0,420.0,...,0,0,0,0,-1,0,0.686003,30,7107.0,1


In [None]:
start = time.time()
train_set = lgb.Dataset(features_train.iloc[0:472432,:], label=labels_train.values[0:472432],categorical_feature=categorical)#
valid_set = lgb.Dataset(features_train.iloc[472432:,:], label=labels_train.values[472432:],categorical_feature=categorical)#
valid_results = {}
model = lgb.train(params,train_set,num_boost_round = 10000, 
                  valid_sets = [train_set, valid_set],
                  verbose_eval=500,
                  early_stopping_rounds = 500,
                  evals_result=valid_results)
print(time.time()-start)

In [None]:
max(valid_results['valid_1']['auc'])

In [None]:
lgb.plot_importance(model, max_num_features=128,figsize=(20,15))
#lgb.plot_split_value_histogram(model, feature='dist1', bins='auto')
lgb.plot_metric(valid_results, metric='auc')

In [None]:
lgb.plot_tree(model,tree_index=0, figsize=(107,105))
graph = lgb.create_tree_digraph(model, tree_index=0, name='Tree0')
graph.render(view=True)

# Train Model

In [204]:
def fold_train_model(splits_num,features_train,labels_train,categorical):
    splits = splits_num
    folds = KFold(n_splits = splits,random_state=50)
    predictions = np.zeros(len(features_test_new))
    ave_auc = 0
    
    for fold_num, (trn_idx, val_idx) in enumerate(folds.split(features_train.values, 
                                                          labels_train.values)):
        print("Fold {}".format(fold_num))
        train_df, y_train_df = features_train.iloc[trn_idx], labels_train.iloc[trn_idx]
        valid_df, y_valid_df = features_train.iloc[val_idx], labels_train.iloc[val_idx]

        trn_data = lgb.Dataset(train_df, label=y_train_df,categorical_feature=categorical)
        val_data = lgb.Dataset(valid_df, label=y_valid_df,categorical_feature=categorical)

        valid_results = {}
        clf = lgb.train(params,
                        trn_data,
                        10000,
                        valid_sets = [trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=500,
                       evals_result=valid_results)

        pred = clf.predict(valid_df)
        auc_score = roc_auc_score(y_valid_df, pred)
        ave_auc += auc_score / splits
        predictions += clf.predict(features_test_new) / splits
    return ave_auc,predictions

In [205]:
ave_auc,predictions = fold_train_model(6,features_train,labels_train,categorical)

Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.986141	valid_1's auc: 0.924159
[1000]	training's auc: 0.997293	valid_1's auc: 0.932639
[1500]	training's auc: 0.999521	valid_1's auc: 0.932171
Early stopping, best iteration is:
[1116]	training's auc: 0.998151	valid_1's auc: 0.932919
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.986186	valid_1's auc: 0.930358
[1000]	training's auc: 0.997482	valid_1's auc: 0.941038
[1500]	training's auc: 0.999583	valid_1's auc: 0.943234
[2000]	training's auc: 0.999943	valid_1's auc: 0.943212
Early stopping, best iteration is:
[1893]	training's auc: 0.99991	valid_1's auc: 0.943364
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.986629	valid_1's auc: 0.939757
[1000]	training's auc: 0.997501	valid_1's auc: 0.947589
[1500]	training's auc: 0.999582	valid_1's auc: 0.949372
[2000]	training's auc: 0.999943	valid_1's auc: 0.949368
Early stopping, best iteration is:
[1803]	training's auc: 0.999869	valid_1's auc: 0.949588
Fold 3




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.987502	valid_1's auc: 0.933784
[1000]	training's auc: 0.997859	valid_1's auc: 0.938437
Early stopping, best iteration is:
[993]	training's auc: 0.997805	valid_1's auc: 0.938466
Fold 4




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.986344	valid_1's auc: 0.941311
[1000]	training's auc: 0.997474	valid_1's auc: 0.950318
[1500]	training's auc: 0.999605	valid_1's auc: 0.950981
[2000]	training's auc: 0.999951	valid_1's auc: 0.95112
Early stopping, best iteration is:
[1699]	training's auc: 0.999822	valid_1's auc: 0.951199
Fold 5




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.9866	valid_1's auc: 0.929926
[1000]	training's auc: 0.997554	valid_1's auc: 0.937361
[1500]	training's auc: 0.999598	valid_1's auc: 0.937954
[2000]	training's auc: 0.999946	valid_1's auc: 0.938232
Early stopping, best iteration is:
[1879]	training's auc: 0.999909	valid_1's auc: 0.938285


In [206]:
#feat439 6 fold
ave_auc

0.9423037793396379

In [172]:
splits = 5
folds = KFold(n_splits = splits,random_state=50)
predictions = np.zeros(len(features_test_new))
ave_auc = 0

In [None]:
for fold_num, (trn_idx, val_idx) in enumerate(folds.split(features_train.values, 
                                                          labels_train.values)):
    print("Fold {}".format(fold_num))
    train_df, y_train_df = features_train.iloc[trn_idx], labels_train.iloc[trn_idx]
    valid_df, y_valid_df = features_train.iloc[val_idx], labels_train.iloc[val_idx]
    
    trn_data = lgb.Dataset(train_df, label=y_train_df,categorical_feature=categorical)
    val_data = lgb.Dataset(valid_df, label=y_valid_df,categorical_feature=categorical)
    
    valid_results = {}
    clf = lgb.train(params,
                    trn_data,
                    10000,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds=500,
                   evals_result=valid_results)
    
    pred = clf.predict(valid_df)
    auc_score = roc_auc_score(y_valid_df, pred)
    ave_auc += auc_score / splits
    predictions += clf.predict(features_test_new) / splits

In [174]:
# feat440 add id_31_fq_enc
ave_auc

0.9381709259349416

In [159]:
# feat440 add id_30_fq_enc
ave_auc

0.9380367451399003

In [128]:
# feat439 change device info
ave_auc

0.9380337132311257

In [112]:
# feat439 add card_mv_day_fq
ave_auc

0.9382558553226678

In [97]:
# feat441 drop card4_fraud_rate and pemail fraud rate
ave_auc

0.9378070966039451

In [73]:
# feat441 drop feat437_rm_pm_importance100 drop hour amt and card4_fraud_rate
ave_auc

0.9375985902362556

In [61]:
# feat441 drop feat437_rm_pm_importance100 drop hour amt
ave_auc

0.9375681374967423

In [46]:
# feat441 drop feat437_rm_pm_importance100 
ave_auc

0.9375681381106405

In [17]:
# feat438 add addr1 cnt drop feat437_rm_pm_importance100
ave_auc

0.9378057475398072

In [34]:
# feat437 add card1 cnt drop feat437_rm_pm_importance100
ave_auc

0.9372995256863785

In [17]:
# feat437 add card1 cnt drop transactionDT
ave_auc

0.9347757078114228

In [68]:
# feat437 add card1 cnt
ave_auc

0.9371691266159269

In [33]:
# feat436 add pdc amt ratio
ave_auc

0.9360835858343227

In [31]:
# feat457 drop Vfeature
ave_auc

0.9320555176749314

In [16]:
# feat 457 drop c8
ave_auc

0.9354828335082059

In [19]:
# feat 457 all features
ave_auc

0.9352800271384787

In [207]:
id_test = features_test['TransactionID']
submission = pd.DataFrame({'TransactionID': id_test, 
                            'isFraud': predictions})
submission.to_csv('./data/sub_feat%s_drop100.csv'%(feat_num), index = False)


# Recursive Test

In [191]:
result

[['card5_Amt_ratio', 0.9378145865335745],
 ['DT_day_Amt_ratio', 0.9378640126104663],
 ['card4_Amt_ratio', 0.937709725015683],
 ['card2_Amt_ratio', 0.937458223863788],
 ['DT_month_Amt_ratio', 0.9377889434647904],
 ['card6_Amt_ratio', 0.937863676290689],
 ['DT_hour_Amt_ratio', 0.9377024436406989],
 ['card3_Amt_ratio', 0.9376937326146639],
 ['DT_year_Amt_ratio', 0.9379433974493159]]

In [None]:
to_drop = {'DT_hour_Amt_ratio','DT_day_Amt_ratio','DT_month_Amt_ratio',
           'DT_year_Amt_ratio','card2_Amt_ratio',
          'card3_Amt_ratio','card4_Amt_ratio','card5_Amt_ratio','card6_Amt_ratio'}
result = []
for col in to_drop:
    print(col)
    to_drop_temp = list(to_drop - set([col]))
    features_train_temp = features_train.drop(to_drop_temp,axis=1)
    print(features_train_temp.shape)
    categorical_temp = list(set(categorical).intersection(features_train_temp.columns))
    ave_auc,predictions = fold_train_model(5,features_train_temp,labels_train,categorical_temp)
    print(ave_auc)
    result.append([col,ave_auc])

card5_Amt_ratio
(590540, 339)
Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.986856	valid_1's auc: 0.909957
[1000]	training's auc: 0.997807	valid_1's auc: 0.920558
[1500]	training's auc: 0.9997	valid_1's auc: 0.923135
[2000]	training's auc: 0.999969	valid_1's auc: 0.923818
[2500]	training's auc: 0.999998	valid_1's auc: 0.92425
[3000]	training's auc: 1	valid_1's auc: 0.924747
[3500]	training's auc: 1	valid_1's auc: 0.925087
[4000]	training's auc: 1	valid_1's auc: 0.92529
[4500]	training's auc: 1	valid_1's auc: 0.925318
Early stopping, best iteration is:
[4004]	training's auc: 1	valid_1's auc: 0.925293
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.987661	valid_1's auc: 0.931853
[1000]	training's auc: 0.998186	valid_1's auc: 0.939339
[1500]	training's auc: 0.99978	valid_1's auc: 0.940069
Early stopping, best iteration is:
[1333]	training's auc: 0.999552	valid_1's auc: 0.940206
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.988186	valid_1's auc: 0.931187
[1000]	training's auc: 0.998242	valid_1's auc: 0.937001
[1500]	training's auc: 0.999785	valid_1's auc: 0.93667
Early stopping, best iteration is:
[1014]	training's auc: 0.998343	valid_1's auc: 0.937035
Fold 3




Training until validation scores don't improve for 500 rounds.
