In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from random import sample 
import pickle

%matplotlib inline
RSEED = 50

# Load Original Features

In [2]:
feat_num = 476
df_total = pd.read_csv('./data/features%s.csv'%(feat_num))

In [3]:
df_train = df_total[df_total['isFraud'].notnull()]
df_train.shape

(590540, 476)

# Sample Train Data

In [4]:
df_train_sample=df_train.sample(n=200000,random_state=RSEED)
df_train_sample.shape

(200000, 476)

# Prepare Data

In [7]:
labels_train = df_train_sample['isFraud']
features_train = df_train_sample.drop(columns = ['isFraud', 'TransactionID'])
features_train.shape

(200000, 474)

In [30]:
categorical_raw = ['ProductCD', 'card2', 'card3', 'card4', 'card5','card6',
              'addr1','addr2','P_email','R_email','M1','M2','M3',
              'M4','M5','M6','M7','M8','M9','DeviceType','DeviceInfo','dow','hour',
              'Device_name','Device_version','screen_width','screen_height',
               'P_email_suffix','R_email_suffix','id_30_OS','id_30_version'
              ] 
#'is_card_freq_Device','is_wide','is_long','is_zero','is_win8_vista',
#              'is_windows_otheros','is_card_freq_pdc','is_card_freq_addr1'
ids = [ 'id_%s'%(i) for i in range(12,39)]
categorical_raw = categorical_raw + ids

In [50]:
#categorical = list(set(categorical_raw).intersection(features_train.columns))
#features_train[categorical].nunique().sort_values(ascending=False)

# Select Features

In [31]:
params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47,
          #'is_unbalance':True
          #'scale_pos_weight':9
         }

In [12]:
def train_selector(params,train_num,features_train,labels_train,categorical,verbose_eval=500):
    train_set = lgb.Dataset(features_train.iloc[0:train_num,:], label=labels_train.values[0:train_num],
                       categorical_feature=categorical)
    valid_set = lgb.Dataset(features_train.iloc[train_num:,:], label=labels_train.values[train_num:],
                       categorical_feature=categorical)
    valid_results = {}
    model = lgb.train(params,train_set,num_boost_round = 10000, 
                   valid_sets = [train_set, valid_set],
                    verbose_eval= verbose_eval,
                    early_stopping_rounds = 500,
                    evals_result=valid_results)
    return model,valid_results

In [13]:
def select_by_importance(model,features_train,importance=0,num_keep=None):
    fi = pd.DataFrame({'feature': features_train.columns, 
                   'importance':model.feature_importance()})
    fi = fi.sort_values('importance', ascending = False)
    if num_keep != None:
        to_drop = fi.iloc[num_keep:,:].feature
    else:
        to_drop = fi[fi.importance <= importance].feature
    return to_drop
    

In [19]:
def fold_train_selector(Nfold,features_train,labels_train,categorical):
    splits = Nfold
    ave_auc = 0
    valid_results = {}
    
    folds = KFold(n_splits = splits,random_state=RSEED)
    
    for fold_num, (trn_idx, val_idx) in enumerate(folds.split(features_train.values, 
                                                          labels_train.values)):
        print("Fold {}".format(fold_num))
        train_df, y_train_df = features_train.iloc[trn_idx], labels_train.iloc[trn_idx]
        valid_df, y_valid_df = features_train.iloc[val_idx], labels_train.iloc[val_idx]
    
        trn_data = lgb.Dataset(train_df, label=y_train_df,categorical_feature=categorical)
        val_data = lgb.Dataset(valid_df, label=y_valid_df,categorical_feature=categorical)
    
        
        clf = lgb.train(params,
                        trn_data,
                        10000,
                        valid_sets = [trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=500,
                       evals_result=valid_results)
        

        pred = clf.predict(valid_df)
        auc_score = roc_auc_score(y_valid_df, pred)
        ave_auc += auc_score / splits
        print( "  auc = ", auc_score )
    return ave_auc

In [24]:
def fold_select_feature(Nfold,features_train,labels_train,categorical,importance=0):
    splits = Nfold
    ave_auc = 0
    valid_results = {}
    
    folds = KFold(n_splits = splits,random_state=RSEED)
    
    for fold_num, (trn_idx, val_idx) in enumerate(folds.split(features_train.values, 
                                                          labels_train.values)):
        print("Fold {}".format(fold_num))
        train_df, y_train_df = features_train.iloc[trn_idx], labels_train.iloc[trn_idx]
        valid_df, y_valid_df = features_train.iloc[val_idx], labels_train.iloc[val_idx]
    
        trn_data = lgb.Dataset(train_df, label=y_train_df,categorical_feature=categorical)
        val_data = lgb.Dataset(valid_df, label=y_valid_df,categorical_feature=categorical)
    
        
        clf = lgb.train(params,
                        trn_data,
                        10000,
                        valid_sets = [trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=500,
                       evals_result=valid_results)
        
        drop_this_round = list(select_by_importance(clf,train_df,importance=importance))
        print(drop_this_round)
        
        if fold_num == 0:
            to_drop = drop_this_round
        else:
            to_drop = list(set(to_drop).intersection(drop_this_round))
        print(to_drop)
        
        pred = clf.predict(valid_df)
        auc_score = roc_auc_score(y_valid_df, pred)
        ave_auc += auc_score / splits
        print( "  auc = ", auc_score )
    
    return ave_auc,to_drop
    
    

## Train with all feature set

In [32]:
categorical = list(set(categorical_raw).intersection(features_train.columns))
ave_auc,to_drop = fold_select_feature(3,features_train,labels_train,categorical,importance=0)

Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.996735	valid_1's auc: 0.936195
[1000]	training's auc: 0.999952	valid_1's auc: 0.945468
[1500]	training's auc: 1	valid_1's auc: 0.947468
[2000]	training's auc: 1	valid_1's auc: 0.948225
Early stopping, best iteration is:
[1613]	training's auc: 1	valid_1's auc: 0.947802
['V241', 'addr2', 'V240', 'V107', 'V120', 'V27', 'V28', 'id_21', 'id_24', 'id_25', 'V89', 'V88', 'V121', 'V122', 'V305', 'V119', 'V68', 'is_win8_vista', 'is_windows_otheros', 'V118', 'id_14', 'V117', 'is_zero']
['V241', 'addr2', 'V240', 'V107', 'V120', 'V27', 'V28', 'id_21', 'id_24', 'id_25', 'V89', 'V88', 'V121', 'V122', 'V305', 'V119', 'V68', 'is_win8_vista', 'is_windows_otheros', 'V118', 'id_14', 'V117', 'is_zero']
  auc =  0.9478019931766029
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.997171	valid_1's auc: 0.932232
[1000]	training's auc: 0.999962	valid_1's auc: 0.941812
[1500]	training's auc: 1	valid_1's auc: 0.943665
[2000]	training's auc: 1	valid_1's auc: 0.944044
Early stopping, best iteration is:
[1741]	training's auc: 1	valid_1's auc: 0.943957
['V121', 'V241', 'V89', 'V122', 'V118', 'V107', 'id_14', 'V68', 'V65', 'id_26', 'id_25', 'V113', 'id_24', 'is_win8_vista', 'is_windows_otheros', 'V305', 'id_21', 'V117', 'V108', 'V28', 'V27', 'V240', 'V119', 'V120', 'is_zero']
['V118', 'V68', 'V27', 'id_25', 'id_21', 'V122', 'is_zero', 'V240', 'V117', 'V305', 'V89', 'V107', 'V121', 'V28', 'V119', 'V241', 'id_24', 'id_14', 'V120', 'is_windows_otheros', 'is_win8_vista']
  auc =  0.9439573765892916
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.996631	valid_1's auc: 0.934045
[1000]	training's auc: 0.999946	valid_1's auc: 0.942766
[1500]	training's auc: 1	valid_1's auc: 0.944855
[2000]	training's auc: 1	valid_1's auc: 0.945709
Early stopping, best iteration is:
[1663]	training's auc: 1	valid_1's auc: 0.945147
['V118', 'V27', 'is_windows_otheros', 'V240', 'V241', 'V305', 'V120', 'V119', 'V117', 'V107', 'id_14', 'V89', 'id_21', 'id_22', 'id_24', 'id_25', 'id_26', 'is_win8_vista', 'is_zero']
['V305', 'V27', 'V119', 'id_21', 'id_25', 'V118', 'V107', 'V89', 'V241', 'id_24', 'is_zero', 'id_14', 'is_win8_vista', 'V240', 'V120', 'is_windows_otheros', 'V117']
  auc =  0.9451471033536736


In [26]:
ave_auc

0.9457859715571262

In [27]:
to_drop

['V305',
 'V119',
 'id_25',
 'id_21',
 'V118',
 'V107',
 'V89',
 'V241',
 'id_24',
 'V117',
 'id_14',
 'is_zero',
 'V240',
 'V120',
 'is_windows_otheros',
 'is_win8_vista']

## Feature selection by Importance

In [15]:
#to_drop = list(select_by_importance(model,features_train,importance=0))

NameError: name 'model' is not defined

In [28]:
features_train_temp = features_train.drop(to_drop,axis=1)
categorical_temp = list(set(categorical_raw).intersection(features_train_temp.columns))
ave_auc = fold_train_selector(3,features_train_temp,labels_train,categorical_temp)

Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.996587	valid_1's auc: 0.936155
[1000]	training's auc: 0.999949	valid_1's auc: 0.945761
[1500]	training's auc: 1	valid_1's auc: 0.947914
[2000]	training's auc: 1	valid_1's auc: 0.948657
Early stopping, best iteration is:
[1621]	training's auc: 1	valid_1's auc: 0.94815
  auc =  0.9481503217222406
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.997138	valid_1's auc: 0.932873
[1000]	training's auc: 0.999963	valid_1's auc: 0.942249
[1500]	training's auc: 1	valid_1's auc: 0.944271
[2000]	training's auc: 1	valid_1's auc: 0.944686
Early stopping, best iteration is:
[1708]	training's auc: 1	valid_1's auc: 0.944508
  auc =  0.9445077717249518
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.996583	valid_1's auc: 0.933441
[1000]	training's auc: 0.999947	valid_1's auc: 0.942028
[1500]	training's auc: 1	valid_1's auc: 0.944093
[2000]	training's auc: 1	valid_1's auc: 0.944725
Early stopping, best iteration is:
[1692]	training's auc: 1	valid_1's auc: 0.944436
  auc =  0.9444363573760468


In [29]:
ave_auc

0.9456981502744131

In [105]:
with open('./data/feat476_rm_importance0.pickle', 'wb') as handle:
    pickle.dump(to_drop, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Recursive Eliminate Features

In [None]:
best = 0.9531365218167643
rm_col = ''
for col in categorical:
    print(col)
    features_train_temp = features_train.drop(col,axis=1)
    categorical_temp = list(set(categorical_raw).intersection(features_train_temp.columns))
    model_temp,valid_results = train_selector(params,train_num,features_train_temp,labels_train,categorical_temp,verbose_eval=2000)
    pfm = max(valid_results['valid_1']['auc'])
    if pfm > best:
        print('remove %s improve auc from %s to %s this feature is endanger'%(col,best,pfm))
        best = pfm
        rm_col = col
        

Device_name




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.953302
Early stopping, best iteration is:
[1729]	training's auc: 1	valid_1's auc: 0.953183
remove Device_name improve auc from 0.9531365218167643 to 0.9534075096476156 this feature is endanger
id_26




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952548
Early stopping, best iteration is:
[1722]	training's auc: 1	valid_1's auc: 0.952363
P_email




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952819
Early stopping, best iteration is:
[1808]	training's auc: 1	valid_1's auc: 0.952699
P_email_suffix




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952628
Early stopping, best iteration is:
[1765]	training's auc: 1	valid_1's auc: 0.952487
M9




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.953003
Early stopping, best iteration is:
[1734]	training's auc: 1	valid_1's auc: 0.952798
card5




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952742
Early stopping, best iteration is:
[1735]	training's auc: 1	valid_1's auc: 0.952533
Device_version




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952814
Early stopping, best iteration is:
[1821]	training's auc: 1	valid_1's auc: 0.952676
id_22




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.953038
Early stopping, best iteration is:
[1741]	training's auc: 1	valid_1's auc: 0.95305
M6




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.9523
Early stopping, best iteration is:
[1766]	training's auc: 1	valid_1's auc: 0.952148
is_zero




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.953066
Early stopping, best iteration is:
[1714]	training's auc: 1	valid_1's auc: 0.95285
id_16




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952969
Early stopping, best iteration is:
[1783]	training's auc: 1	valid_1's auc: 0.952908
id_24




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952548
Early stopping, best iteration is:
[1722]	training's auc: 1	valid_1's auc: 0.952363
id_30_OS




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.95285
Early stopping, best iteration is:
[1704]	training's auc: 1	valid_1's auc: 0.952689
id_23




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952923
Early stopping, best iteration is:
[1737]	training's auc: 1	valid_1's auc: 0.952674
M4




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952626
Early stopping, best iteration is:
[1716]	training's auc: 1	valid_1's auc: 0.952453
dow




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.953519
Early stopping, best iteration is:
[1720]	training's auc: 1	valid_1's auc: 0.953327
remove dow improve auc from 0.9534075096476156 to 0.9535437865440888 this feature is endanger
R_email_suffix




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952952
Early stopping, best iteration is:
[1740]	training's auc: 1	valid_1's auc: 0.952781
addr1




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.949491
Early stopping, best iteration is:
[1799]	training's auc: 1	valid_1's auc: 0.949389
id_37




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952834
Early stopping, best iteration is:
[1756]	training's auc: 1	valid_1's auc: 0.952819
id_36




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.953034
Early stopping, best iteration is:
[1765]	training's auc: 1	valid_1's auc: 0.952757
is_card_freq_pdc




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952795
Early stopping, best iteration is:
[1731]	training's auc: 1	valid_1's auc: 0.952635
M3




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952631
Early stopping, best iteration is:
[1742]	training's auc: 1	valid_1's auc: 0.952338
id_34




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952993
Early stopping, best iteration is:
[1705]	training's auc: 1	valid_1's auc: 0.952857
card2




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.951441
Early stopping, best iteration is:
[1843]	training's auc: 1	valid_1's auc: 0.951285
M5




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952769
Early stopping, best iteration is:
[1757]	training's auc: 1	valid_1's auc: 0.952626
M2




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.952821
Early stopping, best iteration is:
[1826]	training's auc: 1	valid_1's auc: 0.952705
M8




Training until validation scores don't improve for 500 rounds.
[2000]	training's auc: 1	valid_1's auc: 0.953178
Early stopping, best iteration is:
[1741]	training's auc: 1	valid_1's auc: 0.953064
is_windows_otheros




Training until validation scores don't improve for 500 rounds.


In [64]:
rm_col

'ProductCD'