In [None]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from random import sample 

%matplotlib inline
# Seed passed as random_state when the training rows are sampled below,
# so the same subset is drawn on every run.
RSEED = 50

# Load Original Features

In [3]:
# The feature file name embeds the number of columns it contains.
feat_num = 469
features_csv = './data/features%s.csv' % feat_num
df_total = pd.read_csv(features_csv)

In [9]:
# Keep only the rows whose isFraud value is present.
has_label = df_total['isFraud'].notnull()
df_train = df_total[has_label]
df_train.shape

(590540, 469)

# Sample Train Data

In [10]:
# Draw a fixed-size random subset of the training rows; seeding with RSEED
# makes the draw repeatable.
df_train_sample = df_train.sample(
    n=100000,
    random_state=RSEED,
)
df_train_sample.shape

(100000, 469)

# Prepare Data

In [11]:
# Split the sample into the target column and the model inputs;
# the target and the TransactionID column are excluded from the inputs.
non_feature_columns = ['isFraud', 'TransactionID']
features_train = df_train_sample.drop(columns=non_feature_columns)
labels_train = df_train_sample['isFraud']
features_train.shape

(100000, 467)

In [13]:
# Candidate categorical columns, later passed to lgb.Dataset(categorical_feature=...).
categorical = ['ProductCD', 'card2', 'card3', 'card4', 'card5','card6',
              'addr1','addr2','P_email','R_email','M1','M2','M3',
              'M4','M5','M6','M7','M8','M9','DeviceType','DeviceInfo','dow','hour',
              'Device_name','Device_version','screen_width','screen_height',
               'P_email_suffix','R_email_suffix','id_30_OS','id_30_version']
# id_12 ... id_38 are appended to the candidate list as well.
ids = [ 'id_%s'%(i) for i in range(12,39)]
categorical = categorical + ids
# Keep only candidates that actually exist in the loaded frame.
# dict.fromkeys() removes duplicates while preserving the declared order, so the
# resulting list is identical on every kernel restart (list(set(...)) is not,
# because string hashing is randomized per process).
available_columns = set(df_total.columns)
categorical = [col for col in dict.fromkeys(categorical) if col in available_columns]

# Select Features

In [14]:
# LightGBM training parameters used by every fit in this notebook.
# NOTE(review): bagging_fraction is set but no bagging_freq is given; the
# LightGBM parameter docs describe bagging as needing a non-zero bagging_freq
# to take effect - confirm whether that was intended.
# Class-imbalance options left disabled: 'is_unbalance': True, 'scale_pos_weight': 9
params = dict(
    num_leaves=491,
    min_child_weight=0.03454472573214212,
    feature_fraction=0.3797454081646243,
    bagging_fraction=0.4181193142567742,
    min_data_in_leaf=106,
    objective='binary',
    max_depth=-1,
    learning_rate=0.006883242363721497,
    boosting_type='gbdt',
    bagging_seed=11,
    metric='auc',
    verbosity=-1,
    reg_alpha=0.3899927210061127,
    reg_lambda=0.6485237330340494,
    random_state=47,
)

In [22]:
def train_selector(train_num,features_train,labels_train,categorical,
                   train_params=None,num_boost_round=10000,
                   early_stopping_rounds=500,verbose_eval=500):
    """Fit a LightGBM model on the first ``train_num`` rows and validate on the rest.

    Parameters
    ----------
    train_num : int
        Number of leading rows used for training; the remaining rows form the
        validation set. Must satisfy ``0 < train_num < len(features_train)``.
    features_train : pandas.DataFrame
        Feature matrix, split positionally with ``.iloc``.
    labels_train : pandas.Series
        Target values, positionally aligned with ``features_train``.
    categorical : list of str
        Column names handed to ``lgb.Dataset`` as ``categorical_feature``.
    train_params : dict, optional
        LightGBM parameters. When omitted, the notebook-level ``params`` dict
        is used, which is what the original implementation always did.
    num_boost_round : int, optional
        Upper bound on boosting iterations (default 10000).
    early_stopping_rounds : int, optional
        Early-stopping patience passed to ``lgb.train`` (default 500).
    verbose_eval : int, optional
        Logging period passed to ``lgb.train`` (default 500).

    Returns
    -------
    (model, valid_results)
        The object returned by ``lgb.train`` and the dict that ``lgb.train``
        fills through its ``evals_result`` argument.

    Raises
    ------
    ValueError
        If features and labels differ in length, or if ``train_num`` would
        leave the training or the validation split empty.
    """
    n_rows = len(features_train)
    if len(labels_train) != n_rows:
        raise ValueError(
            'features_train has %d rows but labels_train has %d'
            % (n_rows, len(labels_train)))
    # Fail early with a clear message instead of an error from inside LightGBM.
    if not 0 < train_num < n_rows:
        raise ValueError(
            'train_num must be in (0, %d) so both splits are non-empty, got %r'
            % (n_rows, train_num))
    if train_params is None:
        # Backward-compatible default: fall back to the notebook-level params.
        train_params = params

    label_values = labels_train.values
    train_set = lgb.Dataset(features_train.iloc[:train_num,:],
                            label=label_values[:train_num],
                            categorical_feature=categorical)
    valid_set = lgb.Dataset(features_train.iloc[train_num:,:],
                            label=label_values[train_num:],
                            categorical_feature=categorical)
    valid_results = {}
    # NOTE(review): newer LightGBM releases moved verbose_eval /
    # early_stopping_rounds / evals_result to callbacks - confirm against the
    # installed version before upgrading.
    model = lgb.train(train_params,train_set,
                      num_boost_round=num_boost_round,
                      valid_sets=[train_set, valid_set],
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stopping_rounds,
                      evals_result=valid_results)
    return model,valid_results

In [23]:
# Define the split point in this cell: it is needed by the call below, and on a
# fresh kernel run top-to-bottom no earlier cell assigns it.
train_num = 80000
model,valid_results = train_selector(train_num,features_train,labels_train,categorical)



Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.99669	valid_1's auc: 0.918101
[1000]	training's auc: 0.999955	valid_1's auc: 0.925954
[1500]	training's auc: 1	valid_1's auc: 0.92709
[2000]	training's auc: 1	valid_1's auc: 0.926962
Early stopping, best iteration is:
[1623]	training's auc: 1	valid_1's auc: 0.927194


In [15]:
# Baseline fit on every feature column: first train_num rows train, the rest validate.
# Uses the train_selector helper instead of repeating its body inline.
train_num = 80000
model, valid_results = train_selector(train_num, features_train, labels_train, categorical)



Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.996677	valid_1's auc: 0.918242
[1000]	training's auc: 0.999952	valid_1's auc: 0.924871
[1500]	training's auc: 1	valid_1's auc: 0.926299
[2000]	training's auc: 1	valid_1's auc: 0.926302
Early stopping, best iteration is:
[1628]	training's auc: 1	valid_1's auc: 0.926277


In [16]:
# Pair each feature column with the importance reported by the fitted model,
# ordered from most to least important.
fi = (
    pd.DataFrame({'feature': features_train.columns,
                  'importance': model.feature_importance()})
    .sort_values(by='importance', ascending=False)
)

In [18]:
# Keep only the features with strictly positive importance.
fi_important = fi[fi.importance>0]
features_train = features_train[fi_important.feature]
# Restrict the categorical list to the surviving columns. Filtering in place of
# list(set(...)) keeps the list free of duplicates and in a stable order, so it
# is identical on every kernel restart.
kept_features = set(fi_important.feature)
categorical = [col for col in dict.fromkeys(categorical) if col in kept_features]

In [19]:
# Refit on the reduced feature set (zero-importance columns removed) with the same split.
# Uses the train_selector helper instead of repeating its body inline.
train_num = 80000
model, valid_results = train_selector(train_num, features_train, labels_train, categorical)



Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.99669	valid_1's auc: 0.918101
[1000]	training's auc: 0.999955	valid_1's auc: 0.925954
[1500]	training's auc: 1	valid_1's auc: 0.92709
[2000]	training's auc: 1	valid_1's auc: 0.926962
Early stopping, best iteration is:
[1623]	training's auc: 1	valid_1's auc: 0.927194


In [24]:
# Show the (rows, columns) of the feature frame after the zero-importance columns were dropped.
features_train.shape

(100000, 444)

In [28]:
# Rows of the importance table whose importance is exactly zero.
zero_importance = fi['importance'] == 0
fi_unimportant = fi[zero_importance]

In [33]:
# Display the names of the features whose importance is exactly zero.
fi_unimportant['feature']

453    card1_count
10           addr2
167           V117
168           V118
169           V119
170           V120
172           V122
414          id_25
410          id_21
415          id_26
355           V305
406          id_17
77             V27
78             V28
138            V88
139            V89
403          id_14
157           V107
290           V240
291           V241
91             V41
413          id_24
118            V68
Name: feature, dtype: object