In [12]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
import seaborn as sns
from sklearn.feature_selection import RFE

%matplotlib inline
RSEED = 50

## Load Original Features

In [2]:
df_total = pd.read_csv('./data/baseline_features.csv')

In [56]:
df_total.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0.0,86400,68.5,0,13926,,150.0,0,142.0,...,-1,,-1,-1,-1,-1,-1,-1,-1,-1
1,2987001,0.0,86401,29.0,0,2755,404.0,150.0,1,102.0,...,-1,,-1,-1,-1,-1,-1,-1,-1,-1
2,2987002,0.0,86469,59.0,0,4663,490.0,150.0,2,166.0,...,-1,,-1,-1,-1,-1,-1,-1,-1,-1
3,2987003,0.0,86499,50.0,0,18132,567.0,150.0,1,117.0,...,-1,,-1,-1,-1,-1,-1,-1,-1,-1
4,2987004,0.0,86506,50.0,1,4497,514.0,150.0,1,102.0,...,0,32.0,0,0,0,0,0,0,0,0


In [4]:
features_train = df_total[df_total['isFraud'].notnull()]
features_test = df_total[df_total['isFraud'].isnull()]

In [5]:
labels_train = features_train['isFraud']
features_train = features_train.drop(columns = ['isFraud', 'TransactionID'])

In [6]:
features_train.shape

(590540, 432)

# Prepare Model

In [13]:
categorical = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5','card6',
              'addr1','addr2','P_emaildomain','R_emaildomain','M1','M2','M3',
              'M4','M5','M6','M7','M8','M9','DeviceType','DeviceInfo']
ids = [ 'id_%s'%(i) for i in range(12,39)]
categorical = categorical + ids

In [14]:
# Create a  lgb training set
train_set = lgb.Dataset(features_train, label = labels_train.values,
                        categorical_feature=categorical)

In [16]:
model = lgb.LGBMClassifier()
params = model.get_params()

In [17]:
cv_results = lgb.cv(params, train_set, num_boost_round = 10000, metrics = 'auc', 
                        early_stopping_rounds = 100, seed = RSEED, nfold = 5)
print('Cross Validation ROC AUC: {:.5f} with std: {:.5f}.'.format(cv_results['auc-mean'][-1],
                                                                               cv_results['auc-stdv'][-1]))


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


In [18]:
model = lgb.LGBMClassifier(n_estimators = len(cv_results['auc-mean']), random_state=RSEED)
model.fit(features_train, labels_train.values)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=50, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# Feature Selection

In [48]:
importance = 1

In [20]:
fi = pd.DataFrame({'feature': features_train.columns, 
                   'importance': model.feature_importances_})
fi = fi.sort_values('importance', ascending = False)

In [50]:
for importance in range(1,11):
    fi_unimportant = fi[fi['importance'] < importance]
    features_train_new = features_train.drop(np.array(fi_unimportant['feature']), axis=1)
    features_test_new = features_test.drop(np.array(fi_unimportant['feature']), axis=1)
    categorical_new = list(np.setdiff1d(categorical,fi_unimportant['feature'].values))
    train_set = lgb.Dataset(features_train_new, label = labels_train.values,
                        categorical_feature = categorical_new)
    model = lgb.LGBMClassifier()
    params = model.get_params()
    cv_results = lgb.cv(params, train_set, num_boost_round = 10000, metrics = 'auc', 
                        early_stopping_rounds = 100, seed = RSEED, nfold = 5)
    print('Feature Importance:{:.1f} Cross Validation ROC AUC: {:.5f} with std: {:.5f}.'.format(importance,cv_results['auc-mean'][-1],
                                                                               cv_results['auc-stdv'][-1]))

    

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:1.0 Cross Validation ROC AUC: 0.92789 with std: 0.00144.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:2.0 Cross Validation ROC AUC: 0.92813 with std: 0.00139.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:3.0 Cross Validation ROC AUC: 0.92854 with std: 0.00106.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:4.0 Cross Validation ROC AUC: 0.92832 with std: 0.00173.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:5.0 Cross Validation ROC AUC: 0.92826 with std: 0.00108.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:6.0 Cross Validation ROC AUC: 0.92844 with std: 0.00112.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:7.0 Cross Validation ROC AUC: 0.92887 with std: 0.00077.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:8.0 Cross Validation ROC AUC: 0.92954 with std: 0.00177.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:9.0 Cross Validation ROC AUC: 0.93019 with std: 0.00184.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:10.0 Cross Validation ROC AUC: 0.92981 with std: 0.00210.


In [51]:
features_train_new.shape

(590540, 70)

In [53]:
model = lgb.LGBMClassifier(n_estimators = len(cv_results['auc-mean']), random_state=RSEED)
model.fit(features_train_new, labels_train.values)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=50, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [58]:
features_test_new.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V320,id_01,id_02,id_09,id_17,id_19,id_20,id_30,id_31,DeviceInfo
590540,3663549,,18403224,31.95,0,10409,111.0,150.0,2,226.0,...,0.0,,,,,,,-1,-1,-1
590541,3663550,,18403263,49.0,0,4272,111.0,150.0,2,226.0,...,0.0,,,,,,,-1,-1,-1
590542,3663551,,18403310,171.0,0,4476,574.0,150.0,2,226.0,...,263.0,,,,,,,-1,-1,-1
590543,3663552,,18403310,284.95,0,10989,360.0,150.0,2,166.0,...,0.0,,,,,,,-1,-1,-1
590544,3663553,,18403317,67.95,0,18018,452.0,150.0,1,117.0,...,0.0,,,,,,,-1,-1,-1


In [59]:
id_test = features_test_new['TransactionID']
# Make predictions on the testing data
features_test_final = features_test_new.drop(columns = ['isFraud', 'TransactionID'])
preds = model.predict_proba(features_test_final)[:, 1]
submission = pd.DataFrame({'TransactionID': id_test, 
                                'isFraud': preds})
submission.to_csv('./data/sub_baseline_fs.csv', index = False)