In [64]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
import seaborn as sns
from sklearn.feature_selection import RFE

%matplotlib inline
RSEED = 50

## Load Original Features

In [65]:
df_total = pd.read_csv('./data/baseline_features.csv')

In [66]:
features_train = df_total[df_total['isFraud'].notnull()]
features_test = df_total[df_total['isFraud'].isnull()]

In [67]:
labels_train = features_train['isFraud']
features_train = features_train.drop(columns = ['isFraud', 'TransactionID'])

In [68]:
features_train.shape

(590540, 432)

# Prepare Model

In [79]:
categorical = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5','card6',
              'addr1','addr2','P_emaildomain','R_emaildomain','M1','M2','M3',
              'M4','M5','M6','M7','M8','M9','DeviceType','DeviceInfo']
ids = [ 'id_%s'%(i) for i in range(12,39)]
categorical = categorical + ids

In [80]:
# Create a  lgb training set
train_set = lgb.Dataset(features_train, label = labels_train.values,
                        categorical_feature=categorical)

In [81]:
model = lgb.LGBMClassifier(scale_pos_weight=9)
params = model.get_params()

In [82]:
cv_results = lgb.cv(params, train_set, num_boost_round = 10000, metrics = 'auc',
                        early_stopping_rounds = 100, seed = RSEED, nfold = 5)
print('Cross Validation ROC AUC: {:.5f} with std: {:.5f}.'.format(cv_results['auc-mean'][-1],
                                                                               cv_results['auc-stdv'][-1]))


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Cross Validation ROC AUC: 0.92753 with std: 0.00128.


In [84]:
model = lgb.LGBMClassifier(n_estimators = len(cv_results['auc-mean']), random_state=RSEED,scale_pos_weight=9)
model.fit(features_train, labels_train.values)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=50, reg_alpha=0.0, reg_lambda=0.0,
               scale_pos_weight=9, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

# Feature Selection

In [85]:
fi = pd.DataFrame({'feature': features_train.columns, 
                   'importance': model.feature_importances_})
fi = fi.sort_values('importance', ascending = False)

In [86]:
for importance in range(1,11):
    fi_unimportant = fi[fi['importance'] < importance]
    features_train_new = features_train.drop(np.array(fi_unimportant['feature']), axis=1)
    features_test_new = features_test.drop(np.array(fi_unimportant['feature']), axis=1)
    categorical_new = list(np.setdiff1d(categorical,fi_unimportant['feature'].values))
    train_set = lgb.Dataset(features_train_new, label = labels_train.values,
                        categorical_feature = categorical_new)
    model = lgb.LGBMClassifier(scale_pos_weight=9)
    params = model.get_params()
    cv_results = lgb.cv(params, train_set, num_boost_round = 10000, metrics = 'auc', 
                        early_stopping_rounds = 100, seed = RSEED, nfold = 5)
    print('Feature Importance:{:.1f} Cross Validation ROC AUC: {:.5f} with std: {:.5f}.'.format(importance,cv_results['auc-mean'][-1],
                                                                               cv_results['auc-stdv'][-1]))

    

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:1.0 Cross Validation ROC AUC: 0.92810 with std: 0.00075.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:2.0 Cross Validation ROC AUC: 0.92900 with std: 0.00143.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:3.0 Cross Validation ROC AUC: 0.92821 with std: 0.00100.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:4.0 Cross Validation ROC AUC: 0.92842 with std: 0.00109.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:5.0 Cross Validation ROC AUC: 0.92827 with std: 0.00117.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:6.0 Cross Validation ROC AUC: 0.92920 with std: 0.00162.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:7.0 Cross Validation ROC AUC: 0.92960 with std: 0.00208.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:8.0 Cross Validation ROC AUC: 0.92928 with std: 0.00126.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:9.0 Cross Validation ROC AUC: 0.92970 with std: 0.00124.


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Feature Importance:10.0 Cross Validation ROC AUC: 0.92995 with std: 0.00197.


In [87]:
features_train_new.shape

(590540, 68)

In [88]:
model = lgb.LGBMClassifier(n_estimators = len(cv_results['auc-mean']), random_state=RSEED,scale_pos_weight=9)
model.fit(features_train_new, labels_train.values)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=50, reg_alpha=0.0, reg_lambda=0.0,
               scale_pos_weight=9, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [89]:
id_test = features_test_new['TransactionID']
# Make predictions on the testing data
features_test_final = features_test_new.drop(columns = ['isFraud', 'TransactionID'])
preds = model.predict_proba(features_test_final)[:, 1]
submission = pd.DataFrame({'TransactionID': id_test, 
                                'isFraud': preds})
submission.to_csv('./data/sub_baseline_fs.csv', index = False)