In [17]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

%matplotlib inline
RSEED = 50

# Load Original Features

In [2]:
df_total = pd.read_csv('./data/features439.csv')

In [3]:
features_train = df_total[df_total['isFraud'].notnull()]
features_test = df_total[df_total['isFraud'].isnull()]

In [4]:
labels_train = features_train['isFraud']
features_train = features_train.drop(columns = ['isFraud', 'TransactionID'])

In [5]:
features_train.shape

(590540, 437)

# Prepare model

In [6]:
categorical = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5','card6',
              'addr1','addr2','P_emaildomain','R_emaildomain','M1','M2','M3',
              'M4','M5','M6','M7','M8','M9','DeviceType','DeviceInfo','dow','hour',
              'card_id']
ids = [ 'id_%s'%(i) for i in range(12,39)]
categorical = categorical + ids

In [12]:
features_test_new = features_test.drop(columns = ['isFraud', 'TransactionID'])

In [7]:
params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47
         }

In [19]:
splits = 5
folds = KFold(n_splits = splits,random_state=RSEED)
predictions = np.zeros(len(features_test_new))

In [20]:
for fold_num, (trn_idx, val_idx) in enumerate(folds.split(features_train.values, 
                                                          labels_train.values)):
    print("Fold {}".format(fold_num))
    train_df, y_train_df = features_train.iloc[trn_idx], labels_train.iloc[trn_idx]
    valid_df, y_valid_df = features_train.iloc[val_idx], labels_train.iloc[val_idx]
    
    trn_data = lgb.Dataset(train_df, label=y_train_df,categorical_feature=categorical)
    val_data = lgb.Dataset(valid_df, label=y_valid_df,categorical_feature=categorical)
    
    clf = lgb.train(params,
                    trn_data,
                    10000,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds=500)

    pred = clf.predict(valid_df)
    print( "  auc = ", roc_auc_score(y_valid_df, pred) )
    predictions += clf.predict(features_test_new) / splits

Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.991308	valid_1's auc: 0.896665
[1000]	training's auc: 0.998816	valid_1's auc: 0.906854
[1500]	training's auc: 0.999885	valid_1's auc: 0.910345
[2000]	training's auc: 0.999994	valid_1's auc: 0.912765
[2500]	training's auc: 1	valid_1's auc: 0.914385
[3000]	training's auc: 1	valid_1's auc: 0.915126
[3500]	training's auc: 1	valid_1's auc: 0.915566
Early stopping, best iteration is:
[3127]	training's auc: 1	valid_1's auc: 0.915224
  auc =  0.9152240673668274
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.991991	valid_1's auc: 0.930665
[1000]	training's auc: 0.999061	valid_1's auc: 0.93311
[1500]	training's auc: 0.999928	valid_1's auc: 0.933632
[2000]	training's auc: 0.999997	valid_1's auc: 0.93399
Early stopping, best iteration is:
[1895]	training's auc: 0.999993	valid_1's auc: 0.934049
  auc =  0.934048981445561
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.992364	valid_1's auc: 0.92869
[1000]	training's auc: 0.999097	valid_1's auc: 0.930111
Early stopping, best iteration is:
[846]	training's auc: 0.998222	valid_1's auc: 0.93029
  auc =  0.9302895029193583
Fold 3




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.992017	valid_1's auc: 0.945351
[1000]	training's auc: 0.999104	valid_1's auc: 0.948546
[1500]	training's auc: 0.999931	valid_1's auc: 0.948963
[2000]	training's auc: 0.999997	valid_1's auc: 0.948898
Early stopping, best iteration is:
[1758]	training's auc: 0.999985	valid_1's auc: 0.949051
  auc =  0.9490514413026645
Fold 4




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.992016	valid_1's auc: 0.920608
[1000]	training's auc: 0.999039	valid_1's auc: 0.924444
[1500]	training's auc: 0.999919	valid_1's auc: 0.925442
[2000]	training's auc: 0.999996	valid_1's auc: 0.925863
[2500]	training's auc: 1	valid_1's auc: 0.92589
Early stopping, best iteration is:
[2148]	training's auc: 0.999999	valid_1's auc: 0.926007
  auc =  0.9260074748052762


In [21]:
id_test = features_test['TransactionID']
submission = pd.DataFrame({'TransactionID': id_test, 
                            'isFraud': predictions})
submission.to_csv('./data/sub_feat439_modelselect.csv', index = False)