In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

%matplotlib inline
RSEED = 50

# Load Original Features

In [2]:
df_total = pd.read_csv('./data/features375.csv')

In [3]:
features_train = df_total[df_total['isFraud'].notnull()]
features_test = df_total[df_total['isFraud'].isnull()]

In [4]:
labels_train = features_train['isFraud']
features_train = features_train.drop(columns = ['isFraud', 'TransactionID'])

In [5]:
features_train.shape

(590540, 373)

# Prepare model

In [9]:
categorical = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5','card6',
              'addr1','addr2','P_emaildomain','R_emaildomain','M1','M2','M3',
              'M4','M5','M6','M7','M8','M9','DeviceType','DeviceInfo','dow','hour',
              'card_id']
ids = [ 'id_%s'%(i) for i in range(12,39)]
categorical = categorical + ids

In [12]:
categorical = list(set(categorical).intersection(df_total.columns))

In [13]:
features_test_new = features_test.drop(columns = ['isFraud', 'TransactionID'])

In [14]:
params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47
         }

In [15]:
splits = 5
folds = KFold(n_splits = splits,random_state=RSEED)
predictions = np.zeros(len(features_test_new))

In [16]:
for fold_num, (trn_idx, val_idx) in enumerate(folds.split(features_train.values, 
                                                          labels_train.values)):
    print("Fold {}".format(fold_num))
    train_df, y_train_df = features_train.iloc[trn_idx], labels_train.iloc[trn_idx]
    valid_df, y_valid_df = features_train.iloc[val_idx], labels_train.iloc[val_idx]
    
    trn_data = lgb.Dataset(train_df, label=y_train_df,categorical_feature=categorical)
    val_data = lgb.Dataset(valid_df, label=y_valid_df,categorical_feature=categorical)
    
    clf = lgb.train(params,
                    trn_data,
                    10000,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds=500)

    pred = clf.predict(valid_df)
    print( "  auc = ", roc_auc_score(y_valid_df, pred) )
    predictions += clf.predict(features_test_new) / splits

Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.991779	valid_1's auc: 0.898281
[1000]	training's auc: 0.998998	valid_1's auc: 0.906685
[1500]	training's auc: 0.999918	valid_1's auc: 0.909848
[2000]	training's auc: 0.999996	valid_1's auc: 0.912086
[2500]	training's auc: 1	valid_1's auc: 0.913491
[3000]	training's auc: 1	valid_1's auc: 0.914459
[3500]	training's auc: 1	valid_1's auc: 0.915061
Early stopping, best iteration is:
[3075]	training's auc: 1	valid_1's auc: 0.914526
  auc =  0.9145259300947038
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.992572	valid_1's auc: 0.931049
[1000]	training's auc: 0.999213	valid_1's auc: 0.932813
[1500]	training's auc: 0.999951	valid_1's auc: 0.932984
[2000]	training's auc: 0.999998	valid_1's auc: 0.933081
Early stopping, best iteration is:
[1736]	training's auc: 0.999989	valid_1's auc: 0.933176
  auc =  0.9331782933720464
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.993002	valid_1's auc: 0.928899
[1000]	training's auc: 0.999245	valid_1's auc: 0.929486
Early stopping, best iteration is:
[842]	training's auc: 0.998422	valid_1's auc: 0.929978
  auc =  0.9299777515017369
Fold 3




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.992731	valid_1's auc: 0.945229
[1000]	training's auc: 0.999261	valid_1's auc: 0.948048
[1500]	training's auc: 0.999953	valid_1's auc: 0.948321
[2000]	training's auc: 0.999999	valid_1's auc: 0.948261
Early stopping, best iteration is:
[1707]	training's auc: 0.999988	valid_1's auc: 0.948419
  auc =  0.9484192760346984
Fold 4




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.992715	valid_1's auc: 0.920527
[1000]	training's auc: 0.999218	valid_1's auc: 0.924286
[1500]	training's auc: 0.999949	valid_1's auc: 0.924978
[2000]	training's auc: 0.999998	valid_1's auc: 0.925272
[2500]	training's auc: 1	valid_1's auc: 0.925213
Early stopping, best iteration is:
[2093]	training's auc: 0.999999	valid_1's auc: 0.925326
  auc =  0.9253263245267678


In [17]:
id_test = features_test['TransactionID']
submission = pd.DataFrame({'TransactionID': id_test, 
                            'isFraud': predictions})
submission.to_csv('./data/sub_feat375_modelselect.csv', index = False)