In [18]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

%matplotlib inline
# Seed constant for the randomised steps of this notebook.
RSEED = 50

# Load Original Features

In [29]:
# Feature-set tag: selects which ./data/features<N>.csv is loaded and is also
# embedded in the name of the submission file written at the end.
feat_num = 448

In [19]:
# Read the feature CSV whose file name is keyed by feat_num.
features_path = './data/features%s.csv' % feat_num
df_total = pd.read_csv(features_path)

In [20]:
# Rows carrying an isFraud label go to features_train; rows where the label
# is missing go to features_test.
has_label = df_total['isFraud'].notna()
features_train = df_total.loc[has_label]
features_test = df_total.loc[~has_label]

In [21]:
# Separate the target from the model inputs for the labelled rows.
# The target is read back from df_total (row-aligned through the shared index)
# and the drop tolerates columns that are already gone, so this cell can be
# re-run without a KeyError even though it rebinds features_train.
labels_train = df_total.loc[features_train.index, 'isFraud']
features_train = features_train.drop(columns=['isFraud', 'TransactionID'],
                                     errors='ignore')

In [22]:
# Sanity check: (rows, columns) of the training inputs after the label and ID
# columns were dropped.
features_train.shape

(590540, 446)

# Prepare model

In [23]:
# Candidate categorical columns, assembled group by group.
# `ids` covers id_12 .. id_38 and is appended last.
ids = ['id_%s' % i for i in range(12, 39)]
categorical = (
    ['ProductCD']
    + ['card%s' % i for i in range(1, 7)]
    + ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain']
    + ['M%s' % i for i in range(1, 10)]
    + ['DeviceType', 'DeviceInfo', 'dow', 'hour', 'card_id']
    + ids
)

In [24]:
# Keep only the categorical candidates that actually exist in the loaded table.
# Filtering the list (rather than round-tripping through a set) keeps the
# declared order, so the result is identical across kernel restarts.
available_columns = set(df_total.columns)
categorical = [col for col in categorical if col in available_columns]

In [25]:
# Model inputs for the unlabeled rows: remove the label and ID columns.
features_test_new = features_test.drop(labels=['isFraud', 'TransactionID'], axis=1)

In [26]:
# LightGBM hyper-parameters passed to lgb.train() for every fold.
# NOTE(review): bagging_fraction is set but bagging_freq is not; per the LightGBM
# parameter docs bagging is only performed when bagging_freq > 0 — confirm intent.
params = dict(
    num_leaves=491,
    min_child_weight=0.03454472573214212,
    feature_fraction=0.3797454081646243,
    bagging_fraction=0.4181193142567742,
    min_data_in_leaf=106,
    objective='binary',
    max_depth=-1,
    learning_rate=0.006883242363721497,
    boosting_type='gbdt',
    bagging_seed=11,
    metric='auc',
    verbosity=-1,
    reg_alpha=0.3899927210061127,
    reg_lambda=0.6485237330340494,
    random_state=47,
)

In [27]:
# Number of cross-validation folds; the training loop also divides each fold's
# test prediction by this value to form the average.
splits = 5
# Contiguous, unshuffled folds (the behaviour the original call actually had).
# random_state is deliberately not passed: it has no effect while shuffle is
# False, and scikit-learn >= 0.24 raises ValueError for that combination.
folds = KFold(n_splits=splits, shuffle=False)
# Accumulator for the fold-averaged predictions on the unlabeled rows.
predictions = np.zeros(len(features_test_new))

In [28]:
# Train one LightGBM model per fold, report its validation AUC and average the
# predictions for the unlabeled rows over all folds.

# Reset the accumulator here so that re-running this cell does not add a second
# round of predictions on top of the previous run.
predictions = np.zeros(len(features_test_new))
fold_aucs = []

# `verbose_eval` / `early_stopping_rounds` were removed from lgb.train() in
# LightGBM 4.0; the equivalent callbacks work on old and new releases.
# The logging callback was named `print_evaluation` before LightGBM 3.3.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# KFold only needs the number of rows, so the frame is passed directly instead
# of materialising `.values` copies of the whole table.
for fold_num, (trn_idx, val_idx) in enumerate(folds.split(features_train)):
    print("Fold {}".format(fold_num))
    train_df, y_train_df = features_train.iloc[trn_idx], labels_train.iloc[trn_idx]
    valid_df, y_valid_df = features_train.iloc[val_idx], labels_train.iloc[val_idx]

    trn_data = lgb.Dataset(train_df, label=y_train_df, categorical_feature=categorical)
    val_data = lgb.Dataset(valid_df, label=y_valid_df, categorical_feature=categorical)

    # Up to 10000 rounds; stop once the validation metric has not improved for
    # 500 rounds, and log the metrics every 500 rounds.
    clf = lgb.train(params,
                    trn_data,
                    num_boost_round=10000,
                    valid_sets=[trn_data, val_data],
                    callbacks=[lgb.early_stopping(500), log_every(500)])

    pred = clf.predict(valid_df)
    fold_auc = roc_auc_score(y_valid_df, pred)
    fold_aucs.append(fold_auc)
    print( "  auc = ", fold_auc )
    # Each fold contributes an equal share to the final test prediction.
    predictions += clf.predict(features_test_new) / splits

print("Mean CV auc = ", np.mean(fold_aucs))

Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.99192	valid_1's auc: 0.899819
[1000]	training's auc: 0.999089	valid_1's auc: 0.907853
[1500]	training's auc: 0.999932	valid_1's auc: 0.911393
[2000]	training's auc: 0.999998	valid_1's auc: 0.913286
[2500]	training's auc: 1	valid_1's auc: 0.914764
[3000]	training's auc: 1	valid_1's auc: 0.915544
Early stopping, best iteration is:
[2998]	training's auc: 1	valid_1's auc: 0.915532
  auc =  0.915533008549281
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.992634	valid_1's auc: 0.9317
[1000]	training's auc: 0.999299	valid_1's auc: 0.933819
[1500]	training's auc: 0.999962	valid_1's auc: 0.933922
Early stopping, best iteration is:
[1354]	training's auc: 0.999905	valid_1's auc: 0.934122
  auc =  0.9341227509734886
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.993026	valid_1's auc: 0.928919
[1000]	training's auc: 0.99932	valid_1's auc: 0.930244
Early stopping, best iteration is:
[802]	training's auc: 0.998202	valid_1's auc: 0.930555
  auc =  0.9305548458782851
Fold 3




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.992697	valid_1's auc: 0.945434
[1000]	training's auc: 0.999334	valid_1's auc: 0.948162
[1500]	training's auc: 0.999962	valid_1's auc: 0.948353
[2000]	training's auc: 0.999999	valid_1's auc: 0.948526
Early stopping, best iteration is:
[1968]	training's auc: 0.999999	valid_1's auc: 0.948546
  auc =  0.9485456445024689
Fold 4




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.992636	valid_1's auc: 0.921321
[1000]	training's auc: 0.999281	valid_1's auc: 0.924909
[1500]	training's auc: 0.999956	valid_1's auc: 0.925657
[2000]	training's auc: 0.999999	valid_1's auc: 0.926051
[2500]	training's auc: 1	valid_1's auc: 0.926212
Early stopping, best iteration is:
[2428]	training's auc: 1	valid_1's auc: 0.926282
  auc =  0.9262817248736983


In [30]:
# Pair each unlabeled row's TransactionID with its averaged prediction and
# write the result as a CSV without the index column.
submission_path = './data/sub_feat%s_modelselect.csv' % feat_num
id_test = features_test['TransactionID']
submission = id_test.to_frame().assign(isFraud=predictions)
submission.to_csv(submission_path, index=False)