In [33]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
import catboost as cat
from catboost import CatBoostClassifier,Pool
from lightgbm import plot_tree
from graphviz import Digraph
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import time
import pickle

%matplotlib inline

# Load Original Features

In [3]:
# Identifier of the cached feature set: it selects which ./data/features<N>.pickle
# file is loaded and is reused in the submission file name.
# NOTE(review): appears to equal the column count of that table (455 features +
# 'isFraud' + 'TransactionID') -- confirm.
feat_num = 457

In [4]:
# Load the cached feature table for the selected feature set.
# (CSV alternative, currently unused: pd.read_csv('./data/features%s.csv'%(feat_num)))
# NOTE(review): pickle.load can execute arbitrary code -- only load caches that
# were produced locally by a trusted pipeline.
features_pickle_path = './data/features%s.pickle'%(feat_num)
with open(features_pickle_path, 'rb') as pickle_file:
    df_total_raw = pickle.load(pickle_file)

In [29]:
#to_drop = ['card1_count','addr2','V117','V118','V120','V122','id_25','id_21','id_26',
#           'V305','id_17','V27','V28','V88','V89','id_14','V107','V240','V241','V41','id_24','V68']
#to_drop = ['C8']
#with open('./data/feat%s_rm_pm_importance0.pickle'%(feat_num), 'rb') as handle:
#    to_drop = pickle.load(handle)

In [5]:
# Optional pruning step, currently disabled: no columns are dropped.
#df_total = df_total_raw.drop(to_drop,axis=1)
# This binds a second name to the same DataFrame object; no copy is made.
df_total = df_total_raw

In [6]:
# Rows with a known 'isFraud' value form the training set; rows where it is
# missing form the test set.
is_labelled = df_total['isFraud'].notnull()
features_train = df_total[is_labelled]
features_test = df_total[~is_labelled]

In [7]:
# Separate the labelled rows into the target vector and the feature matrix.
# Both are derived from `df_total` instead of from `features_train` itself, so
# re-running this cell no longer fails with a KeyError on the already-dropped
# 'isFraud' column. The result is the same as before on a first run.
labelled_rows = df_total[df_total['isFraud'].notnull()]
labels_train = labelled_rows['isFraud']
features_train = labelled_rows.drop(columns = ['isFraud', 'TransactionID'])

In [8]:
# Sanity check: (rows, columns) of the training features after the label and ID
# columns have been dropped.
features_train.shape

(590540, 455)

# Prepare model

In [9]:
# Candidate categorical columns: a hand-picked list followed by the identity
# columns id_12 .. id_38.
categorical = [
    'ProductCD',
    'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2',
    'P_email', 'R_email',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
    'DeviceType', 'DeviceInfo',
    'dow', 'hour',
    'Device_name', 'Device_version',
    'screen_width', 'screen_height',
    'P_email_suffix', 'R_email_suffix',
    'id_30_OS', 'id_30_version',
    'is_card_freq_Device',
    'is_wide', 'is_long', 'is_zero',
    'is_win8_vista', 'is_windows_otheros',
    'is_card_freq_pdc', 'is_card_freq_addr1',
]
ids = ['id_%s'%(id_number) for id_number in range(12, 39)]
categorical.extend(ids)

In [10]:
# Keep only the categorical candidates that actually exist in the feature table.
# Filtering the list (instead of intersecting sets) preserves the declared order,
# so the result is identical on every run rather than depending on string-hash
# randomisation.
available_columns = set(df_total.columns)
categorical = [col for col in categorical if col in available_columns]

In [11]:
# Test-set feature matrix: the same two columns are dropped as for the training
# features, so both frames share one column layout.
features_test_new = features_test.drop(columns = ['isFraud', 'TransactionID'])

In [82]:
# CatBoost training parameters shared by every training run in this notebook.
params = {
    'learning_rate': 0.01,
    'boosting_type': 'Plain',
    'loss_function': 'CrossEntropy',  # 'Logloss' is the commented-out alternative
    'eval_metric': 'AUC',
    'random_state': 47,
    'task_type': 'GPU',
    'grow_policy': 'Lossguide',
}
# Options that are currently disabled, kept for reference:
#   'reg_lambda': 0.6485237330340494
#   'min_data_in_leaf': 100
#   'max_leaves': 1400
#   'is_unbalance': True
#   'scale_pos_weight': 9

# Feature Selection

In [13]:
# Preview the first rows of the training features.
features_train.head()

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_02_to_std_card1,D15_to_mean_card1,D15_to_std_card1,D15_to_mean_addr1,D15_to_std_addr1,TransactionAmt_decimal,Device_name,Device_version,screen_width,screen_height
0,86400,68.5,0,13926,,150.0,0,142.0,0,315.0,...,,0.0,0.0,0.0,0.0,500,-1,-1,-1,-1
1,86401,29.0,0,2755,404.0,150.0,1,102.0,0,325.0,...,,0.0,0.0,0.0,0.0,0,-1,-1,-1,-1
2,86469,59.0,0,4663,490.0,150.0,2,166.0,1,330.0,...,,2.188598,1.578656,1.420777,1.251501,0,-1,-1,-1,-1
3,86499,50.0,0,18132,567.0,150.0,1,117.0,1,476.0,...,,0.471748,0.434842,0.581165,0.474049,0,-1,-1,-1,-1
4,86506,50.0,1,4497,514.0,150.0,1,102.0,0,420.0,...,0.9245,,,,,0,0,0,0,0


In [28]:
# Scikit-learn-style CatBoost estimator configured from the shared `params`.
# NOTE(review): `cbc_1` is not referenced by any later cell shown here -- the
# training runs below call `cat.train` directly. Confirm whether this cell is
# still needed.
cbc_1 = CatBoostClassifier(**params)

In [83]:
# Hold-out experiment: fit on the first 80% of the rows and validate on the
# remaining 20%. Rows are split by position, without shuffling.
start = time.time()

# Derive the boundary from the data instead of hard-coding a row count.
# Integer arithmetic keeps it exact: 4/5 of 590,540 rows is 472,432, the value
# that was previously written out by hand.
split_idx = len(features_train) * 4 // 5

train_dataset = Pool(data=features_train.iloc[:split_idx, :],
                     label=labels_train.values[:split_idx])#,cat_features=categorical

eval_dataset = Pool(data=features_train.iloc[split_idx:, :],
                    label=labels_train.values[split_idx:])#,cat_features=categorical

# Train with early stopping against the hold-out set; progress is logged every
# 500 iterations.
model = cat.train(dtrain = train_dataset,
                  params=params,
                  num_boost_round = 10000,
                  eval_set = eval_dataset,
                  verbose_eval=500,
                  early_stopping_rounds = 500)
# Wall-clock seconds spent building the pools and training.
print(time.time()-start)

0:	learn: 0.7805921	test: 0.7690242	best: 0.7690242 (0)	total: 76.8ms	remaining: 12m 48s
500:	learn: 0.9099354	test: 0.8797977	best: 0.8797977 (500)	total: 23.6s	remaining: 7m 28s
1000:	learn: 0.9291734	test: 0.8955471	best: 0.8955471 (1000)	total: 46.4s	remaining: 6m 56s
1500:	learn: 0.9404129	test: 0.9020657	best: 0.9020657 (1500)	total: 1m 8s	remaining: 6m 30s
2000:	learn: 0.9493622	test: 0.9075140	best: 0.9075140 (2000)	total: 1m 31s	remaining: 6m 4s
2500:	learn: 0.9559695	test: 0.9110211	best: 0.9110211 (2500)	total: 1m 53s	remaining: 5m 40s
3000:	learn: 0.9610814	test: 0.9143843	best: 0.9143843 (3000)	total: 2m 15s	remaining: 5m 15s
3500:	learn: 0.9652727	test: 0.9160654	best: 0.9160906 (3495)	total: 2m 37s	remaining: 4m 52s
4000:	learn: 0.9686460	test: 0.9173258	best: 0.9173595 (3984)	total: 2m 59s	remaining: 4m 29s
4500:	learn: 0.9717069	test: 0.9182107	best: 0.9182107 (4500)	total: 3m 21s	remaining: 4m 6s
5000:	learn: 0.9742119	test: 0.9192885	best: 0.9192987 (4991)	total: 3m 

In [84]:
# Best metric values recorded for the learn and validation sets during training.
print(model.get_best_score())

{'learn': {'CrossEntropy': 0.04049573784560529, 'AUC': 0.9848528504371643}, 'validation': {'CrossEntropy': 0.08383661067889135, 'AUC': 0.9207524955272675}}


# Train Model

In [89]:
# Cross-validation setup.
splits = 5
# Contiguous, unshuffled folds. `random_state` only matters when shuffle=True;
# passing it with the default shuffle=False has no effect and raises a
# ValueError in scikit-learn >= 0.24, so it is omitted. The folds produced are
# exactly the same as before.
folds = KFold(n_splits = splits)
# Accumulators for the fold-averaged test predictions and validation AUC.
predictions = np.zeros(len(features_test_new))
ave_auc = 0

In [None]:
def _positive_class_probability(fitted_model, data):
    """Return the predicted probability of the positive class for each row of `data`.

    `cat.train` returns a plain `CatBoost` model whose `predict` defaults to raw
    scores rather than probabilities, so the probability output is requested
    explicitly.
    """
    proba = np.asarray(fitted_model.predict(data, prediction_type='Probability'))
    # Depending on the CatBoost version the result is either one column per
    # class or a single vector; keep the positive-class values in both cases.
    return proba[:, 1] if proba.ndim == 2 else proba


# Reset the accumulators in this cell so that re-running it does not add a
# second round of fold results on top of the first.
predictions = np.zeros(len(features_test_new))
ave_auc = 0

for fold_num, (trn_idx, val_idx) in enumerate(folds.split(features_train.values,
                                                          labels_train.values)):
    print("Fold {}".format(fold_num))
    train_df, y_train_df = features_train.iloc[trn_idx], labels_train.iloc[trn_idx]
    valid_df, y_valid_df = features_train.iloc[val_idx], labels_train.iloc[val_idx]

    train_dataset = Pool(data=train_df,label=y_train_df)#,cat_features=categorical
    eval_dataset = Pool(data=valid_df,label=y_valid_df)#,cat_features=categorical

    # Train with early stopping against this fold's validation part.
    model = cat.train(dtrain = train_dataset,
                      params=params,
                      num_boost_round = 10000,
                      eval_set = eval_dataset,
                      verbose_eval=500,
                      early_stopping_rounds = 500)

    # Fold AUC on the held-out part; each fold contributes 1/splits to the mean.
    pred = _positive_class_probability(model, valid_df)
    auc_score = roc_auc_score(y_valid_df, pred)
    ave_auc += auc_score / splits
    # Test predictions are averaged over the fold models.
    predictions += _positive_class_probability(model, features_test_new) / splits

Fold 0
0:	learn: 0.7853286	test: 0.7695125	best: 0.7695125 (0)	total: 58.4ms	remaining: 9m 43s
500:	learn: 0.9097171	test: 0.8743044	best: 0.8743044 (500)	total: 23.9s	remaining: 7m 33s
1000:	learn: 0.9280349	test: 0.8925769	best: 0.8925769 (1000)	total: 46.7s	remaining: 6m 59s
1500:	learn: 0.9393933	test: 0.9023674	best: 0.9023842 (1495)	total: 1m 9s	remaining: 6m 35s
2000:	learn: 0.9481987	test: 0.9076049	best: 0.9076049 (2000)	total: 1m 33s	remaining: 6m 12s
2500:	learn: 0.9550296	test: 0.9117606	best: 0.9117606 (2500)	total: 1m 55s	remaining: 5m 45s
3000:	learn: 0.9606536	test: 0.9138484	best: 0.9138484 (3000)	total: 2m 19s	remaining: 5m 24s
3500:	learn: 0.9649511	test: 0.9160973	best: 0.9161022 (3497)	total: 2m 42s	remaining: 5m 2s
4000:	learn: 0.9685147	test: 0.9175644	best: 0.9175644 (4000)	total: 3m 8s	remaining: 4m 42s
4500:	learn: 0.9714721	test: 0.9186276	best: 0.9186365 (4491)	total: 3m 30s	remaining: 4m 16s
5000:	learn: 0.9740893	test: 0.9193466	best: 0.9193490 (4999)	tota

In [91]:
# Mean validation AUC over the cross-validation folds, trained with all
# features (no columns dropped).
ave_auc

0.9269963400097325

In [92]:
# Assemble the submission: one row per test transaction, pairing its ID with
# the fold-averaged prediction.
id_test = features_test['TransactionID']
submission_columns = {'TransactionID': id_test, 'isFraud': predictions}
submission = pd.DataFrame(submission_columns)

# The output file name carries the feature-set identifier.
submission_path = './data/sub_cat_feat%s.csv'%(feat_num)
submission.to_csv(submission_path, index = False)