In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt,gc,os
# import cupy,cudf
from tqdm.auto import tqdm
import itertools
tqdm.pandas()
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

#print('RAPIDS version',cudf.__version__)

In [2]:
# Version tag embedded in the saved model file names (cat_v{VER}_fold{n}_seed{SEED}).
VER = 1

# Random seed passed to both the StratifiedKFold split and every CatBoost model.
SEED = 41

# NaN fill value is disabled: nothing in the visible cells replaces NaNs.
#NAN_VALUE = -127 # will fit in int8

# Number of cross-validation folds; one model is trained and saved per fold.
FOLDS = 5

In [3]:
# Load the pre-engineered training table from a feather file.
# The sample displayed below shows customer_ID as the first column and target as the last.
TRAIN_PATH = '../input/amexfeatureengineering/770_FE_train.feather'
train = pd.read_feather(TRAIN_PATH)

In [4]:
# Show one randomly chosen row as a quick look at the loaded columns.
train.sample()

Unnamed: 0,customer_ID,B_30_nunique,B_38_last,B_38_nunique,D_114_last,D_117_last,D_120_last,D_120_nunique,D_66_last,D_66_nunique,...,D_53_last_mean_diff,B_28_last_mean_diff,S_22_last_mean_diff,B_3_last_mean_diff,D_56_last_mean_diff,D_130_last_mean_diff,S_7_last_mean_diff,total_data_count,total_data_last,target
315229,3449953673388858195,1,7,2,2,2,1,1,0,1,...,-0.002699,0.001243,0.404541,-0.009155,0.0,0.0,0.002335,1030.0,173.0,1


In [5]:
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric(y_true, y_pred):
    """Return the competition score: the mean of a normalized weighted Gini
    and the positive capture rate in the top 4% of weighted rows.

    Rows with a true label of 0 get weight 20, all other rows weight 1.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : array-like of scores; higher means more likely positive.

    Returns
    -------
    float
        0.5 * (gini_by_prediction / gini_by_true_label + top_four_capture).
    """
    # Column 0 holds the true labels, column 1 the predictions.
    stacked = np.array([y_true, y_pred]).T

    def _ranked(col):
        # Rows ordered by `col` descending, together with their per-row weights.
        order = stacked[:, col].argsort()[::-1]
        rows = stacked[order]
        return rows, np.where(rows[:, 0] == 0, 20, 1)

    def _gini(col):
        # Weighted area between the cumulative-positives curve and the
        # cumulative-weight curve, for the ordering induced by `col`.
        rows, w = _ranked(col)
        random_curve = np.cumsum(w / np.sum(w))
        total_pos = np.sum(rows[:, 0] * w)
        found_pos = np.cumsum(rows[:, 0] * w)
        lorentz_curve = found_pos / total_pos
        return np.sum((lorentz_curve - random_curve) * w)

    # Share of all positives found within the first 4% of cumulative weight
    # when rows are ranked by prediction.
    by_pred, w_pred = _ranked(1)
    cutoff = int(0.04 * np.sum(w_pred))
    captured = by_pred[np.cumsum(w_pred) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    # Gini of the model ordering, normalized by the Gini of the ordering
    # obtained from the true labels themselves.
    gini_model = _gini(1)
    gini_ideal = _gini(0)

    return 0.5 * (gini_model / gini_ideal + top_four)

In [6]:
class AmexCatboostMetric(object):
    """Metric object exposing `is_max_optimal`, `evaluate` and
    `get_final_error`, scoring with `amex_metric`.

    NOTE(review): the method trio looks like CatBoost's user-defined metric
    protocol, but no visible cell passes this class to a model - confirm
    whether it is meant to be used as `eval_metric`.
    """

    def is_max_optimal(self):
        """Higher scores are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Score the first approx vector against `target`.

        `weight` is ignored. Returns `(score, 1.0)`.
        """
        score = amex_metric(np.array(target), approxes[0])
        return score, 1.0

    def get_final_error(self, error, weight):
        """Return `error` unchanged; `weight` is ignored."""
        return error

In [7]:
#train = train.to_pandas() # free GPU memory

In [8]:
# NOTE(review): neither TRAIN_SUBSAMPLE nor y_oof is referenced by any other
# visible cell (the training cell collects its out-of-fold predictions in a
# separate list) - confirm they are unused before removing them.
TRAIN_SUBSAMPLE = 1.0
gc.collect()
y_oof = np.zeros(train.shape[0])

In [9]:
# FEATURES
# Every column except the customer identifier and the label is a model input.
# The columns are selected by name rather than by position, so the feature set
# stays correct if the column order of the input file ever changes; Index.drop
# raises KeyError if either non-feature column is missing.
FEATURES = train.columns.drop(['customer_ID', 'target'])
print(f'There are {len(FEATURES)} features!')

There are 770 features!


In [10]:
# Categorical variables are given to CatBoost through their "<name>_last" columns.
cat_features = [
    f"{base}_last"
    for base in ("B_38", "D_114", "D_117", "D_120", "D_63", "D_64", "D_66")
]

params = {'objective': 'CrossEntropy', 'colsample_bylevel': 0.15, 'bootstrap_type': 'Bernoulli', 
 'max_depth': 7, 'l2_leaf_reg': 37, 'random_strength': 0.9, 'subsample': 0.9}

# Training-loop settings.
MAX_ITERATIONS = 15500        # upper bound on boosting rounds per fold
EARLY_STOPPING_ROUNDS = 100   # stop after this many rounds without eval-set improvement
LOG_EVERY = 100               # print a training-log line every N iterations

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# The stratified split only depends on the labels, so a zero placeholder of the
# right length stands in for X instead of copying every feature column.
split_placeholder = np.zeros(len(train))

oof_frames = []  # one (customer_ID, target, oof_pred_cat) frame per fold
for fold, (train_ind, val_ind) in enumerate(skf.split(split_placeholder, train.target)):

    print(f"******* Fold {fold} ******* ")

    # Materialise the feature frame once per fold, slice both parts from it,
    # and release it before training starts.
    fold_features = train[FEATURES]
    tr_x = fold_features.iloc[train_ind].reset_index(drop=True)
    val_x = fold_features.iloc[val_ind].reset_index(drop=True)
    del fold_features

    tr_y = train.target.iloc[train_ind].reset_index(drop=True)
    val_y = train.target.iloc[val_ind].reset_index(drop=True)

    clf = CatBoostClassifier(**params, iterations=MAX_ITERATIONS, random_state=SEED, allow_writing_files=False)
    clf.fit(tr_x, tr_y, eval_set=[(val_x, val_y)], cat_features=cat_features,
            verbose=LOG_EVERY, early_stopping_rounds=EARLY_STOPPING_ROUNDS)

    # Save the fold model, then reload it so the out-of-fold predictions come
    # from the exact artifact that was written to disk.
    model_path = f'cat_v{VER}_fold{fold}_seed{SEED}'
    clf.save_model(model_path)
    clf.load_model(model_path)

    preds = clf.predict_proba(val_x)[:, 1]
    fold_score = amex_metric(val_y.values, preds)
    print('Kaggle Metric =', fold_score, '\n')

    # SAVE OOF
    # val_ind holds row positions, so the rows are selected positionally
    # (.iloc); label-based .loc would only be correct on a default RangeIndex.
    fold_oof = train[['customer_ID', 'target']].iloc[val_ind].copy()
    fold_oof['oof_pred_cat'] = preds
    oof_frames.append(fold_oof)

    del clf, tr_x, val_x, tr_y, val_y
    _ = gc.collect()

print('#'*25)

# Combine all folds and score the full out-of-fold prediction set.
oof = pd.concat(oof_frames, axis=0, ignore_index=True).set_index('customer_ID')
val_score = amex_metric(oof.target.values, oof.oof_pred_cat.values)
print(f"Amex metric: {val_score}")

******* Fold 0 ******* 
0:	learn: 0.6561306	test: 0.6562043	best: 0.6562043 (0)	total: 753ms	remaining: 3h 14m 32s
100:	learn: 0.2358873	test: 0.2386879	best: 0.2386879 (100)	total: 1m 2s	remaining: 2h 38m 5s
200:	learn: 0.2260273	test: 0.2298234	best: 0.2298234 (200)	total: 2m 4s	remaining: 2h 37m 21s
300:	learn: 0.2217062	test: 0.2263292	best: 0.2263292 (300)	total: 3m 5s	remaining: 2h 35m 58s
400:	learn: 0.2187075	test: 0.2242648	best: 0.2242648 (400)	total: 4m 6s	remaining: 2h 34m 45s
500:	learn: 0.2162558	test: 0.2228806	best: 0.2228806 (500)	total: 5m 8s	remaining: 2h 33m 48s
600:	learn: 0.2140899	test: 0.2218356	best: 0.2218356 (600)	total: 6m 9s	remaining: 2h 32m 35s
700:	learn: 0.2122271	test: 0.2211084	best: 0.2211084 (700)	total: 7m 9s	remaining: 2h 31m 12s
800:	learn: 0.2104994	test: 0.2205621	best: 0.2205621 (800)	total: 8m 11s	remaining: 2h 30m 14s
900:	learn: 0.2089388	test: 0.2201036	best: 0.2201036 (900)	total: 9m 12s	remaining: 2h 29m 16s
1000:	learn: 0.2074539	test: 

In [11]:
# Display the combined out-of-fold predictions, indexed by customer_ID.
oof

Unnamed: 0_level_0,target,oof_pred_cat
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
-9223193039457028513,0,0.000581
-9223173911659837606,1,0.921747
-9222614967383752803,0,0.000815
-9222591693961878959,0,0.028656
-9221871557930351993,0,0.026786
...,...,...
9221355249458109492,1,0.978107
9222620632737647928,1,0.861517
9223073742590486866,0,0.001697
9223126093534097186,1,0.916233


In [12]:
# The training table is not used by the remaining cells; drop it and run the
# garbage collector before the raw customer IDs are loaded.
del train
gc.collect()

42

In [13]:
# Attach the original string customer_IDs to the out-of-fold predictions.
# `oof` is indexed by an integer customer_ID; the same key is rebuilt here from
# the raw IDs by parsing their last 16 hex characters and casting to int64.
# NOTE(review): assumes the feature-engineering step derived its integer
# customer_ID with this same hash - confirm against that notebook.
oof_cat = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet', columns=['customer_ID']).drop_duplicates()
oof_cat['customer_ID_hash'] = oof_cat['customer_ID'].str[-16:].apply(int, base=16).astype('int64')
oof_cat = oof_cat.set_index('customer_ID_hash')
# Inner join on the hash index: only customers present in both tables are kept.
oof_cat = oof_cat.merge(oof, left_index=True, right_index=True)
oof_cat = oof_cat.sort_index().reset_index(drop=True)
# Name the file after SEED (as the model files are) so a different seed cannot
# write its results under a stale "seed41" name.
oof_cat.to_csv(f'oof_cat_seed{SEED}.csv',index=False)
oof_cat.head()

Unnamed: 0,customer_ID,target,oof_pred_cat
0,20eac26171c3d251c55fc78204e59fab1c15fc2bc96d0c...,1,0.729132
1,aea50fdf9b974ccec95fa177c3225a0f913483b457de6e...,0,0.000581
2,32cd2d41aef737b69089882754395925c96eaee1f4a859...,0,0.000736
3,8daa6d5dc2655a8a437531e6b8b96829113cdfe9bf6cae...,0,0.026569
4,0ceba351a3851202542feb49d7385bcef32f6037fc57c7...,1,0.921747
