# 1. SETTINGS

In [None]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA, FastICA, FactorAnalysis
import copy
import scipy.stats
import os
import time
import catboost as cb

In [None]:
# helper functions
import functions
from functions import *

In [None]:
# pandas options: never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [None]:
# dark background style: applies matplotlib's 'dark_background' style sheet,
# so every figure created later in the notebook uses it
plt.style.use('dark_background')

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# garbage collection: make sure automatic collection is switched on;
# the cross-validation loop additionally calls gc.collect() after each fold
import gc
gc.enable()

# 2. DATA PREPARATION

In [None]:
# import CSV: load the full data set; the partitioning cell below splits it
# into train and test based on whether the target is missing
df = pd.read_csv('../data/data_v3.csv')
print(df.shape)

In [None]:
# target variable: name of the label column in `df`
# (rows where it is missing are treated as the test set)
target = 'fraud'

In [None]:
# partitioning: rows with a known target form the training set, rows with a
# missing target form the test set that has to be scored.
# Explicit copies are taken because both frames are mutated afterwards (their
# target column is deleted), which should act on independent frames and not
# on filtered selections of `df`.
train = df[df[target].notnull()].copy()
test  = df[df[target].isnull()].copy()
print(train.shape)
print(test.shape)

In [None]:
# target variable: remove the label column from both partitions so it cannot
# be used as a feature; the training labels are kept in `y`, the test column
# (all missing by construction) is simply discarded
test.pop(target)
y = train.pop(target)

# 3. MODELING

### PARAMETERS

In [None]:
# drop bad features: every training column except the excluded ones is used
# as a model input (column order is preserved)
excluded_feats = ['id']
features = train.columns[~train.columns.isin(excluded_feats)].tolist()
print(train.loc[:, features].shape)

In [None]:
# perform PCA
#num_comp = 5
#pca = PCA(n_components = num_comp)
#pca.fit(train[features].values)
#train = pd.DataFrame(pca.transform(train[features].values), columns = ['v' + str(v) for v in range(0, num_comp)])
#test  = pd.DataFrame(pca.transform(test[features].values),  columns = ['v' + str(v) for v in range(0, num_comp)])
#features = list(train.columns)
#print(train[features].shape)
#print(test[features].shape)

In [None]:
### PARAMETERS

# settings
cores = 12   # passed to CatBoost as thread_count
seed  = 23   # random seed for CatBoost and for the CV splitter

# cross-validation
num_folds = 5
shuffle   = True

# number of rounds
max_rounds = 600   # boosting iterations per fold (n_estimators)
stopping   = 600   # NOTE(review): not referenced in the visible cells
verbose    = 200   # NOTE(review): not referenced in the visible cells

# CB parameters (active configuration, CPU)
# The trailing names are the LightGBM parameters the original author mapped
# each CatBoost setting to.
cb_params = {
    'boosting_type':     'Plain',
    'objective':         'Logloss',      # LightGBM: binary_logloss
    'bootstrap_type':    'Bernoulli',
    'subsample':         0.6,            # LightGBM: bagging_fraction
    'rsm':               0.7,            # LightGBM: feature_fraction
    # NOTE(review): originally annotated as "lambda_l1"; CatBoost documents
    # model_size_reg as a model-size regularizer, not an L1 penalty -- confirm
    'model_size_reg':    0,              # previously tried: 0.1
    'l2_leaf_reg':       0,              # LightGBM: lambda_l2
    #'min_split_gain':    0.01, #TODO
    #'min_child_weight':  2, #TODO
    'verbose':           False,          # same effect as 'silent': True
    #'verbosity':         -1,
    #'learning_rate':     0.1,           # left unset: CatBoost picks it itself
    'max_depth':         7,
    # NOTE(review): originally annotated as "num_leaves"; CatBoost documents
    # ctr_leaf_count_limit as a cap on leaves with categorical features, which
    # is not a tree-size limit -- confirm this is the intended parameter
    'ctr_leaf_count_limit':  70,
    #'min_data_in_leaf':   20,           # min_child_samples, not implemented for CPU computation
    'scale_pos_weight':  1,
    'n_estimators':      max_rounds,
    'thread_count' :     cores,          # LightGBM: nthread
    'random_state':      seed,
    'eval_metric' :      CustomMetric(), # custom profit metric (was: prediction_reward)
    # New parameters
    'od_type':           'IncToDec',
    'task_type' : 'CPU',

}

# CB parameters (alternative GPU configuration, disabled on purpose;
# flip the condition to True to use it instead of the CPU set above)
if False:
    cb_params = {
        'boosting_type':     'Plain',
        'objective':         'Logloss',      # LightGBM: binary_logloss
        'bootstrap_type':    'Bernoulli',
        'subsample':         0.9,            # LightGBM: bagging_fraction
        #'rsm':               0.8,           # LightGBM: feature_fraction
        #'model_size_reg':    0.1,
        'l2_leaf_reg':       0.1,            # LightGBM: lambda_l2
        #'min_split_gain':    0.01, #TODO
        #'min_child_weight':  2, #TODO
        'verbose':           False,          # same effect as 'silent': True
        #'verbosity':         -1,
        #'learning_rate':     0.1,           # left unset: CatBoost picks it itself
        'max_depth':         7,
        #'ctr_leaf_count_limit':  70,
        'scale_pos_weight':  1,
        'n_estimators':      max_rounds,
        'thread_count' :     cores,          # LightGBM: nthread
        'random_state':      seed,
        # NOTE(review): verify that a user-defined eval metric is supported
        # with task_type 'GPU' before enabling this configuration
        'eval_metric' :      CustomMetric(), # custom profit metric (was: prediction_reward)
        # New parameters
        'od_type':           'IncToDec',
        'task_type' : 'GPU',
        'grow_policy': 'Depthwise',
        #'min_data_in_leaf':   20,           # min_child_samples, not implemented for CPU computation
    }


# data partitioning
# random_state is only meaningful when shuffling; recent scikit-learn versions
# raise a ValueError if it is set while shuffle is False, so it is passed
# conditionally to keep the `shuffle` switch above usable in both positions.
folds = StratifiedKFold(n_splits     = num_folds,
                        shuffle      = shuffle,
                        random_state = seed if shuffle else None)

In [None]:
# placeholders: containers filled in by the cross-validation loop
clfs         = list()                # fitted classifier of every fold
importances  = pd.DataFrame()        # per-fold feature importances, stacked row-wise
valid_profit = np.zeros(num_folds)   # validation profit of every fold
preds_oof    = np.zeros(len(train))  # out-of-fold predictions for the training rows
preds_test   = np.zeros(len(test))   # fold-averaged predictions for the test rows

In [None]:
# SMOTE settings
# Oversampler that the cross-validation loop applies to the training part of
# every fold.
# - random_state reuses the notebook-wide `seed` instead of repeating the
#   literal, so changing the seed in the parameter cell affects every
#   stochastic step consistently.
# - sampling_strategy = 0.1 is the target minority/majority ratio after
#   resampling (see the imbalanced-learn documentation for float values).
# - NOTE(review): `n_jobs` is deprecated in newer imbalanced-learn releases;
#   drop it if the constructor rejects the argument after an upgrade.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = seed, n_jobs = 10, sampling_strategy = 0.1)

### CROSS-VALIDATION

In [None]:
### CROSS-VALIDATION LOOP
# For every stratified fold:
#   1. scale the features (scaler fitted on the training part only),
#   2. oversample the training part with SMOTE,
#   3. fit CatBoost and pick the iteration with the best custom validation metric,
#   4. store out-of-fold predictions, fold-averaged test predictions, the fold
#      profit and the feature importances.
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):
    
    # data partitioning
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]
    
    ## remove outliers
    #out_idx = (np.abs(scipy.stats.zscore(trn_x)) < 10).all(axis = 1)
    #trn_x = trn_x[out_idx]
    #trn_y = trn_y[out_idx]
    
    # scale data; validation and test rows are transformed with the scaler
    # fitted on the training part of this fold
    scaler   = RobustScaler()
    trn_x    = scaler.fit_transform(trn_x)
    val_x    = pd.DataFrame(scaler.transform(val_x),          columns = features)
    tmp_test = pd.DataFrame(scaler.transform(test[features]), columns = features)

    # augment training data with SMOTE (training part only, never validation)
    # fit_resample is the current imbalanced-learn API; the former alias
    # fit_sample has been removed from the library
    trn_x, trn_y = sm.fit_resample(trn_x, trn_y)
    trn_x = pd.DataFrame(trn_x, columns = features)
    trn_y = pd.Series(trn_y)
    
    # factor decomposition (disabled experiment)
    tmp_features = copy.deepcopy(features)
    if False:
        decomp = FactorAnalysis(n_components = 11)
        decomp.fit(trn_x)
        trn_x = decomp.transform(trn_x)
        val_x = decomp.transform(val_x)
        tmp_test = decomp.transform(tmp_test)
        tmp_features = ['pc'+str(i) for i in range(decomp.n_components)]
    
    # add noise to train to reduce overfitting
    #trn_x += np.random.normal(0, 0.01, trn_x.shape)
    
    # mean target encoding
    #trn_x, val_x, tmp_test = mean_target_encoding(trn_x, val_x, test, features = ['trustLevel'], target = 'fraud', folds = 5)
    #features = [f for f in trn_x.columns if f not in excluded_feats]
        
    # train CatBoost
    print('Custom early stopping: select the best out of %.0f iterations...' % max_rounds)
    clf = cb.CatBoostClassifier(**cb_params) 
    if cb_params.get('task_type') == 'GPU':
        # GPU branch: a single evaluation set is passed
        clf = clf.fit(trn_x, trn_y, 
                      eval_set              = (val_x, val_y)
                     )
    else:
        clf = clf.fit(trn_x, trn_y, 
                      eval_set              = [(trn_x, trn_y), (val_x, val_y)]
                     )
    clfs.append(clf)
    
    # find the best iteration
    # The validation fold is always the LAST evaluation set. Its key in
    # evals_result_ depends on how many sets were passed ('validation_1' for
    # the two-set CPU branch, an unnumbered / '_0' key for the single-set GPU
    # branch), so the key is looked up instead of being hard-coded.
    evals     = clf.evals_result_
    val_key   = sorted(k for k in evals if k.startswith('validation'))[-1]
    # +1 turns the zero-based position into a tree count usable as ntree_end
    best_iter = np.argmax(evals[val_key]['CustomMetric']) + 1

    # predictions of the validation fold, truncated at the best iteration
    # (computed once and reused for the report, the OOF vector and the profit)
    val_preds   = clf.predict_proba(val_x, ntree_end = best_iter)[:, 1]
    fold_reward = prediction_reward(val_y, val_preds)[1]
    print('Best iteration is:')
    print('[' + str(best_iter) + ']   valid_1 profit: ' + 
          str(fold_reward.astype('int')))
       
    # predictions
    preds_oof[val_idx]    = val_preds
    valid_profit[n_fold]  = fold_reward
    # every fold contributes an equal share to the final test prediction
    preds_test           += clf.predict_proba(tmp_test, ntree_end = best_iter)[:, 1] / folds.n_splits 

    ## importance
    fold_importance_df = pd.DataFrame()
    fold_importance_df['Feature'] = tmp_features
    fold_importance_df['Importance'] = clf.feature_importances_
    fold_importance_df['Fold'] = n_fold + 1
    importances = pd.concat([importances, fold_importance_df], axis = 0)
    
    # print performance
    print('----------------------')
    print('FOLD%2d: PROFIT = %.0f' % (n_fold + 1, valid_profit[n_fold]))
    print('----------------------')
    print('')
        
    # clear memory
    del trn_x, trn_y, val_x, val_y
    gc.collect()
    
    # uncomment with mean target encoding
    #features = [f for f in train.columns if f not in excluded_feats]
    
    
# print overall performance (sum of the fold profits)
cv_perf = np.sum(valid_profit)
print('----------------------')
print('TOTAL PROFIT = %.0f' % cv_perf)
print('----------------------')

In [None]:
##### RECHECK PROFIT  
# Re-evaluates the reward on the complete out-of-fold prediction vector as a
# cross-check of the per-fold profits printed above; the return value of
# prediction_reward (helper from `functions`) is shown as the cell output.
prediction_reward(y, preds_oof)

###### TRACKING RESULTS (5 folds, strat = True, seed = 23) for CATBOOST

# V1: implementing CATBOOST: 215 = 85 - 15 + 65 - 10 + 90
# V2: param opt:             250 = 75 + 35 + 55 + 15 + 70



### CUTOFF OPTIMIZATION

In [None]:
##### OPTIMIZE CUTOFF

# set step: size of the grid; candidate cutoffs are 0, 1/step, ..., (step - 1)/step
step = 100

# search: reward of the out-of-fold predictions at every candidate cutoff
cutoffs = [k / step for k in range(step)]
profits = [recompute_reward(y, preds_oof, cutoff = c) for c in cutoffs]

# position of the first cutoff that reaches the maximum profit
best_idx = np.argmax(profits)

# results (the first 10 grid points are left out of the plot only;
# they still take part in the search above)
plt.figure(figsize = (10,4))
sns.lineplot(x = cutoffs[10:], y = profits[10:], color = 'red')
plt.tight_layout()
plt.axvline(x = cutoffs[best_idx], color = 'white', linestyle = '--')
print('- optimal cutoff = %.4f' % cutoffs[best_idx])
print('- optimal profit = %.4f' % profits[best_idx])
plt.savefig('../cutoff_selection_catboost.pdf')

# 4. SUBMISSION

In [None]:
# file name: '<model tag>_<total CV profit as an integer>', shared by the
# out-of-fold and the submission exports below
model = 'cb_v2'
perf  = str(int(round(cv_perf, 0)))
name  = '_'.join([model, perf])

In [None]:
# export OOF preds: one row per training id with its out-of-fold fraud
# probability, written to ../oof_preds/<name>.csv
oof = pd.DataFrame({'id': train['id'], 'fraud': preds_oof})
oof.to_csv('../oof_preds/' + str(name) + '.csv', index = False)
oof.head()  # not rendered: only the last expression of a cell is displayed
name        # shown as the cell output (the run name used for the files)

In [None]:
# check submission: turn the fold-averaged test probabilities into 0/1 labels
# by rounding and preview the result
# NOTE(review): rounding equals a fixed 0.5 threshold; the cutoff found in the
# optimization cell is not applied here -- confirm this is intended
sub = pd.DataFrame({'id':    test['id'],
                    'fraud': np.round(preds_test).astype('int')})
sub.head()

In [None]:
# export submission: only the 'fraud' column is written (no id, no index),
# to ../submissions/<name>.csv
# NOTE: `sub` is rebound to the single-column frame, so the 'id' column is
# gone after this cell has run
sub = sub[['fraud']]
sub.to_csv('../submissions/' + str(name) + '.csv', index = False)
sub.shape

In [None]:
# check correlation with previous submission: share of test rows that get the
# same label as in the previously saved file (compared position by position,
# hence the index reset on the current submission)
prev_sub  = pd.read_csv('../submissions/cb_v1_210.csv')
same_pred = prev_sub[target] == sub.reset_index()[target]
cor = same_pred.sum() / len(sub)
print("Share of the same predictions: " + str(np.round(cor, 6)))