# 1. SETTINGS

In [None]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import scipy.stats
import os
import time

In [None]:
# helper functions
import functions
from functions import *

In [None]:
# pandas options: never truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [None]:
# dark background style: apply matplotlib's 'dark_background' style sheet to
# all figures created after this point
plt.style.use('dark_background')

In [None]:
# ignore warnings: installs a blanket 'ignore' filter, so every warning raised
# from here on (including deprecation warnings from any library) is hidden
import warnings
warnings.filterwarnings('ignore')

In [None]:
# garbage collection: make sure automatic collection is switched on
# (it is on by default in CPython, so this only matters if it was disabled)
import gc
gc.enable()

# 2. DATA PREPARATION

In [None]:
### extract test IDs

# load the full dataset
df = pd.read_csv('../data/data_v3.csv')

# rows with a missing label make up the test set; only their IDs are kept,
# so no intermediate `test` frame is left behind
target = 'fraud'
test_ids = df.loc[df[target].isnull(), 'id']

In [None]:
### IMPORT PREDS

# every file in the submissions folder is read as one set of test predictions
names = sorted(os.listdir('../submissions'))

# load each file as a single column named after the file
preds_te = []
for name in names:

    # load preds
    tmp_te = pd.read_csv('../submissions/' + str(name))

    # raises if the file does not contain exactly one column
    tmp_te.columns = [name]
    preds_te.append(tmp_te)

# cbind data once, after the loop: concatenating inside the loop copies the
# growing frame on every iteration and needed a `name == names[0]` special case
test = pd.concat(preds_te, axis = 1)

# display information
print('- Data shape:',  test.shape)

In [None]:
# import CSV: load the full dataset and show its (rows, columns) shape
df = pd.read_csv('../data/data_v3.csv')
print(df.shape)

In [None]:
# target variable: name of the label column; rows where it is missing are
# treated as the test set in the partitioning step
target = 'fraud'

In [None]:
# partitioning: labelled rows go to train, unlabelled rows go to test
has_label = df[target].notnull()
train = df[has_label]
test  = df[~has_label]
print(train.shape, test.shape, sep = '\n')

In [None]:
# keep the training labels and the test-row IDs, then drop both frames;
# `train` and `test` are rebuilt from model predictions in the next step
y = train[target]
test_ids = test['id']
del train, test

In [None]:
### IMPORT OOF PREDS

# profit threshold
min_profit = 250

# list names; the integer between the last '_' and the final four characters
# (the file extension) is read as the model's profit, and only models whose
# profit exceeds the threshold are kept
names = sorted(os.listdir('../stage3_oof_preds'))
names = [n for n in names if int(n[n.rindex('_')+1:-4]) > min_profit]

# load the OOF (train) and test predictions of every kept model
preds_tr = []
preds_te = []
for name in names:

    # load preds: the same file name is looked up in both folders
    tmp_tr = pd.read_csv('../stage3_oof_preds/' + str(name))
    tmp_te = pd.read_csv('../submissions/' + str(name))

    # drop the ID column from the OOF preds if present
    # (rows are NOT re-sorted; they are used in file order)
    if 'id' in tmp_tr:
        del tmp_tr['id']

    # rename columns: one column per model, named after its file
    tmp_tr.columns = [name]
    tmp_te.columns = [name]

    preds_tr.append(tmp_tr)
    preds_te.append(tmp_te)

# cbind data once, after the loop: concatenating inside the loop copies the
# growing frames on every iteration and needed a `name == names[0]` special case
train = pd.concat(preds_tr, axis = 1)
test  = pd.concat(preds_te, axis = 1)

# display information
print('- Train shape:', train.shape)
print('- Test shape:',  test.shape)

# 3. BLENDING

In [None]:
# keep best model per seed
models = []
for seed in range(1000, 1010):

    # models whose column name contains the seed (plain substring match)
    seed_models = [x for x in list(train.columns) if str(seed) in x]

    # a seed can lose all of its models to the profit threshold; report it
    # and move on instead of failing on an empty list
    if not seed_models:
        print('- No models found for seed', seed, '- skipped')
        continue

    # compare profits as integers, parsed the same way as in the OOF import
    # filter (digits between the last '_' and the extension); comparing the
    # fixed 3-character slices as strings is only right for 3-digit profits.
    # max() returns the first maximum, so ties resolve as before.
    best_model = max(seed_models, key = lambda x: int(x[x.rindex('_')+1:-4]))
    models.append(best_model)
models

In [None]:
# median ensemble: row-wise median over the selected models' predictions
# (despite the `means_` prefix, these arrays hold medians)
means_tr = train[models].median(axis = 1).values
means_te = test[models].median(axis = 1).values

In [None]:
# check profit on training data: score the blended OOF predictions, rounded to
# the nearest integer, against the training labels.
# `prediction_reward` comes from the project's `functions` module; presumably
# element [1] of its return value is the profit - confirm there.
prediction_reward(y, np.round(means_tr))[1]

# results noted for the two blending variants (presumably from earlier runs):
# mean:   profit = 410
# median: profit = 450

# 4. SUBMISSION

In [None]:
# file name: base name (without extension) used when the submission is exported
name = 'median_ensemble'

In [None]:
# check submission: pair each test ID with its blended prediction, rounded to
# the nearest integer, and preview the first rows
sub = (
    pd.DataFrame({'id': test_ids, 'fraud': means_te})
      .assign(fraud = lambda d: d['fraud'].round().astype('int'))
)
sub.head()

In [None]:
# export submission: keep only the prediction column (the id column is
# dropped), write it without the index, then show the exported shape
sub = sub[['fraud']]
sub.to_csv('../submissions/' + str(name) + '.csv', index = False)
sub.shape

In [None]:
# compare with the best individual submission: `cor` is the share of rows
# where both files predict the same class (an agreement rate, not a
# correlation coefficient)
prev_sub = pd.read_csv('../submissions/lgb_v8_375.csv')

# `sub` still carries the original row labels, so renumber from 0 to line up
# with the freshly loaded file before comparing
same_pred = prev_sub[target] == sub[target].reset_index(drop = True)
cor = same_pred.mean()
print("Share of the same predictions: " + str(np.round(cor, 6)))