# 1. SETTINGS

In [None]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import scipy.stats
import os
import time

In [None]:
# helper functions
import functions
from functions import *

In [None]:
# pandas options: never truncate the column list when a frame is displayed
pd.options.display.max_columns = None

In [None]:
# dark background style: select matplotlib's 'dark_background' style sheet
# for the plots drawn in this notebook
plt.style.use('dark_background')

In [None]:
# ignore warnings: install a blanket 'ignore' filter so no warning is shown.
# NOTE(review): this also hides deprecation warnings from pandas / sklearn /
# lightgbm — consider narrowing the filter when debugging.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# garbage collection: make sure automatic collection is switched on
import gc
gc.enable()

# 2. DATA PREPARATION

In [None]:
# import CSV: load the prepared data set (path is relative to the working
# directory of the notebook) and print its (rows, columns) shape
df = pd.read_csv('../data/data_v1.csv')
print(df.shape)

In [None]:
# target variable: name of the label column in df; rows where it is missing
# are split off as the test partition in the next cell
target = 'duration'

In [None]:
# partitioning: rows with a known target form the training set, rows with a
# missing target form the test set. The mask is computed once and negated
# instead of comparing a boolean Series against True / False.
has_target = df[target].notnull()
train = df[has_target]
test  = df[~has_target]
print(train.shape)
print(test.shape)

In [None]:
# target variable: labels of the training rows, ordered by id
y = train.sort_values(by = 'id').loc[:, target]

# ids of the rows that have to be predicted
test_ids = test.loc[:, 'id']

# distinct label values found in y (shown as the cell output)
classes = pd.unique(y)
classes

In [None]:
### IMPORT OOF AND TEST PREDS
#
# Builds `train` (out-of-fold predictions) and `test` (test predictions) by
# placing the prediction columns of every selected model file side by side.
# Each file name is read from both '../oof_preds_stack2/' and '../submissions/'.

# which model to stack?
model = 'stack'

# performance threshold
#min_profit = 250

# list names: keep every file whose name contains the model tag
names = sorted(os.listdir('../oof_preds_stack2'))
#names = [n for n in names if int(n[n.rindex('_')+1:-4]) > min_profit]
names = [s for s in names if model in s]
if not names:
    # without this guard the raw data partitions would silently stay in
    # `train` / `test` and the cell would fail later with a NameError
    raise ValueError("no prediction files containing '" + model + "' found in ../oof_preds_stack2")

# per-file frames are collected here and concatenated once after the loop
tr_parts = []
te_parts = []
tr_id = None
te_id = None

# preprocessing loop
for name in names:

    # load preds
    tmp_tr = pd.read_csv('../oof_preds_stack2/' + str(name))
    tmp_te = pd.read_csv('../submissions/' + str(name))

    # sort OOF preds by ID. The index is reset afterwards because
    # pd.concat(axis = 1) aligns rows on index labels: keeping the pre-sort
    # labels would undo the sort and could pair predictions of different IDs
    # when the files do not share the same row order.
    if 'id' in tmp_tr:
        tmp_tr = tmp_tr.sort_values('id').reset_index(drop = True)
        ids = tmp_tr.pop('id')
        if tr_id is not None and not np.array_equal(ids.values, tr_id.values):
            raise ValueError('OOF ids in ' + str(name) + ' differ from the previous files')
        tr_id = ids

    # extract test IDs (same sorting / re-indexing as for the OOF preds)
    if 'id' in tmp_te:
        tmp_te = tmp_te.sort_values('id').reset_index(drop = True)
        ids = tmp_te.pop('id')
        if te_id is not None and not np.array_equal(ids.values, te_id.values):
            raise ValueError('test ids in ' + str(name) + ' differ from the previous files')
        te_id = ids

    # rename columns to '<file name>_<class>' so they stay unique across models
    tmp_tr.columns = [str(name) + '_' + str(l.replace('class_', '')) for l in list(tmp_tr.columns)]
    tmp_te.columns = [str(name) + '_' + str(l.replace('class_', '')) for l in list(tmp_te.columns)]

    tr_parts.append(tmp_tr)
    te_parts.append(tmp_te)

# the ids are required below; fail with a clear message instead of a NameError
if tr_id is None or te_id is None:
    raise ValueError("prediction files must contain an 'id' column")

# cbind data (single concat instead of growing the frame file by file)
train = pd.concat(tr_parts, axis = 1)
test  = pd.concat(te_parts, axis = 1)

# put back id
train.insert(0, column = 'id', value = tr_id)
test.insert(0,  column = 'id', value = te_id)

# display information
print('- Train shape:', train.shape)
print('- Test shape:',  test.shape)

# 3. BLENDING

In [None]:
# keep best model per seed: for every seed 1000..1009 pick, among the columns
# of `train` whose name contains the seed, the one with the largest score tag.
# NOTE(review): the score tag is characters [-7:-4] of the column name and the
# tags are compared as text, not as numbers — confirm that this slice still
# matches the current column naming (file name + '_' + class suffix).
# NOTE(review): np.argmax fails on an empty list, so every seed needs at least
# one matching column.
models = []
for seed in range(1000, 1010):
    seed_tag   = str(seed)
    candidates = [col for col in train.columns if seed_tag in col]
    score_tags = [col[-7:-4] for col in candidates]
    models.append(candidates[np.argmax(score_tags)])
models

In [None]:
# average probs for different classes: for each class suffix, the blended
# probability is the row-wise mean of every column ending with that suffix.
# NOTE(review): all matching columns are averaged; the `models` list built in
# the previous cell is not consulted here — confirm this is intended.
for cl in ['_0', '_1', '_2']:

    # output column; cl already starts with '_', so this is e.g. 'blend__0'
    blend_col = 'blend_' + cl

    # train (the output column is excluded so that re-running the cell does
    # not fold the previous blend into the new average)
    preds = [l for l in train.columns if l.endswith(cl) and l != blend_col]
    blend = train[preds].mean(axis = 1)
    train[blend_col] = blend

    # test
    preds = [l for l in test.columns if l.endswith(cl) and l != blend_col]
    blend = test[preds].mean(axis = 1)
    test[blend_col] = blend

# extract predictions: every column whose name contains 'blend'
preds_oof  = train.filter(like = 'blend', axis = 1)
preds_test = test.filter(like  = 'blend', axis = 1)

In [None]:
# try different ensembles
#from scipy.stats.mstats import gmean
#armean_tr = np.array(train[models].mean(axis    = 1))
#median_tr = np.array(train[models].median(axis  = 1))
#gemean_tr = gmean(np.array(train[models]), axis = 1)
#min_tr    = np.array(train[models].min(axis     = 1))
#max_tr    = np.array(train[models].max(axis     = 1))

In [None]:
# check performance: log loss of the blended OOF probabilities against the
# true labels, rounded to 6 decimals (shown as the cell output).
# NOTE(review): log_loss pairs rows by position, so preds_oof has to be in the
# same id-sorted row order as y — confirm.
cv_perf = np.round(log_loss(y, preds_oof), 6)
cv_perf

# 4. SUBMISSION

In [None]:
# file name: '<model>_<score digits>', where the digits are the first five
# decimals of the CV score (e.g. 0.9679 -> 'blend_96790').
# Fixed-point formatting is used instead of slicing str(round(...)), which
# dropped trailing zeros ('0.9679' -> '9679') and broke on scientific notation.
model = 'blend'
whole, frac = '{:.6f}'.format(cv_perf).split('.')
# scores below 1 keep the original decimals-only tag; for scores >= 1 the
# integer part is prepended so that e.g. 1.02345 and 0.02345 do not collide
perf  = frac[:5] if whole == '0' else whole + frac[:5]
name  = model + '_' + perf

In [None]:
# export submission
#sub = pd.DataFrame({'id': test['id'], 'duration': preds_test})
#sub.to_csv('../submissions/' + str(name) + '.csv', index = False)
#sub.head()

In [None]:
# export submission: one probability column per class plus the row id.
# preds_test only holds the three blended class columns, so they are renamed
# to the class labels and the id column is inserted from `test` (both frames
# share the same index). Assigning four names to the three-column frame, as
# before, raised a length-mismatch error and never exported the ids.
sub = preds_test.copy()
sub.columns = ['0', '1', '2']
sub.insert(0, 'id', test['id'])
sub.to_csv('../submissions/' + str(name) + '.csv', index = False)
sub.head()

In [None]:
# check correlation with previous submission
#prev_sub = pd.read_csv('../submissions/lgb_v8_375.csv')
#cor = np.sum(prev_sub[target] == sub.reset_index()[target]) / len(sub)
#print("Share of the same predictions: " + str(np.round(cor, 6)))

In [None]:
# check correlation with previous submission
prev_sub = pd.read_csv('../submissions/lgb_v1_96790.csv')

# correlate every column of the new submission with every column of the old one
corr_table = sub.apply(lambda col: prev_sub.corrwith(col))

# the diagonal pairs columns by position (first with first, second with
# second, ...); assumes both files use the same column order — TODO confirm
pd.Series(np.diag(corr_table))

In [None]:
# submit to QSCORE: hand the submission frame to the project helper, labelled
# with '<comment> - <file name>'
comment = ''
submission_label = ' - '.join([str(comment), name])
submit_prediction(sub, sep = ',', index = False, comment = submission_label)