# 1. SETTINGS

In [None]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import scipy.stats
import os
import time

In [None]:
# helper functions
import functions
from functions import *

In [None]:
# pandas options: show every column when a frame is displayed
pd.options.display.max_columns = None

In [None]:
# plotting theme: matplotlib's built-in dark background style sheet
plt.style.use(['dark_background'])

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# garbage collection: make sure the automatic collector is switched on
import gc
if not gc.isenabled():
    gc.enable()

# 2. DATA PREPARATION

In [None]:
# load the raw data set and report its dimensions
df = pd.read_csv(filepath_or_buffer = '../data/data_v1.csv')
print(df.shape)

In [None]:
# target variable: name of the label column in `df`; rows where this column
# is missing are the ones routed to the test partition below
target = 'duration'

In [None]:
# partitioning: labelled rows form the training set, unlabelled rows the test set
train = df[df[target].notnull()]
test  = df[df[target].isnull()]
print(train.shape, test.shape, sep = '\n')

In [None]:
# labels ordered by id, ids of the unlabelled rows, and the distinct label values
y        = train.sort_values(by = 'id')[target]
test_ids = test['id']
classes  = y.unique()
classes

In [None]:
### IMPORT OOF AND TEST PREDS
#
# Builds the level-2 (stacking) data set: for every base-model file found in
# '../oof_preds' the out-of-fold predictions become train columns and the file
# with the same name in '../submissions' becomes the matching test columns.
# NOTE: this overwrites `train` and `test` with the stacked feature frames.

# which model to stack?
model = 'lgb'

# performance threshold
#min_profit = 250

# list names
names = sorted(os.listdir('../oof_preds'))
#names = [n for n in names if int(n[n.rindex('_')+1:-4]) > min_profit]
names = [s for s in names if model in s]
if len(names) == 0:
    raise ValueError("no files containing '" + str(model) + "' found in ../oof_preds")

# collected per-file frames and the id columns they were sorted by
train_parts, test_parts = [], []
tr_id, te_id = None, None

# preprocessing loop
for name in names:

    # load preds
    tmp_tr = pd.read_csv('../oof_preds/'   + str(name))
    tmp_te = pd.read_csv('../submissions/' + str(name))

    # sort OOF preds by ID; the index is reset because pd.concat(axis = 1)
    # aligns rows on index labels, which would otherwise undo the sorting
    if 'id' in tmp_tr:
        tmp_tr = tmp_tr.sort_values('id').reset_index(drop = True)
        cur_id = tmp_tr.pop('id')
        if tr_id is not None and not np.array_equal(tr_id.values, cur_id.values):
            raise ValueError('OOF ids in ' + str(name) + ' do not match the previously loaded files')
        tr_id = cur_id

    # extract test IDs (same index handling as for the OOF preds)
    if 'id' in tmp_te:
        tmp_te = tmp_te.sort_values('id').reset_index(drop = True)
        cur_id = tmp_te.pop('id')
        if te_id is not None and not np.array_equal(te_id.values, cur_id.values):
            raise ValueError('test ids in ' + str(name) + ' do not match the previously loaded files')
        te_id = cur_id

    # rename columns: prefix with the file name, drop the 'class_' marker
    tmp_tr.columns = [str(name) + '_' + str(l.replace('class_', '')) for l in list(tmp_tr.columns)]
    tmp_te.columns = [str(name) + '_' + str(l.replace('class_', '')) for l in list(tmp_te.columns)]

    train_parts.append(tmp_tr)
    test_parts.append(tmp_te)

# cbind data (one concat instead of growing the frames inside the loop)
train = pd.concat(train_parts, axis = 1)
test  = pd.concat(test_parts,  axis = 1)

# put back id
if tr_id is None or te_id is None:
    raise ValueError("none of the loaded prediction files contains an 'id' column")
train.insert(0, column = 'id', value = tr_id)
test.insert(0,  column = 'id', value = te_id)

# display information
print('- Train shape:', train.shape)
print('- Test shape:',  test.shape)

# 3. MODELING

In [None]:
# model inputs: every stacked column except the identifier
excluded_feats = ['id']
features = [col for col in train.columns if col not in excluded_feats]
print(train[features].shape)

In [None]:
### PARAMETERS

# settings
cores = 4
seed  = 3

# cross-validation
num_folds = 5
shuffle   = True

# number of rounds
max_rounds = 1000   # upper bound on boosting iterations (passed as n_estimators)
stopping   = 200    # early-stopping patience, passed to fit()
verbose    = 200    # evaluation logging period, passed to fit()

# LGB parameters
lgb_params = {
    'boosting_type':     'gbdt',
    'objective':         'multiclass',
    'metric':            'multi_logloss',
    'num_class':         len(classes),
    'bagging_fraction':  0.9,
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero
    'bagging_freq':      1,
    'feature_fraction':  0.9,
    'lambda_l1':         0.1,
    'lambda_l2':         0.1,
    'min_split_gain':    0.01,
    'min_child_weight':  1,
    'min_child_samples': 1,
    'silent':            True,
    'verbosity':         -1,
    'learning_rate':     0.05,
    'max_depth':         5,
    #'num_leaves':        70,
    'scale_pos_weight':  1,
    'n_estimators':      max_rounds,
    'nthread' :          cores,
    'random_state':      seed,
}

# data partitioning
# scikit-learn rejects a random_state when shuffle is off, so the seed is only
# forwarded when shuffling is enabled
folds = StratifiedKFold(n_splits     = num_folds,
                        random_state = seed if shuffle else None,
                        shuffle      = shuffle)

In [None]:
# containers filled by the cross-validation loop
clfs       = []                    # fitted classifier of every fold
valid_perf = np.zeros(num_folds)   # validation log loss of every fold

# prediction matrices: one row per observation, one column per class
preds_oof  = np.zeros(shape = (train.shape[0], len(classes)))
preds_test = np.zeros(shape = (test.shape[0],  len(classes)))

In [None]:
### CROSS-VALIDATION LOOP
#
# Fits one LightGBM classifier per fold on the stacked features, stores the
# out-of-fold class probabilities in `preds_oof`, averages the test
# probabilities over folds in `preds_test` and records the validation
# log loss of every fold in `valid_perf`.

# start from a clean slate so that re-running this cell does not accumulate
# test predictions or fitted models from a previous run
del clfs[:]
preds_oof.fill(0)
preds_test.fill(0)
valid_perf.fill(0)

# the test matrix is identical for every fold, so slice it once
test_x = test[features]

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):

    # data partitioning (positional indexing: the fold indices are row positions)
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]

    # train lightGBM
    # NOTE(review): `early_stopping_rounds` and `verbose` are fit() arguments of
    # LightGBM < 4.0; newer releases expect callbacks instead - confirm the
    # installed version before upgrading
    clf = lgb.LGBMClassifier(**lgb_params)
    clf = clf.fit(trn_x, trn_y,
                  eval_set              = [(trn_x, trn_y), (val_x, val_y)],
                  early_stopping_rounds = stopping,
                  verbose               = verbose)
    clfs.append(clf)

    # find the best iteration
    best_iter = clf.best_iteration_

    # save predictions
    # the fold loss is computed on `val_y` (positional slice); `y[val_idx]` would be
    # a label-based lookup on y's original index and could select the wrong rows.
    # `labels` pins the column order of the probabilities to the fitted classes.
    preds_oof[val_idx, :] = clf.predict_proba(val_x, num_iteration = best_iter)
    valid_perf[n_fold]    = log_loss(val_y, preds_oof[val_idx, :], labels = clf.classes_)
    preds_test           += clf.predict_proba(test_x, num_iteration = best_iter) / folds.n_splits

    # print performance
    print('--------------------------------')
    print('FOLD%2d: LOGLOSS = %.6f' % (n_fold + 1, valid_perf[n_fold]))
    print('--------------------------------')
    print('')

    # clear memory
    del trn_x, trn_y, val_x, val_y
    gc.collect()



# print overall performance
cv_perf = np.mean(valid_perf)
print('--------------------------------')
print('MEAN LOGLOSS = %.6f' % cv_perf)
print('--------------------------------')

In [None]:
##### RECHECK PERFORMANCE
# log loss over the complete out-of-fold prediction matrix, rounded for display
np.round(log_loss(y_true = y, y_pred = preds_oof), decimals = 6)


###### TRACKING RESULTS (5 folds, strat = True, seed = 3)

# V1: 2 LGB models:   0.960575

# 4. SUBMISSION

In [None]:
# file name: '<model>_<score digits>', used for the exported OOF / submission files
# NOTE: this rebinds `model`, which the OOF-import cell uses as its file-name
# filter; when running cells out of order, re-run that cell from the top.
model = 'stack_v1'

# fixed-width formatting keeps trailing zeros (0.96 -> '96000' rather than '96');
# for scores below 1 the tag is the first five decimals, otherwise the integer
# part is kept as well instead of being silently dropped
perf_str = '%.6f' % cv_perf
perf     = perf_str[2:7] if perf_str.startswith('0.') else perf_str.replace('.', '')[:-1]
name     = model + '_' + perf

In [None]:
# export OOF preds
#oof = pd.DataFrame({'id': train['id'], 'duration': preds_oof})
#oof.to_csv('../oof_preds/' + str(name) + '.csv', index = False)
#oof.head()

In [None]:
# export submission
#sub = pd.DataFrame({'id': test['id'], 'duration': preds_test})
#sub.to_csv('../submissions/' + str(name) + '.csv', index = False)
#sub.head()

In [None]:
# export OOF preds: id column first, then one probability column per class
oof = pd.DataFrame(data = preds_oof)
oof.insert(loc = 0, column = 'id', value = train['id'].reset_index(drop = True))
oof.to_csv(path_or_buf = '../oof_preds_stack/' + str(name) + '.csv', index = False)
oof.head()

In [None]:
# export submission: id column first, then the fold-averaged class probabilities
sub = pd.DataFrame(data = preds_test)
sub.insert(loc = 0, column = 'id', value = test['id'].reset_index(drop = True))
sub.to_csv(path_or_buf = '../submissions/' + str(name) + '.csv', index = False)
sub.head()

In [None]:
# check correlation with previous submission
#prev_sub = pd.read_csv('../submissions/lgb_v8_375.csv')
#cor = np.sum(prev_sub[target] == sub.reset_index()[target]) / len(sub)
#print("Share of the same predictions: " + str(np.round(cor, 6)))

In [None]:
# check correlation with previous submission:
# correlate every column of the new submission with every column of the old one
# and keep the diagonal of the resulting matrix
prev_sub = pd.read_csv('../submissions/lgb_v1_96790.csv')
pd.Series(np.diag(sub.apply(prev_sub.corrwith)))

In [None]:
# submit to QSCORE; the submission comment is '<comment> - <file name>'
comment = ''
submit_prediction(sub,
                  sep     = ',',
                  index   = False,
                  comment = ' - '.join([str(comment), name]))