# 1. SETTINGS

In [43]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import scipy.stats
from scipy.stats.mstats import gmean
import os
import time

In [44]:
# helper functions
import functions
from functions import *

In [45]:
# pandas options
pd.set_option('display.max_columns', None)

In [46]:
# dark background style
plt.style.use('dark_background')

In [47]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [48]:
# garbage collection
import gc
gc.enable()

# 2. DATA PREPARATION

In [49]:
# import CSV
df = pd.read_csv('../data/data_v2.csv')
print(df.shape)

(11840, 154)


In [50]:
# target variable
target = 'duration'

In [51]:
# partitioning
train = df[df[target].isnull() == False]
test  = df[df[target].isnull() == True]
print(train.shape)
print(test.shape)

(8880, 154)
(2960, 154)


In [52]:
# target variable
y = train.sort_values('id')[target]
test_ids = test['id']
classes = y.unique()
classes

array([2., 1., 0.])

In [53]:
### IMPORT OOF AND TEST PREDS

# which model to stack?
model = 'lgb'

# list names
names = sorted(os.listdir('../oof_preds_stack2'))
names = [s for s in names if model in s]

# preprocessing loop
for name in names:

    # load preds
    tmp_tr = pd.read_csv('../oof_preds_stack2/'   + str(name))
    tmp_te = pd.read_csv('../submissions/' + str(name))

    # sort OOF preds by ID
    if 'id' in tmp_tr:
        tmp_tr = tmp_tr.sort_values('id')
        tr_id  = tmp_tr['id']
        del tmp_tr['id']
        
    # extract test IDs
    if 'id' in tmp_te:
        tmp_te = tmp_te.sort_values('id')
        te_id  = tmp_te['id']
        del tmp_te['id']
        
    # rename columns
    tmp_tr.columns = [str(name) + '_' + str(l.replace('class_', '')) for l in list(tmp_tr.columns)]    
    tmp_te.columns = [str(name) + '_' + str(l.replace('class_', '')) for l in list(tmp_te.columns)]  

    # cbind data
    if name == names[0]:     
        train = tmp_tr 
        test  = tmp_te
    else:
        train = pd.concat([train, tmp_tr], axis = 1)
        test  = pd.concat([test,  tmp_te], axis = 1)
        

# put back id
train.insert(0, column = 'id', value = tr_id)
test.insert(0,  column = 'id', value = te_id)
        
# display information
print('- Train shape:', train.shape)
print('- Test shape:',  test.shape)

- Train shape: (8880, 7)
- Test shape: (2960, 7)


# 3. BLENDING

In [54]:
# average probs for differnt classes
for cl in ['_0', '_1', '_2']:
    
    # train
    preds = [l for l in list(train.columns) if l.endswith(cl)]
    blend = train[preds].mean(axis = 1)
    train['blend_' + cl] = blend
    
    # test
    preds = [l for l in list(test.columns) if l.endswith(cl)]
    blend = test[preds].mean(axis = 1)
    test['blend_' + cl] = blend
    
# extract predictions
preds_oof  = train.filter(like = 'blend', axis = 1)
preds_test = test.filter(like  = 'blend', axis = 1)

In [55]:
# try different ensembles
#from scipy.stats.mstats import gmean
#armean_tr = np.array(train[models].mean(axis    = 1))
#median_tr = np.array(train[models].median(axis  = 1))
#gemean_tr = gmean(np.array(train[models]), axis = 1)
#min_tr    = np.array(train[models].min(axis     = 1))
#max_tr    = np.array(train[models].max(axis     = 1))

In [56]:
# check performance
cv_perf = np.round(log_loss(y, preds_oof), 6)
cv_perf

0.950909

# 4. SUBMISSION

In [57]:
# file name
model = 'blend'
perf  = str(round(cv_perf, 6))[2:7]
name  = model + '_' + perf

In [58]:
# export submission
#sub = pd.DataFrame({'id': test['id'], 'duration': preds_test})
#sub.to_csv('../submissions/' + str(name) + '.csv', index = False)
#sub.head()

In [62]:
# export submission
sub = pd.DataFrame(preds_test)
sub.insert(0, column = 'id', value = test['id'].reset_index(drop = True))
sub.columns = ['id', '0', '1', '2']
sub.to_csv('../submissions/' + str(name) + '.csv', index = False)
sub.head()

Unnamed: 0,id,0,1,2
0,0,0.221163,0.345744,0.433093
1,1,0.402359,0.23782,0.359821
2,2,0.215629,0.763349,0.021022
3,3,0.190338,0.271914,0.537748
4,4,0.556162,0.217798,0.22604


In [63]:
# check correlation with previous submission
#prev_sub = pd.read_csv('../submissions/lgb_v8_375.csv')
#cor = np.sum(prev_sub[target] == sub.reset_index()[target]) / len(sub)
#print("Share of the same predictions: " + str(np.round(cor, 6)))

In [64]:
# check correlation with previous submission
prev_sub = pd.read_csv('../submissions/lgb_v1_96790.csv')
pd.Series(np.diag(sub.apply(lambda x: prev_sub.corrwith(x))))

FileNotFoundError: [Errno 2] File b'../submissions/lgb_v1_96790.csv' does not exist: b'../submissions/lgb_v1_96790.csv'

In [65]:
# submit to QSCORE
comment = ''
submit_prediction(sub, sep = ',', index = False, comment = str(comment) + ' - ' + name)