# 1. SETTINGS

In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb

import os
import time
import multiprocessing

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

import gc
gc.enable()

# 2. DATA PREPARATION

In [2]:
### IMPORT OOF PREDS

# os.listdir returns entries in arbitrary order, so sort them: the column
# order of full_train (and of every feature list derived from it) is then
# the same on every run and every machine. Hidden entries such as .DS_Store
# or .ipynb_checkpoints are not prediction files and are skipped.
names = sorted(f for f in os.listdir('../oof_preds') if not f.startswith('.'))

# Read each file once. Every file is relabelled as [<file name>, 'target'],
# i.e. it is expected to hold exactly two columns in that order.
oof_frames = []
for name in names:

    tmp = pd.read_csv('../oof_preds/' + str(name))
    tmp.columns = [name, 'target']

    # the target is kept from the first file only
    # NOTE(review): this assumes all files list the same rows in the same
    # order -- nothing here verifies it.
    if oof_frames:
        del tmp['target']

    oof_frames.append(tmp)

# one column-wise concat instead of re-copying the growing frame per file
full_train = pd.concat(oof_frames, axis = 1)

print(full_train.shape)

(1855735, 6)


In [3]:
# split the target column off the frame; full_train keeps only the predictors
y = full_train.pop('target')

In [4]:
# drop every column that holds exactly one distinct (non-missing) value
distinct_counts = full_train.nunique()
full_train = full_train.loc[:, distinct_counts != 1]
full_train.shape

(1855735, 5)

# 3. CROSS-VALIDATION

## STACKING MODEL (LIGHTGBM)

In [5]:
# features = all remaining columns, minus anything listed in excluded_feats
excluded_feats = []
features = [col for col in full_train.columns if col not in excluded_feats]
print(full_train.loc[:, features].shape)

(1855735, 5)


In [6]:
### PARAMETERS

# LGB parameters, passed as keyword arguments to lgb.LGBMRegressor
# NOTE(review): 'subsample' is set without a 'subsample_freq'; per the
# LightGBM docs bagging is only enabled when the frequency is non-zero --
# confirm whether row subsampling is actually intended to be active.
lgb_params = dict(
    boosting_type    = 'gbdt',
    objective        = 'rmse',
    metric           = 'rmse',
    subsample        = 0.9,
    feature_fraction = 0.7,
    lambda_l1        = 0.03,
    lambda_l2        = 0.03,
    min_split_gain   = 0.01,
    min_child_weight = 5,
    silent           = True,
    verbosity        = -1,
    learning_rate    = 0.03,
    max_depth        = 3,
    n_estimators     = 1000,
    nthread          = 16,
)

# loss function
def rmse(y_true, y_pred):
    """Return the square root of sklearn's mean_squared_error(y_true, y_pred)."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# validation scheme: 5 shuffled folds with a fixed seed
folds = KFold(n_splits = 5, shuffle = True, random_state = 42)

In [7]:
### CROSS-VALIDATION LOOP
# Trains one LGBMRegressor per fold, keeps every fitted model in `clfs`, and
# collects the held-out predictions / targets to score the whole OOF set.

# create objects
clfs = []
oof_preds = []
oof_reals = []

# the feature columns are the same for every fold, so slice them once
train_x = full_train[features]

# modeling loop
start  = time.time()
for fold_, (trn_, val_) in enumerate(folds.split(full_train)):

    # data partitioning (positional row indices from KFold)
    trn_x, trn_y = train_x.iloc[trn_], y.iloc[trn_]
    val_x, val_y = train_x.iloc[val_], y.iloc[val_]

    # train the model; early stopping monitors the last eval_set entry,
    # i.e. the validation fold
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords were
    # removed in LightGBM 4.0 in favour of callbacks -- confirm the installed
    # version before upgrading.
    clf = lgb.LGBMRegressor(**lgb_params)
    clf.fit(
        trn_x, trn_y,
        eval_set              = [(trn_x, trn_y), (val_x, val_y)],
        eval_metric           = 'rmse',
        verbose               = 50,
        early_stopping_rounds = 50
    )
    clfs.append(clf)

    # OOF predictions: score the held-out fold once and reuse the result
    # for both the OOF collection and the per-fold printout
    val_preds = clf.predict(val_x, num_iteration = clf.best_iteration_)
    oof_preds.append(val_preds)
    oof_reals.append(val_y.values)

    # feedback
    print('-------------------------------------')
    print('Fold ' + str(fold_ + 1) + ': RMSE = ' + str(round(rmse(val_y, val_preds), 6)))
    print('-------------------------------------')
    print('')

    # clean up
    gc.collect()

# OOF predictions as flat 1-D arrays.
# Rows are in fold order (fold 1's validation rows first, then fold 2's, ...),
# not in the original row order of full_train; preds and reals stay paired.
oof_preds = np.concatenate(oof_preds)
oof_reals = np.concatenate(oof_reals)
oof_preds_df = pd.DataFrame({'pred': oof_preds, 'real': oof_reals})

# print performance
cv_perf = rmse(oof_reals, oof_preds)
print('')
print('OOF RMSE: %.6f ' % cv_perf)
print('Done in %6.1f minutes' % ((time.time() - start) / 60))

Training until validation scores don't improve for 50 rounds.
[50]	training's rmse: 0.282617	valid_1's rmse: 0.282356
Early stopping, best iteration is:
[47]	training's rmse: 0.282663	valid_1's rmse: 0.282349
-------------------------------------
Fold 1: RMSE = 0.282349
-------------------------------------

Training until validation scores don't improve for 50 rounds.
[50]	training's rmse: 0.283715	valid_1's rmse: 0.277463
[100]	training's rmse: 0.283129	valid_1's rmse: 0.277403
[150]	training's rmse: 0.282776	valid_1's rmse: 0.277412
Early stopping, best iteration is:
[128]	training's rmse: 0.282923	valid_1's rmse: 0.277391
-------------------------------------
Fold 2: RMSE = 0.277391
-------------------------------------

Training until validation scores don't improve for 50 rounds.
[50]	training's rmse: 0.275136	valid_1's rmse: 0.30997
Early stopping, best iteration is:
[40]	training's rmse: 0.275334	valid_1's rmse: 0.309939
-------------------------------------
Fold 3: RMSE = 0.30

In [8]:
###### TRACKING RESULTS (OOF RMSE of the stacker, by number of stacked models)

# 3 LGB models:  0.282905
# 4 LGB models:  0.282833
# 5 LGB models:  0.282808

In [9]:
# calibration of preds: floor negative OOF predictions at zero and compare
# the RMSE with and without the floor
preds = pd.Series(oof_preds).clip(lower = 0)
print('RMSE before: %.6f ' % rmse(oof_reals, oof_preds))
print('RMSE after:  %.6f ' % rmse(oof_reals, preds))

RMSE before: 0.282808 
RMSE after:  0.282808 


# 4. PREDICTIONS

In [10]:
### IMPORT TEST PREDS

# One submission file per stacked model, looked up under the same file names
# (and in the same order) as the OOF files, so the test columns carry the
# same labels as the training features.
# NOTE(review): fullVisitorId is parsed with pandas' default dtype inference;
# if the IDs can carry leading zeros or exceed the int64 range they should be
# read as strings -- confirm against the submission format.
test_frames = []
ref_ids = None
for name in names:

    tmp = pd.read_csv('../submissions/' + str(name))
    tmp.columns = ['fullVisitorId', name]

    # The files are joined purely by row position, so every file must list
    # the same visitors in the same order; fail loudly instead of silently
    # pairing predictions that belong to different visitors.
    cur_ids = tmp['fullVisitorId'].astype(str).values
    if ref_ids is None:
        ref_ids = cur_ids
    else:
        if not np.array_equal(cur_ids, ref_ids):
            raise ValueError('fullVisitorId rows in ' + str(name) + ' do not match ' + str(names[0]))
        del tmp['fullVisitorId']

    test_frames.append(tmp)

# one column-wise concat instead of re-copying the growing frame per file
full_test = pd.concat(test_frames, axis = 1)

full_test.shape

(296530, 6)

In [11]:
### PREDICT
# Averages the test predictions of all fold models stored in `clfs`.

# make predictions
start  = time.time()

# the feature columns are the same for every model, so slice them once
test_x = full_test[features]

# average over the models actually present in clfs, so the result is a true
# mean even if clfs does not hold exactly folds.n_splits models
n_models = len(clfs)

preds = None
for clf in clfs:
    cur_test_preds = clf.predict(test_x, num_iteration = clf.best_iteration_)
    # negative predictions are floored at zero before averaging
    cur_test_preds = np.clip(cur_test_preds, 0, None)
    if preds is None:
        preds =  cur_test_preds / n_models
    else:
        preds += cur_test_preds / n_models

# clean up
gc.collect()
print('Done in %5.1f minutes' % ((time.time() - start) / 60))

Done in   0.0 minutes


In [12]:
# assemble the submission frame: visitor id first, averaged prediction second
sub = pd.DataFrame({
    'fullVisitorId':       full_test['fullVisitorId'].values,
    'predictedLogRevenue': preds,
})
print('Predictions shape: ', sub.shape)

Predictions shape:  (296530, 2)


In [13]:
# file name: '<model>_<six fractional digits of the OOF RMSE>'
model = 'stacking'
# Fixed-point formatting always yields six fractional digits; slicing
# str(round(...)) instead would drop trailing zeros, and would break on
# scientific notation or on scores with more than one integer digit.
perf  = ('%.6f' % cv_perf).partition('.')[2]
name  = model + '_' + perf

In [14]:
# write the submission next to the base-model submissions, without the index
out_path = '../submissions/' + str(name) + '.csv'
sub.to_csv(out_path, index = False)
sub.shape

(296530, 2)