# 1. SETTINGS

In [None]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA, FastICA, FactorAnalysis
from sklearn.metrics import log_loss
from sklearn.neighbors import NearestNeighbors
import copy
import scipy.stats
import os
import time
import requests

In [None]:
# pandas options: do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [None]:
# dark background style: applies matplotlib's 'dark_background' style sheet
# to the figures created after this cell
plt.style.use('dark_background')

In [None]:
# ignore warnings: silences every warning category for the rest of the session
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so API removals will only show up as hard errors
import warnings
warnings.filterwarnings('ignore')

In [None]:
# garbage collection: gc.collect() is called explicitly after every CV fold
# NOTE: automatic collection is already enabled by default in CPython, so
# gc.enable() only matters if something switched it off earlier
import gc
gc.enable()

In [None]:
# helper functions
import functions
from functions import *

# 2. DATA PREPARATION

In [None]:
# import CSV: a single table; rows with and without a target value are
# separated into train and test in the partitioning step
df = pd.read_csv('../data/data_v2.csv')
# sanity check: (rows, columns) of the raw table
print(df.shape)

In [None]:
# target variable: name of the column to predict
# (rows where this column is missing are used as the test set)
target = 'granted_number_of_nights'

In [None]:
# partitioning: rows with a known target are the training set,
# rows with a missing target are the test set to be predicted
has_target = df[target].notnull()
train = df[has_target]
test  = df[~has_target]
print(train.shape)
print(test.shape)

In [None]:
# target variable
# The labels are read from df rather than from train, and the column is removed
# with a non-mutating drop (errors = 'ignore'). The previous in-place
# `del train[target], test[target]` made this cell fail with a KeyError when it
# was run a second time, and it mutated frames that are slices of df.
# train is a row subset of df, so train.index selects exactly the training rows
# (assumes df has a unique index, which is the read_csv default).
y     = df.loc[train.index, target]
train = train.drop(columns = [target], errors = 'ignore')
test  = test.drop(columns = [target], errors = 'ignore')

# distinct target values; len(classes) is used as the number of classes below
classes = y.unique()
classes

# 3. MODELING

### PARAMETERS

In [None]:
# drop bad features: the columns listed here (identifier and creation-date
# columns, judging by their names) are never passed to the model
excluded_feats = ['request_id', 'group_main_requester_id', 'request_backoffice_creator_id',
                  'answer_creation_date', 'group_creation_date', 'request_creation_date']

# keep every remaining column, in its original order
keep_mask = ~train.columns.isin(excluded_feats)
features  = train.columns[keep_mask].tolist()
print(train[features].shape)

In [None]:
### PARAMETERS

# settings
# cores is passed to LightGBM as 'nthread'; seed is used both for the model
# ('random_state') and for the fold shuffling
cores = -1
seed  = 23

# cross-validation
num_folds = 5
shuffle   = True

# number of rounds
# max_rounds -> 'n_estimators', stopping -> early_stopping_rounds,
# verbose -> evaluation logging interval passed to fit()
max_rounds = 1000
stopping   = 200
verbose    = 100

# LGB parameters
# 'num_class' is derived from the distinct target values found in the training data.
# NOTE(review): with max_depth = 5 a tree can have at most 2**5 = 32 leaves, so
#               num_leaves = 50 is presumably never the binding limit - confirm intent.
# NOTE(review): 'silent' and 'verbosity' both appear to control logging, and
#               'scale_pos_weight' looks like a binary-classification setting;
#               verify against the installed LightGBM version.
lgb_params = {
    'boosting_type':     'gbdt',
    'objective':         'multiclass',
    'metric':            'multi_logloss',
    'num_class':         len(classes),
    'bagging_fraction':  0.9,
    'feature_fraction':  0.9,
    'lambda_l1':         0.1,
    'lambda_l2':         0.1,
    'min_split_gain':    0,
    'min_child_weight':  0.1,
    'min_child_samples': 20,
    'silent':            True,
    'verbosity':         -1,
    'learning_rate':     0.1,
    'max_depth':         5,
    'num_leaves':        50,
    'scale_pos_weight':  1,
    'n_estimators':      max_rounds,
    'nthread' :          cores,
    'random_state':      seed,
}

# data partitioning: stratified folds, split on (train, y) in the CV loop
folds = StratifiedKFold(n_splits = num_folds, random_state = seed, shuffle = shuffle)

In [None]:
# placeholders filled in by the cross-validation loop
n_classes = len(classes)

# fitted models, per-fold validation scores and per-fold feature importances
clfs        = list()
valid_perf  = np.zeros(shape = num_folds)
importances = pd.DataFrame()

# predicted class probabilities: one row per sample, one column per class
preds_oof  = np.zeros(shape = (train.shape[0], n_classes))
preds_test = np.zeros(shape = (test.shape[0],  n_classes))

In [None]:
# SMOTE settings
#from imblearn.over_sampling import SMOTE
#sm = SMOTE(random_state = 23, n_jobs = 10, sampling_strategy = 0.1)

### CROSS-VALIDATION

In [None]:
### CROSS-VALIDATION LOOP
# For every stratified fold: fit LightGBM on the training part, store the
# out-of-fold class probabilities, add the fold's share of the test prediction
# and record the weighted log-loss and the feature importances.

# (Re)create the accumulators inside this cell: preds_test is built up with +=
# and the models / importances are appended, so running the cell a second time
# on leftover state would double the test predictions and duplicate the rest.
clfs             = []
valid_perf       = np.zeros(num_folds)
preds_oof        = np.zeros((len(train), len(classes)))
preds_test       = np.zeros((len(test),  len(classes)))
fold_importances = []

# the test features are the same for every fold, so slice them only once
test_x = test[features]

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):
    
    # data partitioning
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]
    
    ## remove outliers
    #out_idx = (np.abs(scipy.stats.zscore(trn_x)) < 10).all(axis = 1)
    #trn_x = trn_x[out_idx]
    #trn_y = trn_y[out_idx]
    
    # scale data
    #scaler   = RobustScaler()
    #trn_x    = pd.DataFrame(scaler.fit_transform(trn_x), columns = features)
    #val_x    = pd.DataFrame(scaler.transform(val_x),     columns = features)
    #tmp_test = pd.DataFrame(scaler.transform(test_x),    columns = features)

    # augment training data with SMOTE
    #trn_x, trn_y = sm.fit_sample(trn_x, trn_y)
    #trn_x = pd.DataFrame(trn_x, columns = features)
    #trn_y = pd.Series(trn_y)
       
    # add noise to train to reduce overfitting
    #trn_x += np.random.normal(0, 0.01, trn_x.shape)
        
    # train lightGBM
    # every sample is weighted by 10**target, for fitting and for both eval sets,
    # which is the same weighting used by the log-loss computed below
    # NOTE(review): early_stopping_rounds / verbose as fit() arguments are only
    #               accepted by older LightGBM releases - confirm the pinned version
    clf = lgb.LGBMClassifier(**lgb_params) 
    clf = clf.fit(trn_x, trn_y, 
                  eval_set              = [(trn_x, trn_y), (val_x, val_y)], 
                  early_stopping_rounds = stopping,
                  verbose               = verbose,
                  sample_weight         = 10**trn_y,
                  eval_sample_weight    = [10**trn_y, 10**val_y],
)
    clfs.append(clf)
    
    # find the best iteration
    best_iter = clf.best_iteration_

    # save predictions
    # labels = clf.classes_ ties the probability columns to the class order of
    # the fitted model instead of relying on the classes present in this fold
    preds_oof[val_idx, :] = clf.predict_proba(val_x, num_iteration = best_iter)
    valid_perf[n_fold]    = log_loss(val_y, preds_oof[val_idx, :], 
                                     sample_weight = 10**val_y, 
                                     labels        = clf.classes_)
    preds_test           += clf.predict_proba(test_x, num_iteration = best_iter) / folds.n_splits 

    # importance (collected per fold, concatenated once after the loop)
    fold_importance_df               = pd.DataFrame()
    fold_importance_df['Feature']    = features
    fold_importance_df['Importance'] = clf.feature_importances_
    fold_importance_df['Fold']       = n_fold + 1
    fold_importances.append(fold_importance_df)
    
    # print performance
    print('--------------------------------')
    print('FOLD%2d: LOGLOSS = %.6f' % (n_fold + 1, valid_perf[n_fold]))
    print('--------------------------------')
    print('')
        
    # clear memory
    del trn_x, trn_y, val_x, val_y
    gc.collect()
    
    
# long-format importance table: one row per (feature, fold)
importances = pd.concat(fold_importances, axis = 0)

# print overall performance    
cv_perf = np.mean(valid_perf)
print('--------------------------------')
print('MEAN LOGLOSS = %.6f' % cv_perf)
print('--------------------------------')

In [None]:
##### RECHECK PERFORMANCE  

# define the competition scorer
def competition_scorer(y_true, y_pred):
    """Weighted log-loss where every sample is weighted by 10**(its true label).

    Parameters
    ----------
    y_true : true labels; must support ``10**y_true`` element-wise.
    y_pred : predicted class probabilities, passed to ``log_loss`` unchanged.

    Returns
    -------
    tuple ``('wloss', loss_value, False)``. The third element is presumably an
    "is higher better" flag for a LightGBM-style custom metric - confirm before
    relying on it.
    """
    weights = 10**y_true
    wloss   = log_loss(y_true, y_pred, sample_weight = weights)
    return 'wloss', wloss, False

# score the complete set of out-of-fold predictions in one go
print(competition_scorer(y, preds_oof))


###### TRACKING RESULTS (5 folds, strat = True, seed = 23)



In [None]:
##### VARIABLE IMPORTANCE

# number of features to show
top_feats = 100

# rank the features by their mean importance across folds and keep the top ones
mean_importance = importances[['Feature', 'Importance']].groupby('Feature').mean()
cols = mean_importance.sort_values(by = 'Importance', ascending = False).index[:top_feats]
importance = importances.loc[importances.Feature.isin(cols)]
    
# plot variable importance
# order = ... makes the bars follow the same mean-importance ranking that was
# used to select them; previously the order came from sorting the per-fold rows,
# i.e. from the largest single-fold value of each feature
fig, ax = plt.subplots(figsize = (10, 15))
sns.barplot(x = 'Importance', y = 'Feature', data = importance, order = list(cols), ax = ax)
fig.tight_layout()

# save plot as pdf
fig.savefig('../var_importance.pdf')

# 4. SUBMISSION

In [None]:
# file name: <model>_<score digits>
model = 'lgb_v4'

# Format the CV score with a fixed number of decimals before cutting it up.
# The previous str(round(cv_perf, 6))[2:7] assumed the text starts with "0.":
# for a log-loss >= 1 the integer part was silently dropped, and short reprs
# such as "0.5" produced tags of varying length.
score_whole, score_frac = ('%.6f' % cv_perf).split('.')
if score_whole == '0':
    perf = score_frac[:5]                      # e.g. 0.123457 -> '12345' (same as before)
else:
    perf = score_whole + '_' + score_frac[:5]  # e.g. 1.234567 -> '1_23456'
name  = model + '_' + perf

In [None]:
# export OOF preds
#oof = pd.DataFrame({'id': train['id'], 'duration': preds_oof})
#oof.to_csv('../oof_preds/' + str(name) + '.csv', index = False)
#oof.head()

In [None]:
# export submission
#sub = pd.DataFrame({'id': test['id'], 'duration': preds_test})
#sub.to_csv('../submissions/' + str(name) + '.csv', index = False)
#sub.head()

In [None]:
# export OOF preds: request_id followed by one probability column per class
oof_ids = train['request_id'].reset_index(drop = True)
oof     = pd.concat([oof_ids, pd.DataFrame(preds_oof)], axis = 1)
oof.to_csv('../oof_preds/{}.csv'.format(name), index = False)
oof.head()

In [None]:
# export submission: request_id followed by one probability column per class
sub_ids = test['request_id'].reset_index(drop = True)
sub     = pd.concat([sub_ids, pd.DataFrame(preds_test)], axis = 1)
sub.to_csv('../submissions/{}.csv'.format(name), index = False)
sub.head()

In [None]:
##########
##########
########## SUBMIT TO QSCORE


import io, math, requests

# Get your token from qscore:
# 1. Go to https://qscore.datascience-olympics.com/
# 2. Chose the competition Data Science Olympics 2019
# 3. In the left menu click 'Submissions'
# 4. Your token is in the 'Submit from your Python Notebook' tab

def submit_prediction(df, sep=',', comment='', compression='gzip', token=None, **kwargs):
    """Upload a prediction table to the QScore submission API.

    The table is first written to 'temporary.dat' in the working directory
    (via ``df.to_csv``) and that file is then posted to the API.

    Parameters
    ----------
    df          : object with a pandas-style ``to_csv`` method (the submission table).
    sep         : field separator passed to ``to_csv``.
    comment     : free-text comment sent along with the submission.
    compression : compression passed to ``to_csv`` and reported to the API.
    token       : API token. When omitted, the ``QSCORE_TOKEN`` environment
                  variable is used. The token is no longer stored in the
                  notebook, because the notebook source is shared.
    **kwargs    : forwarded to ``df.to_csv`` (e.g. ``index = False``).

    Raises
    ------
    ValueError : no token was given and ``QSCORE_TOKEN`` is not set.
    Exception  : the API answered 429 (submissions too close together) or any
                 other non-200 status; the message carries the details.
    """
    token = token or os.environ.get('QSCORE_TOKEN')
    if not token:
        raise ValueError('No QScore token: pass token=... or set the QSCORE_TOKEN environment variable.')

    URL = 'https://qscore.datascience-olympics.com/api/submissions'
    tmp_path = 'temporary.dat'
    df.to_csv(tmp_path, sep=sep, compression=compression, **kwargs)

    # open the upload inside a context manager so the handle is always closed
    with open(tmp_path, 'rb') as datafile:
        r = requests.post(URL,
                          headers={'Authorization': 'Bearer {}'.format(token)},
                          files={'datafile': datafile},
                          data={'comment': comment, 'compression': compression})

    if r.status_code == 429:
        # the header value is divided by 1000 and rounded up to whole seconds
        wait_seconds = int(math.ceil(int(r.headers['x-rate-limit-remaining']) / 1000.0))
        raise Exception('Submissions are too close. Next submission is only allowed in {} seconds.'.format(wait_seconds))
    if r.status_code != 200:
        raise Exception(r.text)

In [None]:
# submit to QSCORE: the comment sent with the upload is '<comment> - <file name>'
comment = ''
submission_comment = '{} - {}'.format(comment, name)
submit_prediction(sub, sep = ',', index = False, comment = submission_comment)