In [1]:
%matplotlib inline
import os
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import trange, tqdm

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import lightgbm as lgb

from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
from scipy import stats

from fastai.datasets import Config

import warnings
warnings.filterwarnings("ignore")

base_path = Config.data_path()

In [5]:
# import lightgbm as lgb
# import xgboost as xgb
# import catboost as cb
import gc
from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample

# Competiton files setup

In [6]:
data_path = base_path/'LANL_Earthquake_Prediction'
competition_name = 'LANL-Earthquake-Prediction'
data_path

PosixPath('/home/krzysiek/.fastai/data/LANL_Earthquake_Prediction')

In [7]:
train_prcessed_segments_path = data_path/'train'/'processed_df'
test_prcessed_segments_path = data_path/'test_processed_df'

In [8]:
train_prcessed_segments_path.ls()

[PosixPath('/home/krzysiek/.fastai/data/LANL_Earthquake_Prediction/train/processed_df/no_overlap_time&freq_features.csv')]

In [9]:
test_prcessed_segments_path.ls()

[PosixPath('/home/krzysiek/.fastai/data/LANL_Earthquake_Prediction/test_processed_df/time&freq_features.csv')]

In [13]:
# training_set_step_10000_only_time = pd.read_csv(train_prcessed_segments_path/'step_10000_only_time_features.csv')
# training_set_no_overlap_only_time = pd.read_csv(train_prcessed_segments_path/'no_overlap_only_time_features.csv')
training_set_no_overlap_time_freq = pd.read_csv(train_prcessed_segments_path/'no_overlap_time&freq_features.csv')

In [14]:
test_set_time_freq = pd.read_csv(test_prcessed_segments_path/'time&freq_features.csv')

In [49]:
# training_set_no_overlap_only_time.drop(['Unnamed: 0'], axis=1, inplace=True)
test_set_time_freq['name']

0       seg_4743ab
1       seg_f86c41
2       seg_2d92f0
3       seg_1a8f2c
4       seg_8509db
5       seg_9b7ef8
6       seg_c654e7
7       seg_d59e4e
8       seg_25cca7
9       seg_22e509
10      seg_407b2b
11      seg_57908c
12      seg_268249
13      seg_15c9f9
14      seg_6f2222
15      seg_477c83
16      seg_32ad0f
17      seg_f7050a
18      seg_0968f1
19      seg_b36650
20      seg_61f504
21      seg_5254ce
22      seg_2c3203
23      seg_90c258
24      seg_5311d1
25      seg_62a403
26      seg_d398df
27      seg_1ef708
28      seg_533613
29      seg_218049
           ...    
2594    seg_41be7d
2595    seg_c42490
2596    seg_0c89ce
2597    seg_c901c0
2598    seg_b0a794
2599    seg_4c18e2
2600    seg_efc5fb
2601    seg_fde767
2602    seg_c08d36
2603    seg_a68007
2604    seg_1201e8
2605    seg_d516e3
2606    seg_f7290f
2607    seg_ab2a78
2608    seg_6d35cd
2609    seg_ff1a62
2610    seg_61219c
2611    seg_92be9f
2612    seg_5467c8
2613    seg_ef74dc
2614    seg_280863
2615    seg_

In [32]:
training_set_step_10000_only_time.drop(['Unnamed: 0'], axis=1, inplace=True)
training_set_step_10000_only_time.head()

Unnamed: 0,target,mean,std,max,min,mean_change_abs,mean_change_rate,abs_max,abs_min,std_first_50000,...,std_roll_mean_1000,max_roll_mean_1000,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,av_change_rate_roll_mean_1000,abs_max_roll_mean_1000
0,1.430797,4.884113,5.101106,104.0,-98.0,-8e-05,74836.577199,104.0,0.0,6.488552,...,0.295715,5.629,3.896,4.072,4.379,5.338,5.484,-1.704698e-06,74222.343443,5.629
1,1.4276,4.857127,4.324007,52.0,-56.0,7e-06,74890.988734,56.0,0.0,4.50636,...,0.290901,5.629,3.896,4.072,4.375,5.32,5.483,-1.90604e-06,74333.053867,5.629
2,1.425498,4.837627,5.334216,181.0,-154.0,-2.7e-05,74986.731761,181.0,0.0,4.680861,...,0.298291,5.667,3.412,4.069,4.36,5.32,5.483,-6.107383e-07,74474.087926,5.667
3,1.423396,4.81116,5.322245,181.0,-154.0,3.3e-05,74989.309697,181.0,0.0,4.748028,...,0.303731,5.667,3.412,4.069,4.36,5.32,5.483,-3.402685e-06,74346.540137,5.667
4,1.420198,4.792853,5.418203,181.0,-154.0,2.7e-05,75145.482449,181.0,0.0,4.725781,...,0.30277,5.667,3.412,4.069,4.359,5.32,5.483,-3.221477e-07,74551.37712,5.667


In [86]:
feature_scores_path = data_path/'feature_evaluation'/'single_fueature_scores.csv'
feature_scores = pd.read_csv(feature_scores_path)
# best_22_features = list(feature_scores.sort_values('score')[:22]['feature_name'])

In [77]:
sorted_features = list(feature_scores.sort_values('score')['feature_name'])
print(f"# of features: {len(sorted_features)}")

# of features: 137


In [78]:
best_70_features = sorted_features[:70]
best_22_features = sorted_features[:22]

In [15]:
# Current training and test sets
df_train = training_set_no_overlap_time_freq
df_test = test_set_time_freq

df_test_segment_names = df_test['name']
df_test.drop(['name'], axis=1, inplace=True)

In [80]:
df_test = df_test[best_22_features]
best_22_features.append('target')
df_train = df_train[best_22_features]
df_train.head()

Unnamed: 0,q05_roll_std_100,q05_roll_std_1000,q05_roll_std_10,q01_roll_std_100,q01_roll_std_10,q01_roll_std_1000,mad,MA_400MA_std_mean,q95,q95.1,...,ave_roll_std_100,ave_roll_std_1000,MA_700MA_std_mean,Hilbert_mean,q05,q01,q99,abs_q99,q95_roll_std_1000,target
0,2.475639,2.706474,1.636392,2.302634,1.264911,2.616094,3.263401,4.155806,11.0,11.0,...,4.05045,4.28859,4.229416,7.027028,-2.0,-8.0,18.0,20.0,8.185756,1.430797
1,2.475965,2.674879,1.646545,2.300285,1.286684,2.612482,3.574302,4.608579,12.0,12.0,...,4.436359,4.843486,4.735352,7.380383,-2.0,-11.0,21.0,24.0,10.544982,1.391499
2,2.538591,2.761534,1.686548,2.374613,1.316561,2.660178,3.948411,5.12256,13.0,13.0,...,4.917334,5.423013,5.28318,8.01693,-3.0,-15.0,26.0,30.0,14.845834,1.353196
3,2.496442,2.716991,1.649916,2.330539,1.269296,2.624962,3.647117,4.698889,12.0,12.0,...,4.533343,4.93928,4.826369,7.60685,-2.0,-12.0,22.0,26.0,11.715642,1.313798
4,2.491521,2.719174,1.646545,2.314731,1.269296,2.628699,3.826052,4.910515,12.0,12.0,...,4.761149,5.121868,5.023478,7.895403,-2.0,-15.0,26.0,32.0,13.923676,1.2744


In [33]:
# Switch only train set
df_train = training_set_step_10000_only_time

In [9]:
print(f"Train length: {len(df_train)}")
print(f"Test length: {len(df_test)}")

Train length: 4178
Test length: 2624


We will use LightGBM as a model to predict target for test dataset. This is a tree-based model, which cannot see frequency of the feature (number of occurrences of particular value in a feature). It turns out that this is important in this case, so frequencies needs to be added as separate feature. 

# Model Training
## 1st Layer of ensamble

In [20]:
train_data = df_train.drop('target', axis=1)
targets = df_train['target']

## Scaling
scaler.fit(train_data)
train_data = pd.DataFrame(scaler.transform(train_data))
df_test = pd.DataFrame(scaler.transform(df_test))
##

In [24]:
param = {
    'learning_rate': 0.055,
    'num_leaves': 8,
    'metric':'mean_absolute_error',
    'boost_from_average':'false',
    'feature_fraction': 0.7, # This is important; we must use ALL features in every iteration to make sure that feature freq will be used
    'max_depth': 3,
    'objective': 'regression',
#     'verbosity': -10
    'boosting': 'gbdt',
    'lambda_l1': 0.45,
    'lambda_l2': 0.02
}

In [21]:
param = lgbm_params

In [23]:
%%time

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

nbr_of_folds = 10
nbr_of_rounds = 10000

folds = KFold(
    n_splits=nbr_of_folds,
    shuffle=True,
    random_state=2019
)

scaler = StandardScaler()

out_of_fold_predictions = np.zeros(len(df_train))
test_predictions = np.zeros(len(df_test))


folds_predictions = np.zeros(len(df_test))
training_results = dict()

for fold_nbr, (train_idx, valid_idx) in enumerate(folds.split(train_data, targets)):
    fold_train_data = train_data.iloc[train_idx]
    fold_train_target = targets.iloc[train_idx]
        
    fold_valid_data = train_data.iloc[valid_idx]
    fold_valid_target = targets.iloc[valid_idx]

    lgb_fold_train = lgb.Dataset(fold_train_data, label=fold_train_target)
    lgb_fold_valid = lgb.Dataset(fold_valid_data, label=fold_valid_target, reference=lgb_fold_train)

    model = lgb.train(
        param, 
        lgb_fold_train, 
        nbr_of_rounds, 
        valid_sets=[lgb_fold_train, lgb_fold_valid], 
       early_stopping_rounds=500,
        verbose_eval=False, 
        evals_result=training_results,
#        verbose_eval=5000
    )
    
#     print(training_results)
#     scores = training_results['valid_1']['l1']
#     best_score = min(scores)
#     best_round = scores.index(min(scores))
#     print(f"Fold #{fold_nbr}: Best l1: {best_score} ({best_round} iteration)")

    out_of_fold_predictions[valid_idx] = model.predict(fold_valid_data) #, num_iteration=best_round)
    folds_predictions += model.predict(df_test) #, num_iteration=best_round)

test_predictions = folds_predictions/nbr_of_folds

print(f"Final score: {mean_absolute_error(targets.values, out_of_fold_predictions)}")

Final score: 2.0322617120741273
CPU times: user 31.7 s, sys: 112 ms, total: 31.8 s
Wall time: 7.96 s


In [24]:
test_predictions

array([7.19969 , 4.450899, 9.509817, 7.613244, ..., 5.66776 , 6.436429, 4.000546, 6.882363])

In [38]:
np.mean(targets)

5.663420702760496

Best 22 features

Without scaler: 2.07108839172104 (LB 1.614)
With scaler: 2.0674590671564608 

Best 70 features

Without scaler: 2.0766704829894915 (LB 1.604)
With scaler: 2.0746327066782286

All features

Without scaler:  2.0640357531770532 (LB 1.554?)
With scaler: 2.0634338328764064

In [16]:
#GLOBAL HYPEROPT PARAMETERS
NUM_EVALS = 1000 #number of hyperopt evaluation rounds
N_FOLDS = 5 #number of cross-validation folds on data in each evaluation round

#LIGHTGBM PARAMETERS
LGBM_MAX_LEAVES = 2**11 #maximum number of leaves per tree for LightGBM
LGBM_MAX_DEPTH = 25 #maximum tree depth for LightGBM
EVAL_METRIC_LGBM_REG = 'mae' #LightGBM regression metric. Note that 'rmse' is more commonly used 
EVAL_METRIC_LGBM_CLASS = 'auc'#LightGBM classification metric

#XGBOOST PARAMETERS
XGB_MAX_LEAVES = 2**12 #maximum number of leaves when using histogram splitting
XGB_MAX_DEPTH = 25 #maximum tree depth for XGBoost
EVAL_METRIC_XGB_REG = 'mae' #XGBoost regression metric
EVAL_METRIC_XGB_CLASS = 'auc' #XGBoost classification metric

#CATBOOST PARAMETERS
CB_MAX_DEPTH = 8 #maximum tree depth in CatBoost
OBJECTIVE_CB_REG = 'MAE' #CatBoost regression metric
OBJECTIVE_CB_CLASS = 'Logloss' #CatBoost classification metric

#OPTIONAL OUTPUT
BEST_SCORE = 0

def quick_hyperopt(data, labels, package='lgbm', num_evals=NUM_EVALS, diagnostic=False):
    
    #==========
    #LightGBM
    #==========
    
    if package=='lgbm':
        
        print('Running {} rounds of LightGBM parameter optimisation:'.format(num_evals))
        #clear space
        gc.collect()
        
        integer_params = ['max_depth',
                         'num_leaves',
                          'max_bin',
                         'min_data_in_leaf',
                         'min_data_in_bin']
        
        def objective(space_params):
            
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])
            
            #extract nested conditional parameters
            if space_params['boosting']['boosting'] == 'goss':
                top_rate = space_params['boosting'].get('top_rate')
                other_rate = space_params['boosting'].get('other_rate')
                #0 <= top_rate + other_rate <= 1
                top_rate = max(top_rate, 0)
                top_rate = min(top_rate, 0.5)
                other_rate = max(other_rate, 0)
                other_rate = min(other_rate, 0.5)
                space_params['top_rate'] = top_rate
                space_params['other_rate'] = other_rate
            
            subsample = space_params['boosting'].get('subsample', 1.0)
            space_params['boosting'] = space_params['boosting']['boosting']
            space_params['subsample'] = subsample
            
            #for classification, set stratified=True and metrics=EVAL_METRIC_LGBM_CLASS
            cv_results = lgb.cv(space_params, train, nfold = N_FOLDS, stratified=False,
                                early_stopping_rounds=100, metrics=EVAL_METRIC_LGBM_REG, seed=42)
            
            best_loss = cv_results['l1-mean'][-1] #'l2-mean' for rmse
            #for classification, comment out the line above and uncomment the line below:
            #best_loss = 1 - cv_results['auc-mean'][-1]
            #if necessary, replace 'auc-mean' with '[your-preferred-metric]-mean'
            return{'loss':best_loss, 'status': STATUS_OK }
        
        train = lgb.Dataset(data, labels)
                
        #integer and string parameters, used with hp.choice()
        boosting_list = [{'boosting': 'gbdt',
                          'subsample': hp.uniform('subsample', 0.5, 1)},
                         {'boosting': 'goss',
                          'subsample': 1.0,
                         'top_rate': hp.uniform('top_rate', 0, 0.5),
                         'other_rate': hp.uniform('other_rate', 0, 0.5)}] #if including 'dart', make sure to set 'n_estimators'
        metric_list = ['MAE', 'RMSE'] 
        #for classification comment out the line above and uncomment the line below
        #metric_list = ['auc'] #modify as required for other classification metrics
        objective_list_reg = ['huber', 'gamma', 'fair', 'tweedie']
        objective_list_class = ['binary', 'cross_entropy']
        #for classification set objective_list = objective_list_class
        objective_list = objective_list_reg

        space ={'boosting' : hp.choice('boosting', boosting_list),
                'num_leaves' : hp.quniform('num_leaves', 2, LGBM_MAX_LEAVES, 1),
                'max_depth': hp.quniform('max_depth', 2, LGBM_MAX_DEPTH, 1),
                'max_bin': hp.quniform('max_bin', 32, 255, 1),
                'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 256, 1),
                'min_data_in_bin': hp.quniform('min_data_in_bin', 1, 256, 1),
                'min_gain_to_split' : hp.quniform('min_gain_to_split', 0.1, 5, 0.01),
                'lambda_l1' : hp.uniform('lambda_l1', 0, 5),
                'lambda_l2' : hp.uniform('lambda_l2', 0, 5),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'metric' : hp.choice('metric', metric_list),
                'objective' : hp.choice('objective', objective_list),
                'feature_fraction' : hp.quniform('feature_fraction', 0.5, 1, 0.01),
                'bagging_fraction' : hp.quniform('bagging_fraction', 0.5, 1, 0.01)
            }
        
        #optional: activate GPU for LightGBM
        #follow compilation steps here:
        #https://www.kaggle.com/vinhnguyen/gpu-acceleration-for-lightgbm/
        #then uncomment lines below:
        #space['device'] = 'gpu'
        #space['gpu_platform_id'] = 0,
        #space['gpu_device_id'] =  0

        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
                
        #fmin() will return the index of values chosen from the lists/arrays in 'space'
        #to obtain actual values, index values are used to subset the original lists/arrays
        best['boosting'] = boosting_list[best['boosting']]['boosting']#nested dict, index twice
        best['metric'] = metric_list[best['metric']]
        best['objective'] = objective_list[best['objective']]
                
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    #==========
    #XGBoost
    #==========
    
    if package=='xgb':
        
        print('Running {} rounds of XGBoost parameter optimisation:'.format(num_evals))
        #clear space
        gc.collect()
        
        integer_params = ['max_depth']
        
        def objective(space_params):
            
            for param in integer_params:
                space_params[param] = int(space_params[param])
                
            #extract multiple nested tree_method conditional parameters
            #libera te tutemet ex inferis
            if space_params['tree_method']['tree_method'] == 'hist':
                max_bin = space_params['tree_method'].get('max_bin')
                space_params['max_bin'] = int(max_bin)
                if space_params['tree_method']['grow_policy']['grow_policy']['grow_policy'] == 'depthwise':
                    grow_policy = space_params['tree_method'].get('grow_policy').get('grow_policy').get('grow_policy')
                    space_params['grow_policy'] = grow_policy
                    space_params['tree_method'] = 'hist'
                else:
                    max_leaves = space_params['tree_method']['grow_policy']['grow_policy'].get('max_leaves')
                    space_params['grow_policy'] = 'lossguide'
                    space_params['max_leaves'] = int(max_leaves)
                    space_params['tree_method'] = 'hist'
            else:
                space_params['tree_method'] = space_params['tree_method'].get('tree_method')
                
            #for classification replace EVAL_METRIC_XGB_REG with EVAL_METRIC_XGB_CLASS
            cv_results = xgb.cv(space_params, train, nfold=N_FOLDS, metrics=[EVAL_METRIC_XGB_REG],
                             early_stopping_rounds=100, stratified=False, seed=42)
            
            best_loss = cv_results['test-mae-mean'].iloc[-1] #or 'test-rmse-mean' if using RMSE
            #for classification, comment out the line above and uncomment the line below:
            #best_loss = 1 - cv_results['test-auc-mean'].iloc[-1]
            #if necessary, replace 'test-auc-mean' with 'test-[your-preferred-metric]-mean'
            return{'loss':best_loss, 'status': STATUS_OK }
        
        train = xgb.DMatrix(data, labels)
        
        #integer and string parameters, used with hp.choice()
        boosting_list = ['gbtree', 'gblinear'] #if including 'dart', make sure to set 'n_estimators'
        metric_list = ['MAE', 'RMSE'] 
        #for classification comment out the line above and uncomment the line below
        #metric_list = ['auc']
        #modify as required for other classification metrics classification
        
        tree_method = [{'tree_method' : 'exact'},
               {'tree_method' : 'approx'},
               {'tree_method' : 'hist',
                'max_bin': hp.quniform('max_bin', 2**3, 2**7, 1),
                'grow_policy' : {'grow_policy': {'grow_policy':'depthwise'},
                                'grow_policy' : {'grow_policy':'lossguide',
                                                  'max_leaves': hp.quniform('max_leaves', 32, XGB_MAX_LEAVES, 1)}}}]
        
        #if using GPU, replace 'exact' with 'gpu_exact' and 'hist' with
        #'gpu_hist' in the nested dictionary above
        
        objective_list_reg = ['reg:linear', 'reg:gamma', 'reg:tweedie']
        objective_list_class = ['reg:logistic', 'binary:logistic']
        #for classification change line below to 'objective_list = objective_list_class'
        objective_list = objective_list_reg
        
        space ={'boosting' : hp.choice('boosting', boosting_list),
                'tree_method' : hp.choice('tree_method', tree_method),
                'max_depth': hp.quniform('max_depth', 2, XGB_MAX_DEPTH, 1),
                'reg_alpha' : hp.uniform('reg_alpha', 0, 5),
                'reg_lambda' : hp.uniform('reg_lambda', 0, 5),
                'min_child_weight' : hp.uniform('min_child_weight', 0, 5),
                'gamma' : hp.uniform('gamma', 0, 5),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'eval_metric' : hp.choice('eval_metric', metric_list),
                'objective' : hp.choice('objective', objective_list),
                'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1, 0.01),
                'colsample_bynode' : hp.quniform('colsample_bynode', 0.1, 1, 0.01),
                'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),
                'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
                'nthread' : -1
            }
        
        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
        
        best['tree_method'] = tree_method[best['tree_method']]['tree_method']
        best['boosting'] = boosting_list[best['boosting']]
        best['eval_metric'] = metric_list[best['eval_metric']]
        best['objective'] = objective_list[best['objective']]
        
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        if 'max_leaves' in best:
            best['max_leaves'] = int(best['max_leaves'])
        if 'max_bin' in best:
            best['max_bin'] = int(best['max_bin'])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    #==========
    #CatBoost
    #==========
    
    if package=='cb':
        
        print('Running {} rounds of CatBoost parameter optimisation:'.format(num_evals))
        
        #clear memory 
        gc.collect()
            
        integer_params = ['depth',
                          #'one_hot_max_size', #for categorical data
                          'min_data_in_leaf',
                          'max_bin']
        
        def objective(space_params):
                        
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])
                
            #extract nested conditional parameters
            if space_params['bootstrap_type']['bootstrap_type'] == 'Bayesian':
                bagging_temp = space_params['bootstrap_type'].get('bagging_temperature')
                space_params['bagging_temperature'] = bagging_temp
                
            if space_params['grow_policy']['grow_policy'] == 'LossGuide':
                max_leaves = space_params['grow_policy'].get('max_leaves')
                space_params['max_leaves'] = int(max_leaves)
                
            space_params['bootstrap_type'] = space_params['bootstrap_type']['bootstrap_type']
            space_params['grow_policy'] = space_params['grow_policy']['grow_policy']
                           
            #random_strength cannot be < 0
            space_params['random_strength'] = max(space_params['random_strength'], 0)
            #fold_len_multiplier cannot be < 1
            space_params['fold_len_multiplier'] = max(space_params['fold_len_multiplier'], 1)
                       
            #for classification set stratified=True
            cv_results = cb.cv(train, space_params, fold_count=N_FOLDS, 
                             early_stopping_rounds=25, stratified=False, partition_random_seed=42)
           
            best_loss = cv_results['test-MAE-mean'].iloc[-1] #'test-RMSE-mean' for RMSE
            #for classification, comment out the line above and uncomment the line below:
            #best_loss = cv_results['test-Logloss-mean'].iloc[-1]
            #if necessary, replace 'test-Logloss-mean' with 'test-[your-preferred-metric]-mean'
            
            return{'loss':best_loss, 'status': STATUS_OK}
        
        train = cb.Pool(data, labels.astype('float32'))
        
        #integer and string parameters, used with hp.choice()
        bootstrap_type = [{'bootstrap_type':'Poisson'}, 
                           {'bootstrap_type':'Bayesian',
                            'bagging_temperature' : hp.loguniform('bagging_temperature', np.log(1), np.log(50))},
                          {'bootstrap_type':'Bernoulli'}] 
        LEB = ['No', 'AnyImprovement', 'Armijo'] #remove 'Armijo' if not using GPU
        #score_function = ['Correlation', 'L2', 'NewtonCorrelation', 'NewtonL2']
        grow_policy = [{'grow_policy':'SymmetricTree'},
                       {'grow_policy':'Depthwise'},
                       {'grow_policy':'Lossguide',
                        'max_leaves': hp.quniform('max_leaves', 2, 32, 1)}]
        eval_metric_list_reg = ['MAE', 'RMSE', 'Poisson']
        eval_metric_list_class = ['Logloss', 'AUC', 'F1']
        #for classification change line below to 'eval_metric_list = eval_metric_list_class'
        eval_metric_list = eval_metric_list_reg
                
        space ={'depth': hp.quniform('depth', 2, CB_MAX_DEPTH, 1),
                'max_bin' : hp.quniform('max_bin', 1, 32, 1), #if using CPU just set this to 254
                'l2_leaf_reg' : hp.uniform('l2_leaf_reg', 0, 5),
                'min_data_in_leaf' : hp.quniform('min_data_in_leaf', 1, 50, 1),
                'random_strength' : hp.loguniform('random_strength', np.log(0.005), np.log(5)),
                #'one_hot_max_size' : hp.quniform('one_hot_max_size', 2, 16, 1), #uncomment if using categorical features
                'bootstrap_type' : hp.choice('bootstrap_type', bootstrap_type),
                'learning_rate' : hp.uniform('learning_rate', 0.05, 0.25),
                'eval_metric' : hp.choice('eval_metric', eval_metric_list),
                'objective' : OBJECTIVE_CB_REG,
                #'score_function' : hp.choice('score_function', score_function), #crashes kernel - reason unknown
                'leaf_estimation_backtracking' : hp.choice('leaf_estimation_backtracking', LEB),
                'grow_policy': hp.choice('grow_policy', grow_policy),
                #'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),# CPU only
                'fold_len_multiplier' : hp.loguniform('fold_len_multiplier', np.log(1.01), np.log(2.5)),
                'od_type' : 'Iter',
                'od_wait' : 25,
                'task_type' : 'GPU',
                'verbose' : 0
            }
        
        #optional: run CatBoost without GPU
        #uncomment line below
        #space['task_type'] = 'CPU'
            
        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
        
        #unpack nested dicts first
        best['bootstrap_type'] = bootstrap_type[best['bootstrap_type']]['bootstrap_type']
        best['grow_policy'] = grow_policy[best['grow_policy']]['grow_policy']
        best['eval_metric'] = eval_metric_list[best['eval_metric']]
        
        #best['score_function'] = score_function[best['score_function']] 
        #best['leaf_estimation_method'] = LEM[best['leaf_estimation_method']] #CPU only
        best['leaf_estimation_backtracking'] = LEB[best['leaf_estimation_backtracking']]        
        
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        if 'max_leaves' in best:
            best['max_leaves'] = int(best['max_leaves'])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    else:
        print('Package not recognised. Please use "lgbm" for LightGBM, "xgb" for XGBoost or "cb" for CatBoost.')    

In [17]:
data = df_train.drop('target', axis=1)
labels = df_train['target']
lgbm_params = quick_hyperopt(data, labels, 'lgbm', 2500)

Running 2500 rounds of LightGBM parameter optimisation:
100%|██████████| 2500/2500 [46:54<00:00,  1.55s/it, best loss: 2.024108230634918] 
{bagging_fraction: 0.6900000000000001
boosting: goss
feature_fraction: 0.73
lambda_l1: 2.405766598405247
lambda_l2: 1.8511298264546527
learning_rate: 0.1747309888380998
max_bin: 218
max_depth: 16
metric: RMSE
min_data_in_bin: 113
min_data_in_leaf: 28
min_gain_to_split: 3.25
num_leaves: 431
objective: huber
other_rate: 0.27957018732110944
top_rate: 0.03663841459503298}


## 2nd Layer of ensamble

In [109]:
from sklearn.linear_model import LogisticRegression

X_train, X_valid, Y_train, Y_valid = train_test_split(
    out_of_fold_predictions, 
    targets.values, 
    test_size=0.15, 
    random_state=2019
)

ensamble_second_layer = LogisticRegression(n_jobs=-1)
ensamble_second_layer.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)

In [118]:
probs = ensamble_second_layer.predict_proba(X_valid)
roc_auc_score(Y_valid, probs[:,1])

0.9040947053429751

In [115]:
import statsmodels.api as sm

logr = sm.Logit(targets, out_of_fold_predictions)
logr = logr.fit(disp=0)
ensemble_preds = logr.predict(out_of_fold_predictions)
ensemble_auc = roc_auc_score(targets, ensemble_preds)  
print('##################')
print('Combined Model with magic Val_AUC=',round(ensemble_auc,5))

##################
Combined Model with magic Val_AUC= 0.91402


In [117]:
lgb_ensemble_train = lgb.Dataset(X_train, label=Y_train)
lgb_ensemble_valid = lgb.Dataset(X_valid, label=Y_valid, reference=lgb_ensemble_train)

model = lgb.train(
    param, 
    lgb_ensemble_train, 
    50000, 
    valid_sets=[lgb_ensemble_train, lgb_ensemble_valid], 
    early_stopping_rounds=500,
#     verbose_eval=False, 
#             evals_result=training_results
    verbose_eval=1000
    )

# ensemble_predicions = model.predict(fold_valid_data)
# folds_predictions += model.predict(df_test, num_iteration=best_round)


Training until validation scores don't improve for 500 rounds.
[1000]	training's auc: 0.893676	valid_1's auc: 0.878561
[2000]	training's auc: 0.912475	valid_1's auc: 0.895864
[3000]	training's auc: 0.921207	valid_1's auc: 0.902603
[4000]	training's auc: 0.926821	valid_1's auc: 0.906273
[5000]	training's auc: 0.931289	valid_1's auc: 0.908226
[6000]	training's auc: 0.935035	valid_1's auc: 0.909387
[7000]	training's auc: 0.938219	valid_1's auc: 0.910122
[8000]	training's auc: 0.94106	valid_1's auc: 0.910847
[9000]	training's auc: 0.943547	valid_1's auc: 0.911536
[10000]	training's auc: 0.945812	valid_1's auc: 0.912033
[11000]	training's auc: 0.947898	valid_1's auc: 0.912439
[12000]	training's auc: 0.949817	valid_1's auc: 0.912743
[13000]	training's auc: 0.951595	valid_1's auc: 0.912847
Early stopping, best iteration is:
[12584]	training's auc: 0.950872	valid_1's auc: 0.91289


# Final prediction and submission to Kaggle

In [25]:
submission_df = pd.DataFrame({'seg_id': df_test_segment_names, 'time_to_failure': test_predictions})

In [26]:
submission_df.head()

Unnamed: 0,seg_id,time_to_failure
0,seg_4743ab,7.19969
1,seg_f86c41,4.450899
2,seg_2d92f0,9.509817
3,seg_1a8f2c,7.613244
4,seg_8509db,5.202186


In [27]:
submission_file_name = "time_and_freq_features_new_parameters_fixed"

submission_path = data_path/'submissions'
submission_path.mkdir(exist_ok=True)
submission_file = submission_path/f"{submission_file_name}.csv"
submission_df.to_csv(submission_file, index=False)

In [28]:
# Only 2 submission allowed per day!
!kaggle competitions submit {competition_name} -f {submission_file} -m "LGBM (optimized, fixed scaling), all time and freq features, with scaling"

100%|██████████████████████████████████████| 74.6k/74.6k [00:08<00:00, 8.78kB/s]
403 - Your team has used its submission allowance (2 of 2). This resets at midnight UTC (2.1 hours from now).
