In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
import seaborn as sns
from pprint import pprint
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

import pickle
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from skopt import gp_minimize

import lightgbm as lgb
from skopt import BayesSearchCV
from sklearn.svm import SVR

The purpose of this notebook is to examine the tsfresh 788 features created from the LANL time series
We want to remove the features that dont have much value (e.g., mostly 0)

In [2]:
X_train1 = pd.read_pickle("Xtf_train1.pkl")
X_train2 = pd.read_pickle("Xtf_train2.pkl")
X_train3 = pd.read_pickle("Xtf_train3.pkl")
X_train4 = pd.read_pickle("Xtf_train4.pkl")

X1_train = pd.concat([X_train1, X_train2, X_train3, X_train4], ignore_index=True)
X1_train.reset_index(drop=True)
print(X1_train.shape)


(4000, 788)


In [3]:
temp =X1_train.astype(bool).sum(axis=0)
# look for feature % that has valid values
temp = 100*temp/X1_train.shape[0]
print(temp.shape)

(788,)


In [4]:
len(temp[temp<50])

71

In [5]:
feature2drop = temp[temp<50].index.tolist()

In [6]:
X1_train_reduced = X1_train.drop(feature2drop,axis=1)
print(X1_train_reduced.shape)

(4000, 717)


In [7]:
y1_train = pd.read_pickle("y_train1.pkl")[0:4000]
y1_train.reset_index(drop=True)
print(y1_train.shape)

(4000,)


In [8]:
XX1_train, XX1_test, yy1_train, yy1_test = train_test_split(X1_train_reduced , y1_train, test_size=0.2, random_state=42)
print(XX1_train.shape, yy1_train.shape)
print(XX1_test.shape, yy1_test.shape)

(3200, 717) (3200,)
(800, 717) (800,)


---

In [9]:
def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    
    
    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest MAE: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

In [10]:
# reference
# https://www.kaggle.com/nanomathias/bayesian-optimization-of-xgboost-lb-0-9769

bayes_cv_tuner = BayesSearchCV(
    estimator = lgb.LGBMRegressor(
        objective='regression',
        metric='mae',
        num_threads=4,
    ),
    search_spaces = {
        'learning_rate': (0.01,0.05, 'uniform'),
        'num_leaves': (2, 256),      
        'max_depth': (2, 20),
        'subsample': (0.01, 1.0, 'uniform'),
        'subsample_freq': (0, 10),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'min_child_weight': (0, 10),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'n_estimators': (10, 5000),
        'feature_fraction': (0.5, 1.0),
        'bagging_fraction': (0.5, 1.0)
    },    
    scoring = 'neg_mean_absolute_error',
    cv = TimeSeriesSplit(n_splits=10),
    n_iter = 30,   
    verbose = 0,
    refit = True,
    random_state = 42
)

In [11]:
result = bayes_cv_tuner.fit(XX1_train, yy1_train, callback=status_print)

Model #1
Best MAE: -2.1619
Best params: {'bagging_fraction': 0.705051979426657, 'colsample_bytree': 0.7304484857455519, 'feature_fraction': 0.966433999423917, 'learning_rate': 0.02263198373948195, 'max_depth': 14, 'min_child_weight': 4, 'n_estimators': 1761, 'num_leaves': 190, 'reg_alpha': 5.497557739289786e-07, 'reg_lambda': 0.05936070635912049, 'subsample': 0.5544643023916863, 'subsample_freq': 1}

Model #2
Best MAE: -2.1554
Best params: {'bagging_fraction': 0.9186941777766422, 'colsample_bytree': 0.8844821246070537, 'feature_fraction': 0.6517050549420875, 'learning_rate': 0.04804895626373318, 'max_depth': 18, 'min_child_weight': 1, 'n_estimators': 700, 'num_leaves': 92, 'reg_alpha': 0.0005266983003701547, 'reg_lambda': 276.5424475574225, 'subsample': 0.6336020558163782, 'subsample_freq': 10}

Model #3
Best MAE: -2.1443
Best params: {'bagging_fraction': 0.7224162561505759, 'colsample_bytree': 0.9195352964526833, 'feature_fraction': 0.5524295792763518, 'learning_rate': 0.0273333120748


Model #23
Best MAE: -2.1385
Best params: {'bagging_fraction': 1.0, 'colsample_bytree': 1.0, 'feature_fraction': 0.5, 'learning_rate': 0.01, 'max_depth': 20, 'min_child_weight': 0, 'n_estimators': 5000, 'num_leaves': 256, 'reg_alpha': 1e-09, 'reg_lambda': 0.01985262260195755, 'subsample': 0.01, 'subsample_freq': 0}

Model #24
Best MAE: -2.1385
Best params: {'bagging_fraction': 1.0, 'colsample_bytree': 1.0, 'feature_fraction': 0.5, 'learning_rate': 0.01, 'max_depth': 20, 'min_child_weight': 0, 'n_estimators': 5000, 'num_leaves': 256, 'reg_alpha': 1e-09, 'reg_lambda': 0.01985262260195755, 'subsample': 0.01, 'subsample_freq': 0}

Model #25
Best MAE: -2.1385
Best params: {'bagging_fraction': 1.0, 'colsample_bytree': 1.0, 'feature_fraction': 0.5, 'learning_rate': 0.01, 'max_depth': 20, 'min_child_weight': 0, 'n_estimators': 5000, 'num_leaves': 256, 'reg_alpha': 1e-09, 'reg_lambda': 0.01985262260195755, 'subsample': 0.01, 'subsample_freq': 0}

Model #26
Best MAE: -2.1385
Best params: {'baggi

In [12]:
print(result.best_score_)
print(result.best_params_)

-2.138540201525192
{'bagging_fraction': 1.0, 'colsample_bytree': 1.0, 'feature_fraction': 0.5, 'learning_rate': 0.01, 'max_depth': 20, 'min_child_weight': 0, 'n_estimators': 5000, 'num_leaves': 256, 'reg_alpha': 1e-09, 'reg_lambda': 0.01985262260195755, 'subsample': 0.01, 'subsample_freq': 0}


In [13]:
lgb_model1 = result.best_estimator_

In [14]:
yy_test_hat = lgb_model1.predict(XX1_test)
lgb_MAE1 = mean_absolute_error(yy1_test, yy_test_hat)
print(lgb_MAE1)

2.006826421423026


with open('lgb1_bo.pkl', 'wb') as fid:
    pickle.dump(lgb_model1, fid)

In [15]:
# add a default lgb model
lgb_model1d = lgb.LGBMRegressor(boosting_type='gbdt', object='regression', random_state=42)
lgb_model1d.fit(XX1_train, yy1_train, eval_metric='mae')
yy_test_hat = lgb_model1d.predict(XX1_test)
lgb_MAE1d = mean_absolute_error(yy1_test, yy_test_hat)
print(lgb_MAE1d)

2.0608480150444333


with open('lgb1_default.pkl', 'wb') as fid:
    pickle.dump(lgb_model1d, fid)

---

Second LGB Model with 2nd 4000 sample data

In [16]:
X_train5 = pd.read_pickle("Xtf_train11.pkl")
X_train6 = pd.read_pickle("Xtf_train22.pkl")
X_train7 = pd.read_pickle("Xtf_train33.pkl")
X_train8 = pd.read_pickle("Xtf_train44.pkl")
X2_train = pd.concat([X_train5, X_train6, X_train7, X_train8], ignore_index=True)
X2_train.reset_index(drop=True)
print(X2_train.shape)
X2_train_reduced = X2_train.drop(feature2drop,axis=1)
print(X2_train_reduced.shape)

(4000, 788)
(4000, 717)


In [17]:
y2_train = pd.read_pickle("y_train2.pkl")[0:4000]
y2_train.reset_index(drop=True)
print(y2_train.shape)

(4000,)


In [18]:
XX2_train, XX2_test, yy2_train, yy2_test = train_test_split(X2_train_reduced , y2_train, test_size=0.2, random_state=42)
print(XX2_train.shape, yy2_train.shape)
print(XX2_test.shape, yy2_test.shape)

(3200, 717) (3200,)
(800, 717) (800,)


In [19]:
result = bayes_cv_tuner.fit(XX2_train, yy2_train, callback=status_print)

Model #1
Best MAE: -2.1604
Best params: {'bagging_fraction': 0.705051979426657, 'colsample_bytree': 0.7304484857455519, 'feature_fraction': 0.966433999423917, 'learning_rate': 0.02263198373948195, 'max_depth': 14, 'min_child_weight': 4, 'n_estimators': 1761, 'num_leaves': 190, 'reg_alpha': 5.497557739289786e-07, 'reg_lambda': 0.05936070635912049, 'subsample': 0.5544643023916863, 'subsample_freq': 1}

Model #2
Best MAE: -2.1604
Best params: {'bagging_fraction': 0.705051979426657, 'colsample_bytree': 0.7304484857455519, 'feature_fraction': 0.966433999423917, 'learning_rate': 0.02263198373948195, 'max_depth': 14, 'min_child_weight': 4, 'n_estimators': 1761, 'num_leaves': 190, 'reg_alpha': 5.497557739289786e-07, 'reg_lambda': 0.05936070635912049, 'subsample': 0.5544643023916863, 'subsample_freq': 1}

Model #3
Best MAE: -2.1482
Best params: {'bagging_fraction': 0.7224162561505759, 'colsample_bytree': 0.9195352964526833, 'feature_fraction': 0.5524295792763518, 'learning_rate': 0.027333312074


Model #22
Best MAE: -2.1367
Best params: {'bagging_fraction': 0.8670140089927842, 'colsample_bytree': 0.9399760402267441, 'feature_fraction': 0.5818035893192751, 'learning_rate': 0.017537011670617206, 'max_depth': 16, 'min_child_weight': 4, 'n_estimators': 2301, 'num_leaves': 138, 'reg_alpha': 0.011683028450342707, 'reg_lambda': 0.0048879464985534336, 'subsample': 0.5930070264428381, 'subsample_freq': 6}

Model #23
Best MAE: -2.1367
Best params: {'bagging_fraction': 0.8670140089927842, 'colsample_bytree': 0.9399760402267441, 'feature_fraction': 0.5818035893192751, 'learning_rate': 0.017537011670617206, 'max_depth': 16, 'min_child_weight': 4, 'n_estimators': 2301, 'num_leaves': 138, 'reg_alpha': 0.011683028450342707, 'reg_lambda': 0.0048879464985534336, 'subsample': 0.5930070264428381, 'subsample_freq': 6}

Model #24
Best MAE: -2.1367
Best params: {'bagging_fraction': 0.8670140089927842, 'colsample_bytree': 0.9399760402267441, 'feature_fraction': 0.5818035893192751, 'learning_rate': 0.

In [20]:
print(result.best_score_)
print(result.best_params_)

-2.1367227700971965
{'bagging_fraction': 0.8670140089927842, 'colsample_bytree': 0.9399760402267441, 'feature_fraction': 0.5818035893192751, 'learning_rate': 0.017537011670617206, 'max_depth': 16, 'min_child_weight': 4, 'n_estimators': 2301, 'num_leaves': 138, 'reg_alpha': 0.011683028450342707, 'reg_lambda': 0.0048879464985534336, 'subsample': 0.5930070264428381, 'subsample_freq': 6}


In [21]:
lgb_model2 = result.best_estimator_

In [22]:
yy_test_hat = lgb_model2.predict(XX2_test)
lgb_MAE2 = mean_absolute_error(yy2_test, yy_test_hat)
print(lgb_MAE2)

2.0325702406591044


with open('lgb2_bo.pkl', 'wb') as fid:
    pickle.dump(lgb_model2, fid)

In [23]:
# add a default lgb model
lgb_model2d = lgb.LGBMRegressor(boosting_type='gbdt', object='regression', random_state=42)
lgb_model2d.fit(XX2_train, yy2_train, eval_metric='mae')
yy_test_hat = lgb_model2d.predict(XX2_test)
lgb_MAE2d = mean_absolute_error(yy2_test, yy_test_hat)
print(lgb_MAE2d)

2.0561466564533526


with open('lgb2_default.pkl', 'wb') as fid:
    pickle.dump(lgb_model2d, fid)

---

Add a default lgb model for all (8 sets) training data

In [24]:
XX12_train = pd.concat([XX1_train, XX2_train], ignore_index=True)
yy12_train = pd.concat([yy1_train, yy2_train], ignore_index=True)
XX12_test = pd.concat([XX1_test, XX2_test], ignore_index=True)
yy12_test = pd.concat([yy1_test, yy2_test], ignore_index=True)
print(XX12_train.shape)
print(yy12_train.shape)
print(XX12_test.shape)
print(yy12_test.shape)


(6400, 717)
(6400,)
(1600, 717)
(1600,)


In [25]:
lgb_model0d = lgb.LGBMRegressor(boosting_type='gbdt', object='regression', random_state=42)
lgb_model0d.fit(XX12_train, yy12_train, eval_metric='mae')
yy_test_hat = lgb_model0d.predict(XX12_test)
lgb_MAE0d = mean_absolute_error(yy12_test, yy_test_hat)
print(lgb_MAE0d)

1.957283148400325


with open('lgb0_default.pkl', 'wb') as fid:
    pickle.dump(lgb_model0d, fid)

---
## Final start from here

In [26]:
svr_cv_tuner = BayesSearchCV(
    estimator = SVR(),
    search_spaces = {
        'C': (1e-6, 1e+6, 'log-uniform'),  
        'gamma': (1e-6, 1e+1, 'log-uniform'),
        'epsilon': (1e-6, 1e+1, 'log-uniform'),
        'degree': (1, 8),  # integer valued parameter
        'kernel': ['rbf'],  # categorical parameter
    },    
    scoring = 'neg_mean_absolute_error',
    cv = KFold(shuffle=True, random_state=42),
    n_iter = 30,   
    verbose = 0,
    refit = True,
    random_state = 42
)

In [27]:
def svr_status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(svr_cv_tuner.cv_results_)    
    
    # Get current parameters and the best parameters    
    best_params = pd.Series(svr_cv_tuner.best_params_)
    print('Model #{}\nBest MAE: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(svr_cv_tuner.best_score_, 4),
        svr_cv_tuner.best_params_
    ))

with open('lgb0_default.pkl','rb') as fid:
    lgb_model0d = pickle.load(fid)
    
with open('lgb1_default.pkl','rb') as fid:
    lgb_model1d = pickle.load(fid)
    
with open('lgb1_bo.pkl','rb') as fid:
    lgb_model1 = pickle.load(fid)
    
with open('lgb2_default.pkl','rb') as fid:
    lgb_model2d = pickle.load(fid)
    
with open('lgb2_bo.pkl','rb') as fid:
    lgb_model2 = pickle.load(fid)

In [28]:
# using the XX12_test and yy12_test to train a SVR, using lgb1_default, lgb1_bo, lgb2_default, lgb2_bo, lgb0_default
temp1 = pd.Series(lgb_model0d.predict(XX12_test))
temp2 = pd.Series(lgb_model1d.predict(XX12_test))
temp3 = pd.Series(lgb_model1.predict(XX12_test))
temp4 = pd.Series(lgb_model2d.predict(XX12_test))
temp5 = pd.Series(lgb_model2.predict(XX12_test))
tempDF = pd.concat([temp1,temp2,temp3,temp4,temp5], axis=1, ignore_index=True)
print(tempDF.shape)
print(tempDF.head())

(1600, 5)
          0         1         2         3         4
0  5.500301  6.215069  5.777023  5.911646  5.147127
1  2.040123  1.510145  1.821516  2.638002  2.044061
2  3.820480  4.031160  3.882667  3.476231  3.267888
3  8.798126  9.198162  9.464924  9.843836  8.484780
4  2.935534  3.345457  3.844396  3.405868  4.040596


In [29]:
result = svr_cv_tuner.fit(tempDF, yy12_test, callback=svr_status_print)

Model #1
Best MAE: -2.8309
Best params: {'C': 0.08341564384216595, 'degree': 6, 'epsilon': 3.389034515643755, 'gamma': 0.00016240416181810798, 'kernel': 'rbf'}

Model #2
Best MAE: -2.5453
Best params: {'C': 11185.625288472094, 'degree': 7, 'epsilon': 0.00013300585802877296, 'gamma': 4.555828280308062, 'kernel': 'rbf'}

Model #3
Best MAE: -1.9912
Best params: {'C': 0.21776603694820984, 'degree': 7, 'epsilon': 5.420184998488358e-06, 'gamma': 0.0010797659128262546, 'kernel': 'rbf'}

Model #4
Best MAE: -1.9912
Best params: {'C': 0.21776603694820984, 'degree': 7, 'epsilon': 5.420184998488358e-06, 'gamma': 0.0010797659128262546, 'kernel': 'rbf'}

Model #5
Best MAE: -1.9912
Best params: {'C': 0.21776603694820984, 'degree': 7, 'epsilon': 5.420184998488358e-06, 'gamma': 0.0010797659128262546, 'kernel': 'rbf'}

Model #6
Best MAE: -1.924
Best params: {'C': 643.1854566491933, 'degree': 8, 'epsilon': 1.3971732124886836e-05, 'gamma': 2.084380695810582e-05, 'kernel': 'rbf'}

Model #7
Best MAE: -1.924

In [30]:
print(result.best_score_)
print(result.best_params_)

-1.9238495869643901
{'C': 1283.0569028277723, 'degree': 8, 'epsilon': 3.6982728415017055e-06, 'gamma': 1.0933824717608816e-05, 'kernel': 'rbf'}


In [31]:
# final model
Fmodel = result.best_estimator_

---
## Submission

In [32]:
X_test = pd.read_pickle('Xtest_DF788.pkl')
X_test= X_test.drop(feature2drop,axis=1)
print(X_test.shape)

(2624, 717)


In [33]:
temp1 = pd.Series(lgb_model0d.predict(X_test))
temp2 = pd.Series(lgb_model1d.predict(X_test))
temp3 = pd.Series(lgb_model1.predict(X_test))
temp4 = pd.Series(lgb_model2d.predict(X_test))
temp5 = pd.Series(lgb_model2.predict(X_test))
tempDF = pd.concat([temp1,temp2,temp3,temp4,temp5], axis=1, ignore_index=True)
print(tempDF.shape)
print(tempDF.head())

(2624, 5)
          0         1         2         3         4
0  3.354115  3.054809  3.496142  4.309653  3.740725
1  5.865328  5.649376  5.068571  5.231328  5.146969
2  5.588823  4.924261  5.939926  5.232051  5.342942
3  8.521590  8.753887  8.662658  7.879896  9.107186
4  6.061305  7.373901  6.450980  6.499582  6.116163


In [34]:
y_hat = Fmodel.predict(tempDF)
print(y_hat.shape)
print(y_hat[:10])

(2624,)
[2.87921739 5.2290565  5.39292273 8.66217215 5.86240409 1.03773466
 6.60268539 3.74514315 2.72469695 2.39473128]


In [35]:
submission = pd.read_csv('sample_submission.csv', index_col='seg_id')
submission['time_to_failure'] = y_hat.clip(0, 16)
submission.to_csv('submission_lgb_svr_02.csv')