In [1]:
# Notebook extensions: lab_black (presumably auto-formats cells with black —
# TODO confirm) and autoreload.
%load_ext lab_black
%load_ext autoreload
# autoreload mode 2, so edits to the imported project modules are picked up live.
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.random import set_seed

from kaggle_prediction_interval_birthweight.data.data_processing import DataProcessor
from kaggle_prediction_interval_birthweight.workflow.validation import Validator
from kaggle_prediction_interval_birthweight.workflow.histboost_tuning import (
    HistBoostTuner,
)

# One seed value shared by NumPy's global RNG and TensorFlow's `set_seed`.
SEED = 1
np.random.seed(SEED)
set_seed(SEED)

In [3]:
def plot_result(obs, lower, upper, lower_new, upper_new):
    """Draw a three-panel diagnostic figure for predicted intervals.

    Panel 1 draws one vertical segment per observation, placed at x = ``obs``
    and spanning ``lower`` to ``upper``, plus an orange diagonal across the
    axes. Both axes get the same limits (smallest value of ``obs``/``lower`` to
    largest value of ``obs``/``upper``), so the diagonal corresponds to y = x.
    Panels 2 and 3 overlay density histograms of the lower and upper bounds:
    ``lower``/``upper`` in blue labelled "train", ``lower_new``/``upper_new``
    in orange labelled "test".

    Args:
        obs: observed values; anything ``np.concatenate`` accepts.
        lower: lower interval bounds paired with ``obs``.
        upper: upper interval bounds paired with ``obs``.
        lower_new: lower bounds shown as the "test" histogram.
        upper_new: upper bounds shown as the "test" histogram.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    fig, axes = plt.subplots(1, 3, figsize=(10, 3))
    interval_ax, lower_ax, upper_ax = axes

    # Shared limits for both axes of the interval panel.
    axis_min = np.min(np.concatenate([obs, lower]))
    axis_max = np.max(np.concatenate([obs, upper]))

    interval_ax.vlines(x=obs, ymin=lower, ymax=upper, alpha=0.25)
    interval_ax.set_xlim(axis_min, axis_max)
    interval_ax.set_ylim(axis_min, axis_max)
    # Drawn in axes coordinates: corner to corner regardless of data limits.
    interval_ax.plot(
        [0, 1], [0, 1], transform=interval_ax.transAxes, color="orange"
    )
    interval_ax.set_xlabel("observations")
    interval_ax.set_ylabel("predictions")

    hist_style = dict(bins=100, density=True, alpha=0.75)
    panels = (
        (lower_ax, lower, lower_new, "predicted lower bounds"),
        (upper_ax, upper, upper_new, "predicted upper bounds"),
    )
    for panel, train_bounds, test_bounds, xlabel in panels:
        panel.hist(train_bounds, color="blue", label="train", **hist_style)
        panel.hist(test_bounds, color="orange", label="test", **hist_style)
        panel.set_xlabel(xlabel)
        panel.legend()

    plt.show()

In [4]:
# Directory holding the competition CSVs. Kept in one constant so the notebook
# can be pointed at another data location by editing a single line.
DATA_DIR = "~/dev/data/kaggle-prediction-interval-birthweight"

# Training data, and the test data used for out-of-sample interval predictions.
data = pd.read_csv(f"{DATA_DIR}/train.csv")
data_test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [5]:
# Build the processor configured for "HistBoostRegressor" and apply it to the
# training frame. It returns two objects, passed below to `tuner.tune(X, y)`
# (presumably the feature matrix and the target — TODO confirm).
data_processor = DataProcessor("HistBoostRegressor")
X, y = data_processor(data)

In [6]:
# Run the HistBoostTuner search on the processed training data. The categorical
# mask comes from the processor created in the previous cell; verbose=True
# produces the per-iteration log recorded in this cell's output.
# NOTE(review): in the recorded run most evaluations warn that the upper
# predictions fall below the lower predictions at some points (crossing
# interval bounds), and a single evaluation takes from ~2 s up to ~40 s.
tuner = HistBoostTuner(
    verbose=True, categorical_feature_mask=data_processor.categorical_features
)
tuner.tune(X, y)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.0002
Function value obtained: 1682.9838
Current minimum: 1664.0651
Iteration No: 2 started. Evaluating function at random point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 2 ended. Evaluation done at random point.
Time taken: 2.5587
Function value obtained: 1666.6727
Current minimum: 1664.0651
Iteration No: 3 started. Evaluating function at random point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 3 ended. Evaluation done at random point.
Time taken: 21.1828
Function value obtained: 1676.4680
Current minimum: 1664.0651
Iteration No: 4 started. Evaluating function at random point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 4 ended. Evaluation done at random point.
Time taken: 18.6495
Function value obtained: 1693.2273
Current minimum: 1664.0651
Iteration No: 5 started. Evaluating function at random point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 5 ended. Evaluation done at random point.
Time taken: 2.3425
Function value obtained: 1755.1341
Current minimum: 1664.0651
Iteration No: 6 started. Evaluating function at random point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 6 ended. Evaluation done at random point.
Time taken: 4.1314
Function value obtained: 1674.2554
Current minimum: 1664.0651
Iteration No: 7 started. Evaluating function at random point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 7 ended. Evaluation done at random point.
Time taken: 2.4912
Function value obtained: 1723.9683
Current minimum: 1664.0651
Iteration No: 8 started. Evaluating function at random point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 8 ended. Evaluation done at random point.
Time taken: 5.1719
Function value obtained: 1714.7201
Current minimum: 1664.0651
Iteration No: 9 started. Evaluating function at random point.




Iteration No: 9 ended. Evaluation done at random point.
Time taken: 15.7966
Function value obtained: 1771.6823
Current minimum: 1664.0651
Iteration No: 10 started. Evaluating function at random point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 10 ended. Evaluation done at random point.
Time taken: 4.6725
Function value obtained: 1665.6424
Current minimum: 1664.0651
Iteration No: 11 started. Evaluating function at random point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 11 ended. Evaluation done at random point.
Time taken: 9.6887
Function value obtained: 1682.9838
Current minimum: 1664.0651
Iteration No: 12 started. Evaluating function at random point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 12 ended. Evaluation done at random point.
Time taken: 15.7692
Function value obtained: 1700.7893
Current minimum: 1664.0651
Iteration No: 13 started. Evaluating function at random point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 13 ended. Evaluation done at random point.
Time taken: 37.5914
Function value obtained: 1716.8765
Current minimum: 1664.0651
Iteration No: 14 started. Evaluating function at random point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 14 ended. Evaluation done at random point.
Time taken: 2.7001
Function value obtained: 1717.2354
Current minimum: 1664.0651
Iteration No: 15 started. Evaluating function at random point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 15 ended. Evaluation done at random point.
Time taken: 6.5257
Function value obtained: 1718.4355
Current minimum: 1664.0651
Iteration No: 16 started. Evaluating function at random point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 16 ended. Evaluation done at random point.
Time taken: 4.3920
Function value obtained: 1639.6033
Current minimum: 1639.6033
Iteration No: 17 started. Evaluating function at random point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 17 ended. Evaluation done at random point.
Time taken: 4.1165
Function value obtained: 1664.0432
Current minimum: 1639.6033
Iteration No: 18 started. Evaluating function at random point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 18 ended. Evaluation done at random point.
Time taken: 2.7713
Function value obtained: 1650.3600
Current minimum: 1639.6033
Iteration No: 19 started. Evaluating function at random point.




Iteration No: 19 ended. Evaluation done at random point.
Time taken: 12.5509
Function value obtained: 1846.6014
Current minimum: 1639.6033
Iteration No: 20 started. Evaluating function at random point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 20 ended. Evaluation done at random point.
Time taken: 2.5894
Function value obtained: 1677.7789
Current minimum: 1639.6033
Iteration No: 21 started. Searching for the next optimal point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 6.4073
Function value obtained: 1763.9070
Current minimum: 1639.6033
Iteration No: 22 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 12.0869
Function value obtained: 1704.4623
Current minimum: 1639.6033
Iteration No: 23 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 2.6620
Function value obtained: 1769.5936
Current minimum: 1639.6033
Iteration No: 24 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 2.9237
Function value obtained: 1650.1851
Current minimum: 1639.6033
Iteration No: 25 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 2.4835
Function value obtained: 1682.0170
Current minimum: 1639.6033
Iteration No: 26 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 3.3551
Function value obtained: 1668.6096
Current minimum: 1639.6033
Iteration No: 27 started. Searching for the next optimal point.




Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 10.1447
Function value obtained: 1884.5480
Current minimum: 1639.6033
Iteration No: 28 started. Searching for the next optimal point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 7.0830
Function value obtained: 1677.4401
Current minimum: 1639.6033
Iteration No: 29 started. Searching for the next optimal point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 2.6197
Function value obtained: 1775.6825
Current minimum: 1639.6033
Iteration No: 30 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 2.7345
Function value obtained: 1777.2025
Current minimum: 1639.6033
Iteration No: 31 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 4.7000
Function value obtained: 1660.2006
Current minimum: 1639.6033
Iteration No: 32 started. Searching for the next optimal point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 3.0775
Function value obtained: 1683.6722
Current minimum: 1639.6033
Iteration No: 33 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 2.6404
Function value obtained: 1785.5072
Current minimum: 1639.6033
Iteration No: 34 started. Searching for the next optimal point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 3.9243
Function value obtained: 1723.2560
Current minimum: 1639.6033
Iteration No: 35 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 4.5654
Function value obtained: 1652.0507
Current minimum: 1639.6033
Iteration No: 36 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 5.3295
Function value obtained: 1654.8681
Current minimum: 1639.6033
Iteration No: 37 started. Searching for the next optimal point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 7.1283
Function value obtained: 1706.2740
Current minimum: 1639.6033
Iteration No: 38 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 4.3857
Function value obtained: 1647.3303
Current minimum: 1639.6033
Iteration No: 39 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 2.6771
Function value obtained: 1781.3615
Current minimum: 1639.6033
Iteration No: 40 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 4.6218
Function value obtained: 1655.6811
Current minimum: 1639.6033
Iteration No: 41 started. Searching for the next optimal point.




Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 41.2127
Function value obtained: 1798.3140
Current minimum: 1639.6033
Iteration No: 42 started. Searching for the next optimal point.




Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 14.9615
Function value obtained: 1845.5839
Current minimum: 1639.6033
Iteration No: 43 started. Searching for the next optimal point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 5.7417
Function value obtained: 1658.9778
Current minimum: 1639.6033
Iteration No: 44 started. Searching for the next optimal point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 3.4676
Function value obtained: 1665.7114
Current minimum: 1639.6033
Iteration No: 45 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 2.5170
Function value obtained: 1777.1792
Current minimum: 1639.6033
Iteration No: 46 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 7.9060
Function value obtained: 1677.1758
Current minimum: 1639.6033
Iteration No: 47 started. Searching for the next optimal point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 9.3285
Function value obtained: 1680.8898
Current minimum: 1639.6033
Iteration No: 48 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 3.6914
Function value obtained: 1723.8624
Current minimum: 1639.6033
Iteration No: 49 started. Searching for the next optimal point.


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 5.8378
Function value obtained: 1711.5217
Current minimum: 1639.6033
Iteration No: 50 started. Searching for the next optimal point.


The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 9.3312
Function value obtained: 1676.7453
Current minimum: 1639.6033


The upper quantile predictions are lower
than the lower quantile predictions
at some points.
The upper predictions are lower thanthe lower predictions at some points.


Iteration No: 51 ended. Search finished for the next optimal point.
Time taken: 13.7498
Function value obtained: 1670.3116
Current minimum: 1639.6033


In [7]:
# Objective values stored on the search result, one per evaluated point
# (`res_gp` is presumably a scikit-optimize result object — TODO confirm).
tuner.res_gp.func_vals

array([1681.11592994, 1677.99559318, 1699.64182955, 1665.34872695,
       1674.25537926, 1723.968289  , 1664.06507615, 1771.68227481,
       1715.00671317, 1682.98376227, 1666.67270747, 1676.46796599,
       1693.22731249, 1755.13412205, 1674.25537926, 1723.968289  ,
       1714.72009921, 1771.68227481, 1665.64239566, 1682.98376227,
       1700.78931628, 1716.87646397, 1717.23543074, 1718.43550673,
       1639.60334937, 1664.04324278, 1650.35996966, 1846.60140044,
       1677.77890218, 1763.90696602, 1704.46234277, 1769.59357574,
       1650.18508719, 1682.0170287 , 1668.60955183, 1884.54799407,
       1677.44007698, 1775.68250949, 1777.20246101, 1660.20057607,
       1683.67216205, 1785.50719861, 1723.25600721, 1652.05074065,
       1654.86813483, 1706.27399281, 1647.33030166, 1781.36151133,
       1655.68113493, 1798.31402117, 1845.58394512, 1658.97776319,
       1665.71143858, 1777.17920677, 1677.17584851, 1680.88982416,
       1723.86236214, 1711.52171457, 1676.74532402, 1670.31162

In [8]:
# Parameter vectors that were evaluated, in the order they were tried.
# NOTE(review): in the recorded output the first ten points reappear as the
# following points (visible up to the truncation) — possibly a warm start
# re-sampling the same random points; confirm this duplication is intended.
tuner.res_gp.x_iters

[[0.9807412289900471, 0.46003068044903095, 2, 50, 31],
 [0.015478974396671025, 0.0008700690210600545, 7, 47, 86],
 [0.008706037847015437, 0.004195085318955972, 5, 15, 58],
 [0.5519326346081447, 0.0019320752621429876, 5, 47, 80],
 [0.140576118573469, 0.10322562466964824, 2, 28, 88],
 [0.30721380131411513, 0.1406101906115138, 3, 8, 70],
 [0.06014458808993631, 0.022817627560812234, 5, 14, 36],
 [0.0026690727196422457, 0.08252249755325355, 5, 7, 66],
 [0.09591956282799739, 0.0003107967535980341, 5, 15, 17],
 [0.02556910810331702, 3.0259468970108244e-05, 9, 10, 57],
 [0.9807412289900471, 0.46003068044903095, 2, 50, 31],
 [0.015478974396671025, 0.0008700690210600545, 7, 47, 86],
 [0.008706037847015437, 0.004195085318955972, 5, 15, 58],
 [0.5519326346081447, 0.0019320752621429876, 5, 47, 80],
 [0.140576118573469, 0.10322562466964824, 2, 28, 88],
 [0.30721380131411513, 0.1406101906115138, 3, 8, 70],
 [0.06014458808993631, 0.022817627560812234, 5, 14, 36],
 [0.0026690727196422457, 0.08252249755

In [9]:
# Summary of the search: the recorded output holds the lowest objective value
# under 'NOIS' (presumably the interval score being minimised — TODO confirm)
# together with the matching 'opt_parameters'.
tuner.result

{'NOIS': 1639.6033493654973,
 'opt_parameters': {'learning_rate': 0.14772745761339162,
  'l2_regularization': 0.009648963330761146,
  'max_depth': 3,
  'max_leaf_nodes': 26,
  'min_samples_leaf': 83}}

In [10]:
# NOTE(review): this whole cell is disabled. It is the only place in the
# notebook that uses `Validator`, `plot_result` and `data_test`, so those are
# currently unused. The `eim_` prefix does not match the "HistBoostRegressor"
# model name and looks carried over from another model — confirm the variable
# name before re-enabling, or delete the cell.
# eim_validator = Validator("HistBoostRegressor")
# eim_validator.fit(data)
# eim_validator.print_performance_summary()

# lower, upper = eim_validator.predict_intervals(data)
# lower_new, upper_new = eim_validator.predict_intervals(data_test)

# plot_result(data["DBWT"], lower, upper, lower_new, upper_new)