In [1]:
# Notebook tooling. lab_black is loaded as an IPython extension (presumably for
# Black formatting of cells -- confirm against the installed package).
%load_ext lab_black
# The autoreload extension must be loaded before the %autoreload magic is used.
%load_ext autoreload
# Mode 2: reload all imported modules before executing each cell, so edits to
# the project package are picked up without restarting the kernel.
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.random import set_seed

from kaggle_prediction_interval_birthweight.workflow.validation import Validator

# Seed NumPy's legacy global generator and TensorFlow's global generator with
# the same value, NumPy first.
SEED = 1
np.random.seed(SEED)
set_seed(SEED)

In [3]:
def plot_result(obs, lower, upper, lower_new, upper_new):
    """Show a three-panel diagnostic figure for predicted intervals.

    Panel 0 draws one vertical segment per observation, located at
    x = observation and spanning its lower to upper bound, with an orange
    corner-to-corner diagonal on top. Panels 1 and 2 overlay density
    histograms of the lower and upper bounds respectively, labelled "train"
    (``lower`` / ``upper``) and "test" (``lower_new`` / ``upper_new``).

    Args:
        obs: Observed values; used as x positions in panel 0.
        lower: Lower bounds paired element-wise with ``obs``.
        upper: Upper bounds paired element-wise with ``obs``.
        lower_new: Lower bounds for the second ("test") data set.
        upper_new: Upper bounds for the second ("test") data set.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, axes = plt.subplots(1, 3, figsize=(10, 3))
    interval_ax, lower_ax, upper_ax = axes

    interval_ax.vlines(x=obs, ymin=lower, ymax=upper, alpha=0.25)

    # Both axes share one range, so the diagonal drawn in axes coordinates
    # below coincides with the line y = x.
    axis_min = np.min(np.concatenate([obs, lower]))
    axis_max = np.max(np.concatenate([obs, upper]))
    interval_ax.set_xlim(axis_min, axis_max)
    interval_ax.set_ylim(axis_min, axis_max)
    interval_ax.plot(
        [0, 1], [0, 1], transform=interval_ax.transAxes, color="orange"
    )
    interval_ax.set_xlabel("observations")
    interval_ax.set_ylabel("predictions")

    # The two histogram panels differ only in the data and the x label.
    hist_style = dict(bins=100, density=True, alpha=0.75)
    panels = (
        (lower_ax, lower, lower_new, "predicted lower bounds"),
        (upper_ax, upper, upper_new, "predicted upper bounds"),
    )
    for panel, train_bounds, test_bounds, x_label in panels:
        panel.hist(train_bounds, color="blue", label="train", **hist_style)
        panel.hist(test_bounds, color="orange", label="test", **hist_style)
        panel.set_xlabel(x_label)
        panel.legend()

    plt.show()

In [4]:
# Read the training table and the test table from the local data folder.
data = pd.read_csv(
    filepath_or_buffer="~/dev/data/kaggle-prediction-interval-birthweight/train.csv"
)
data_test = pd.read_csv(
    filepath_or_buffer="~/dev/data/kaggle-prediction-interval-birthweight/test.csv"
)

In [None]:
# Fit the validator for the "NeuralNetEnsembler" model and print its summary.
nnens_validator = Validator("NeuralNetEnsembler")
nnens_validator.fit(data)
nnens_validator.print_performance_summary()

# Interval bounds for the training table, then for the test table.
lower, upper = nnens_validator.predict_intervals(data)
lower_new, upper_new = nnens_validator.predict_intervals(data_test)

plot_result(
    obs=data["DBWT"],
    lower=lower,
    upper=upper,
    lower_new=lower_new,
    upper_new=upper_new,
)

# Write the test-set bounds next to the row ids as a CSV without the index.
nnens_submission = data_test[["id"]].assign(pi_lower=lower_new, pi_upper=upper_new)
nnens_submission.to_csv(
    "~/dev/data/kaggle-prediction-interval-birthweight/submission_nnens.csv",
    index=False,
)

Validation on fold 1 of 2 begins.
Ensembler fold 1 of 3 begins.
Training the ridge regression model.
Training the histogram boosting model.
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Training the neural network regressor.


In [None]:
# Fit the validator for the "HistBoostEnsembler" model and print its summary.
hbens_validator = Validator("HistBoostEnsembler")
hbens_validator.fit(data)
hbens_validator.print_performance_summary()

# Interval bounds for the training table, then for the test table.
lower, upper = hbens_validator.predict_intervals(data)
lower_new, upper_new = hbens_validator.predict_intervals(data_test)

plot_result(
    obs=data["DBWT"],
    lower=lower,
    upper=upper,
    lower_new=lower_new,
    upper_new=upper_new,
)

# Write the test-set bounds next to the row ids as a CSV without the index.
hbens_submission = data_test[["id"]].assign(pi_lower=lower_new, pi_upper=upper_new)
hbens_submission.to_csv(
    "~/dev/data/kaggle-prediction-interval-birthweight/submission_hbens.csv",
    index=False,
)

In [None]:
# Fit the validator for the "MissingnessNeuralNetClassifier" model and print
# its summary.
nnc_validator = Validator("MissingnessNeuralNetClassifier")
nnc_validator.fit(data)
nnc_validator.print_performance_summary()

# Interval bounds for the training table, then for the test table.
lower, upper = nnc_validator.predict_intervals(data)
lower_new, upper_new = nnc_validator.predict_intervals(data_test)

plot_result(
    obs=data["DBWT"],
    lower=lower,
    upper=upper,
    lower_new=lower_new,
    upper_new=upper_new,
)

# Write the test-set bounds next to the row ids as a CSV without the index.
nnc_submission = data_test[["id"]].assign(pi_lower=lower_new, pi_upper=upper_new)
nnc_submission.to_csv(
    "~/dev/data/kaggle-prediction-interval-birthweight/submission_nnc.csv",
    index=False,
)

In [None]:
# Fit the validator for the "RidgeRegressor" model and print its summary.
rr_validator = Validator("RidgeRegressor")
rr_validator.fit(data)
rr_validator.print_performance_summary()

# Interval bounds for the training table, then for the test table.
lower, upper = rr_validator.predict_intervals(data)
lower_new, upper_new = rr_validator.predict_intervals(data_test)

plot_result(
    obs=data["DBWT"],
    lower=lower,
    upper=upper,
    lower_new=lower_new,
    upper_new=upper_new,
)

# Write the test-set bounds next to the row ids as a CSV without the index.
rr_submission = data_test[["id"]].assign(pi_lower=lower_new, pi_upper=upper_new)
rr_submission.to_csv(
    "~/dev/data/kaggle-prediction-interval-birthweight/submission_rr.csv",
    index=False,
)

In [None]:
# Fit the validator for the "MissingnessNeuralNetRegressor" model (constructed
# with bayesian=False) and print its summary.
nn_validator = Validator("MissingnessNeuralNetRegressor", bayesian=False)
nn_validator.fit(data)
nn_validator.print_performance_summary()

# Interval bounds for the training table, then for the test table.
lower, upper = nn_validator.predict_intervals(data)
lower_new, upper_new = nn_validator.predict_intervals(data_test)

plot_result(
    obs=data["DBWT"],
    lower=lower,
    upper=upper,
    lower_new=lower_new,
    upper_new=upper_new,
)

# Write the test-set bounds next to the row ids as a CSV without the index.
nn_submission = data_test[["id"]].assign(pi_lower=lower_new, pi_upper=upper_new)
nn_submission.to_csv(
    "~/dev/data/kaggle-prediction-interval-birthweight/submission_nn.csv",
    index=False,
)

In [None]:
# Fit the validator for the "HistBoostRegressor" model and print its summary.
hbr_validator = Validator("HistBoostRegressor")
hbr_validator.fit(data)
hbr_validator.print_performance_summary()

# Interval bounds for the training table, then for the test table.
lower, upper = hbr_validator.predict_intervals(data)
lower_new, upper_new = hbr_validator.predict_intervals(data_test)

plot_result(
    obs=data["DBWT"],
    lower=lower,
    upper=upper,
    lower_new=lower_new,
    upper_new=upper_new,
)

# Write the test-set bounds next to the row ids as a CSV without the index.
hbr_submission = data_test[["id"]].assign(pi_lower=lower_new, pi_upper=upper_new)
hbr_submission.to_csv(
    "~/dev/data/kaggle-prediction-interval-birthweight/submission_hbr.csv",
    index=False,
)