# The 3 Common Synthetic Baseline Datasets

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# # Try to limit to one thread:
# os.environ["OMP_NUM_THREADS"] = "1"
# os.environ["NUMEXPR_NUM_THREADS"] = "1"
# os.environ["MKL_NUM_THREADS"] = "1"
# os.environ["BLOSC_NTHREADS"] = "1"

from synthetic_dataset_1 import make_ds1
from synthetic_dataset_2 import make_ds2
from synthetic_dataset_3 import make_ds3
from icp_and_mil_for_deepcqr_synth import \
    create_fit_predict_evaluate, \
    create_single_unit_model, \
    train_validation_test_random_splitter, \
    create_initializer_near_0_or_1, \
    get_range_of_fit_seeds, \
    evaluation_measures, \
    _mil_icp_cross

import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
import tensorflow_probability as tfp

import pickle
import os
from glob import glob
from functools import partial

# Deep Tobit Type I

In [2]:
def tobit_type1_nll_tensorflow(y_true, y_pred, noise_sigma, lower_threshold):
    """Negative log-likelihood of a Tobit Type I model censored from below.

    With ``gamma = 1 / noise_sigma`` the value returned is

        - sum_{non-censored} [ log(gamma) + log phi(gamma * y_true - y_pred) ]
        - sum_{censored}     [ log Phi(gamma * lower_threshold - y_pred) ]

    where ``phi`` / ``Phi`` are the standard normal pdf / cdf. An observation
    counts as censored when ``y_true <= lower_threshold``.

    Args:
        y_true: observed targets (tensor).
        y_pred: model outputs, same shape as ``y_true``. They enter the
            likelihood without being multiplied by ``gamma``.
            NOTE(review): that corresponds to predictions already expressed in
            units of ``noise_sigma``; every call in this notebook uses
            ``noise_sigma=1`` where the distinction vanishes -- confirm before
            using another sigma.
        noise_sigma: fixed standard deviation of the latent Gaussian noise
            (plain Python / NumPy number, since ``np.log`` is applied to it).
        lower_threshold: censoring threshold.

    Returns:
        Scalar tensor: the summed (not averaged) negative log-likelihood.
    """
    gamma = 1 / noise_sigma
    std_normal = tfp.distributions.Normal(loc=0, scale=1)

    # The mask is derived from `lower_threshold` (it used to be hard-coded as
    # `y_true <= 0`), so the split into censored / non-censored samples always
    # agrees with the threshold used in the censored term below. For
    # lower_threshold == 0 the result is unchanged.
    is_censored = y_true <= lower_threshold

    # Non-censored samples contribute the Gaussian log-density of the residual.
    observed_true = tf.boolean_mask(y_true, ~is_censored)
    observed_pred = tf.boolean_mask(y_pred, ~is_censored)
    nll_not_censored = -tf.math.reduce_sum(
        np.log(gamma) + std_normal.log_prob(gamma * observed_true - observed_pred))

    # Censored samples contribute the log-probability mass at or below the
    # threshold.
    censored_pred = tf.boolean_mask(y_pred, is_censored)
    nll_censored = -tf.math.reduce_sum(
        std_normal.log_cdf(gamma * lower_threshold - censored_pred))

    return nll_not_censored + nll_censored

In [3]:
# ---------------- 1st Synth DS ------------------
# Fit the Deep Tobit Type I baseline on synthetic dataset 1 and display the
# 'evaluation' entry of the returned results. The pieces are built in the same
# order in which they were evaluated as inline call arguments.
ds1_splitter = partial(
    train_validation_test_random_splitter,
    seed=0,
    percent_test=0.33,
    percent_val_from_train=0.2,
)
ds1_fit_seeds = get_range_of_fit_seeds()
# NOTE(review): the Adam instance is created once and bound into the partial,
# so every invocation of the model creator receives this same optimizer
# object -- confirm create_single_unit_model is fine with a shared optimizer.
ds1_model_creator = partial(
    create_single_unit_model,
    loss=lambda y_true, y_pred: tobit_type1_nll_tensorflow(
        y_true=y_true,
        y_pred=y_pred,
        noise_sigma=1,
        lower_threshold=0,
    ),
    activation='linear',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1, clipnorm=1),
    num_x_features=3,
)
ds1_dataset = make_ds1(verbose=True, num_samples=1000, seed=42)
ds1_results = create_fit_predict_evaluate(
    noise_sigma=1,
    splitter=ds1_splitter,
    initalization_creator=create_initializer_near_0_or_1,
    fit_seeds=ds1_fit_seeds,
    model_creator=ds1_model_creator,
    verbose=False,
    dataset=ds1_dataset,
)
ds1_results['evaluation']

fit seeds:   0%|          | 0/20 [00:00<?, ?it/s]

Censored observations: 274 of 1000 (0.27%)
Fraction censored of test: 0.2878787878787879


fit seeds:   5%|▌         | 1/20 [00:01<00:19,  1.03s/it]

Restoring model weights from the end of the best epoch.
Epoch 00020: early stopping
Current loss = 159.34573364257812, best loss before current = None


fit seeds:  10%|█         | 2/20 [00:01<00:17,  1.06it/s]

Restoring model weights from the end of the best epoch.
Epoch 00015: early stopping
Current loss = 159.24673461914062, best loss before current = 159.34573364257812


fit seeds:  15%|█▌        | 3/20 [00:02<00:15,  1.11it/s]

Restoring model weights from the end of the best epoch.
Epoch 00019: early stopping
Current loss = 159.57029724121094, best loss before current = 159.24673461914062


fit seeds:  20%|██        | 4/20 [00:03<00:13,  1.16it/s]

Restoring model weights from the end of the best epoch.
Epoch 00018: early stopping
Current loss = 159.31019592285156, best loss before current = 159.24673461914062


fit seeds:  25%|██▌       | 5/20 [00:04<00:13,  1.15it/s]

Restoring model weights from the end of the best epoch.
Epoch 00017: early stopping
Current loss = 159.8618927001953, best loss before current = 159.24673461914062


fit seeds:  30%|███       | 6/20 [00:05<00:11,  1.17it/s]

Restoring model weights from the end of the best epoch.
Epoch 00024: early stopping
Current loss = 159.24578857421875, best loss before current = 159.24673461914062


fit seeds:  35%|███▌      | 7/20 [00:06<00:11,  1.10it/s]

Restoring model weights from the end of the best epoch.
Epoch 00048: early stopping
Current loss = 159.36019897460938, best loss before current = 159.24578857421875


fit seeds:  40%|████      | 8/20 [00:06<00:10,  1.15it/s]

Restoring model weights from the end of the best epoch.
Epoch 00016: early stopping
Current loss = 159.27455139160156, best loss before current = 159.24578857421875


fit seeds:  45%|████▌     | 9/20 [00:07<00:09,  1.14it/s]

Restoring model weights from the end of the best epoch.
Epoch 00015: early stopping
Current loss = 159.6719512939453, best loss before current = 159.24578857421875


fit seeds:  50%|█████     | 10/20 [00:08<00:08,  1.19it/s]

Restoring model weights from the end of the best epoch.
Epoch 00016: early stopping
Current loss = 159.24227905273438, best loss before current = 159.24578857421875


fit seeds:  55%|█████▌    | 11/20 [00:09<00:07,  1.23it/s]

Restoring model weights from the end of the best epoch.
Epoch 00015: early stopping
Current loss = 160.5338592529297, best loss before current = 159.24227905273438


fit seeds:  60%|██████    | 12/20 [00:10<00:06,  1.21it/s]

Restoring model weights from the end of the best epoch.
Epoch 00029: early stopping
Current loss = 159.33444213867188, best loss before current = 159.24227905273438


fit seeds:  65%|██████▌   | 13/20 [00:11<00:06,  1.14it/s]

Restoring model weights from the end of the best epoch.
Epoch 00041: early stopping
Current loss = 159.29754638671875, best loss before current = 159.24227905273438


fit seeds:  70%|███████   | 14/20 [00:12<00:05,  1.13it/s]

Restoring model weights from the end of the best epoch.
Epoch 00013: early stopping
Current loss = 159.221923828125, best loss before current = 159.24227905273438


fit seeds:  75%|███████▌  | 15/20 [00:12<00:04,  1.19it/s]

Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping
Current loss = 160.48602294921875, best loss before current = 159.221923828125


fit seeds:  80%|████████  | 16/20 [00:13<00:03,  1.21it/s]

Restoring model weights from the end of the best epoch.
Epoch 00019: early stopping
Current loss = 159.33859252929688, best loss before current = 159.221923828125


fit seeds:  85%|████████▌ | 17/20 [00:14<00:02,  1.25it/s]

Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping
Current loss = 160.22116088867188, best loss before current = 159.221923828125


fit seeds:  90%|█████████ | 18/20 [00:15<00:01,  1.24it/s]

Restoring model weights from the end of the best epoch.
Epoch 00023: early stopping
Current loss = 159.3190155029297, best loss before current = 159.221923828125


fit seeds:  95%|█████████▌| 19/20 [00:16<00:00,  1.19it/s]

Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping
Current loss = 159.5835723876953, best loss before current = 159.221923828125


fit seeds: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]

Restoring model weights from the end of the best epoch.
Epoch 00026: early stopping
Current loss = 159.23318481445312, best loss before current = 159.221923828125
Best model weights: [array([[1.0300485 ],
       [1.0182496 ],
       [0.94238794]], dtype=float32)]
y_true > 0 = 235 of 330 (0.71)





{'all': {'mil': 3.2897072539029453, 'cross': 0.0, 'icp': 0.9090909090909091},
 'only_non_censored': {'mil': 3.2897072539029457,
  'cross': 0.0,
  'icp': 0.9148936170212766}}

In [4]:
# ---------------- 2nd Synth DS ------------------
# Fit the Deep Tobit Type I baseline on synthetic dataset 2 and display the
# 'evaluation' entry of the returned results. The pieces are built in the same
# order in which they were evaluated as inline call arguments.
ds2_splitter = partial(
    train_validation_test_random_splitter,
    seed=0,
    percent_test=0.33,
    percent_val_from_train=0.2,
)
ds2_fit_seeds = get_range_of_fit_seeds()
# NOTE(review): the Adam instance is created once and bound into the partial,
# so every invocation of the model creator receives this same optimizer
# object -- confirm create_single_unit_model is fine with a shared optimizer.
ds2_model_creator = partial(
    create_single_unit_model,
    loss=lambda y_true, y_pred: tobit_type1_nll_tensorflow(
        y_true=y_true,
        y_pred=y_pred,
        noise_sigma=1,
        lower_threshold=0,
    ),
    activation='linear',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1, clipnorm=1),
    num_x_features=3,
)
ds2_dataset = make_ds2(verbose=True, num_samples=1000, seed=42)
ds2_results = create_fit_predict_evaluate(
    noise_sigma=1,
    splitter=ds2_splitter,
    initalization_creator=create_initializer_near_0_or_1,
    fit_seeds=ds2_fit_seeds,
    model_creator=ds2_model_creator,
    verbose=False,
    dataset=ds2_dataset,
)
ds2_results['evaluation']

fit seeds:   0%|          | 0/20 [00:00<?, ?it/s]

Censored observations: 309 of 1000 (0.31%)
Fraction censored of test: 0.3181818181818182


fit seeds:   5%|▌         | 1/20 [00:00<00:15,  1.19it/s]

Restoring model weights from the end of the best epoch.
Epoch 00030: early stopping
Current loss = 211.37237548828125, best loss before current = None


fit seeds:  10%|█         | 2/20 [00:01<00:14,  1.22it/s]

Restoring model weights from the end of the best epoch.
Epoch 00018: early stopping
Current loss = 212.05091857910156, best loss before current = 211.37237548828125


fit seeds:  15%|█▌        | 3/20 [00:02<00:14,  1.20it/s]

Restoring model weights from the end of the best epoch.
Epoch 00028: early stopping
Current loss = 211.54507446289062, best loss before current = 211.37237548828125


fit seeds:  20%|██        | 4/20 [00:03<00:14,  1.14it/s]

Restoring model weights from the end of the best epoch.
Epoch 00020: early stopping
Current loss = 211.83624267578125, best loss before current = 211.37237548828125


fit seeds:  25%|██▌       | 5/20 [00:04<00:12,  1.15it/s]

Restoring model weights from the end of the best epoch.
Epoch 00025: early stopping
Current loss = 211.77171325683594, best loss before current = 211.37237548828125


fit seeds:  30%|███       | 6/20 [00:05<00:11,  1.21it/s]

Restoring model weights from the end of the best epoch.
Epoch 00012: early stopping
Current loss = 211.74217224121094, best loss before current = 211.37237548828125


fit seeds:  35%|███▌      | 7/20 [00:05<00:10,  1.21it/s]

Restoring model weights from the end of the best epoch.
Epoch 00024: early stopping
Current loss = 211.64456176757812, best loss before current = 211.37237548828125


fit seeds:  40%|████      | 8/20 [00:06<00:10,  1.18it/s]

Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
Current loss = 211.61578369140625, best loss before current = 211.37237548828125


fit seeds:  45%|████▌     | 9/20 [00:07<00:09,  1.18it/s]

Restoring model weights from the end of the best epoch.
Epoch 00024: early stopping
Current loss = 211.33328247070312, best loss before current = 211.37237548828125


fit seeds:  50%|█████     | 10/20 [00:08<00:08,  1.19it/s]

Restoring model weights from the end of the best epoch.
Epoch 00022: early stopping
Current loss = 211.5064239501953, best loss before current = 211.33328247070312


fit seeds:  55%|█████▌    | 11/20 [00:09<00:07,  1.23it/s]

Restoring model weights from the end of the best epoch.
Epoch 00015: early stopping
Current loss = 212.33950805664062, best loss before current = 211.33328247070312


fit seeds:  60%|██████    | 12/20 [00:09<00:06,  1.25it/s]

Restoring model weights from the end of the best epoch.
Epoch 00019: early stopping
Current loss = 211.13604736328125, best loss before current = 211.33328247070312


fit seeds:  65%|██████▌   | 13/20 [00:10<00:05,  1.18it/s]

Restoring model weights from the end of the best epoch.
Epoch 00015: early stopping
Current loss = 212.86590576171875, best loss before current = 211.13604736328125


fit seeds:  70%|███████   | 14/20 [00:11<00:04,  1.23it/s]

Restoring model weights from the end of the best epoch.
Epoch 00013: early stopping
Current loss = 211.52993774414062, best loss before current = 211.13604736328125


fit seeds:  75%|███████▌  | 15/20 [00:12<00:04,  1.23it/s]

Restoring model weights from the end of the best epoch.
Epoch 00022: early stopping
Current loss = 212.33584594726562, best loss before current = 211.13604736328125


fit seeds:  80%|████████  | 16/20 [00:13<00:03,  1.26it/s]

Restoring model weights from the end of the best epoch.
Epoch 00017: early stopping
Current loss = 211.8233184814453, best loss before current = 211.13604736328125


fit seeds:  85%|████████▌ | 17/20 [00:14<00:02,  1.09it/s]

Restoring model weights from the end of the best epoch.
Epoch 00047: early stopping
Current loss = 211.85214233398438, best loss before current = 211.13604736328125


fit seeds:  90%|█████████ | 18/20 [00:15<00:01,  1.13it/s]

Restoring model weights from the end of the best epoch.
Epoch 00023: early stopping
Current loss = 211.1562957763672, best loss before current = 211.13604736328125


fit seeds:  95%|█████████▌| 19/20 [00:15<00:00,  1.19it/s]

Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping
Current loss = 213.30300903320312, best loss before current = 211.13604736328125


fit seeds: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]

Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
Current loss = 212.22894287109375, best loss before current = 211.13604736328125
Best model weights: [array([[0.96875584],
       [1.1288785 ],
       [1.1875395 ]], dtype=float32)]
y_true > 0 = 225 of 330 (0.68)





{'all': {'mil': 3.2897072539029453, 'cross': 0.0, 'icp': 0.8454545454545455},
 'only_non_censored': {'mil': 3.289707253902946,
  'cross': 0.0,
  'icp': 0.8311111111111111}}

In [6]:
# ---------------- 3rd Synth DS ------------------
# Fit the Deep Tobit Type I baseline on synthetic dataset 3 and display the
# 'evaluation' entry of the returned results. The pieces are built in the same
# order in which they were evaluated as inline call arguments.
ds3_splitter = partial(
    train_validation_test_random_splitter,
    seed=0,
    percent_test=0.33,
    percent_val_from_train=0.2,
)
ds3_fit_seeds = get_range_of_fit_seeds()
# NOTE(review): the Adam instance is created once and bound into the partial,
# so every invocation of the model creator receives this same optimizer
# object -- confirm create_single_unit_model is fine with a shared optimizer.
ds3_model_creator = partial(
    create_single_unit_model,
    loss=lambda y_true, y_pred: tobit_type1_nll_tensorflow(
        y_true=y_true,
        y_pred=y_pred,
        noise_sigma=1,
        lower_threshold=0,
    ),
    activation='linear',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1, clipnorm=1),
    num_x_features=3,
)
ds3_dataset = make_ds3(verbose=True, num_samples=1000, seed=42)
ds3_results = create_fit_predict_evaluate(
    noise_sigma=1,
    splitter=ds3_splitter,
    initalization_creator=create_initializer_near_0_or_1,
    fit_seeds=ds3_fit_seeds,
    model_creator=ds3_model_creator,
    verbose=False,
    dataset=ds3_dataset,
)
ds3_results['evaluation']

fit seeds:   0%|          | 0/20 [00:00<?, ?it/s]

Censored observations: 264 of 1000 (0.26%)
Fraction censored of test: 0.2818181818181818


fit seeds:   5%|▌         | 1/20 [00:00<00:14,  1.33it/s]

Restoring model weights from the end of the best epoch.
Epoch 00020: early stopping
Current loss = 148.53842163085938, best loss before current = None


fit seeds:  10%|█         | 2/20 [00:01<00:14,  1.24it/s]

Restoring model weights from the end of the best epoch.
Epoch 00019: early stopping
Current loss = 147.49368286132812, best loss before current = 148.53842163085938


fit seeds:  15%|█▌        | 3/20 [00:02<00:13,  1.25it/s]

Restoring model weights from the end of the best epoch.
Epoch 00019: early stopping
Current loss = 147.87454223632812, best loss before current = 147.49368286132812


fit seeds:  20%|██        | 4/20 [00:03<00:12,  1.29it/s]

Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
Current loss = 147.36758422851562, best loss before current = 147.49368286132812


fit seeds:  25%|██▌       | 5/20 [00:04<00:13,  1.15it/s]

Restoring model weights from the end of the best epoch.
Epoch 00053: early stopping
Current loss = 147.30503845214844, best loss before current = 147.36758422851562


fit seeds:  30%|███       | 6/20 [00:05<00:12,  1.16it/s]

Restoring model weights from the end of the best epoch.
Epoch 00025: early stopping
Current loss = 147.37417602539062, best loss before current = 147.30503845214844


fit seeds:  35%|███▌      | 7/20 [00:06<00:11,  1.10it/s]

Restoring model weights from the end of the best epoch.
Epoch 00024: early stopping
Current loss = 147.84710693359375, best loss before current = 147.30503845214844


fit seeds:  40%|████      | 8/20 [00:06<00:10,  1.15it/s]

Restoring model weights from the end of the best epoch.
Epoch 00019: early stopping
Current loss = 147.297119140625, best loss before current = 147.30503845214844


fit seeds:  45%|████▌     | 9/20 [00:07<00:09,  1.17it/s]

Restoring model weights from the end of the best epoch.
Epoch 00023: early stopping
Current loss = 147.93215942382812, best loss before current = 147.297119140625


fit seeds:  50%|█████     | 10/20 [00:08<00:08,  1.17it/s]

Restoring model weights from the end of the best epoch.
Epoch 00028: early stopping
Current loss = 147.2862548828125, best loss before current = 147.297119140625


fit seeds:  55%|█████▌    | 11/20 [00:09<00:07,  1.21it/s]

Restoring model weights from the end of the best epoch.
Epoch 00015: early stopping
Current loss = 148.7484588623047, best loss before current = 147.2862548828125


fit seeds:  60%|██████    | 12/20 [00:10<00:06,  1.25it/s]

Restoring model weights from the end of the best epoch.
Epoch 00013: early stopping
Current loss = 147.33694458007812, best loss before current = 147.2862548828125


fit seeds:  65%|██████▌   | 13/20 [00:11<00:06,  1.08it/s]

Restoring model weights from the end of the best epoch.
Epoch 00045: early stopping
Current loss = 147.3028564453125, best loss before current = 147.2862548828125


fit seeds:  70%|███████   | 14/20 [00:12<00:05,  1.07it/s]

Restoring model weights from the end of the best epoch.
Epoch 00038: early stopping
Current loss = 147.31031799316406, best loss before current = 147.2862548828125


fit seeds:  75%|███████▌  | 15/20 [00:12<00:04,  1.15it/s]

Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping
Current loss = 148.96336364746094, best loss before current = 147.2862548828125


fit seeds:  80%|████████  | 16/20 [00:13<00:03,  1.15it/s]

Restoring model weights from the end of the best epoch.
Epoch 00029: early stopping
Current loss = 147.27532958984375, best loss before current = 147.2862548828125


fit seeds:  85%|████████▌ | 17/20 [00:14<00:02,  1.11it/s]

Restoring model weights from the end of the best epoch.
Epoch 00041: early stopping
Current loss = 147.32492065429688, best loss before current = 147.27532958984375


fit seeds:  90%|█████████ | 18/20 [00:15<00:01,  1.05it/s]

Restoring model weights from the end of the best epoch.
Epoch 00029: early stopping
Current loss = 147.2845458984375, best loss before current = 147.27532958984375


fit seeds:  95%|█████████▌| 19/20 [00:16<00:00,  1.12it/s]

Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping
Current loss = 148.72471618652344, best loss before current = 147.27532958984375


fit seeds: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]

Restoring model weights from the end of the best epoch.
Epoch 00020: early stopping
Current loss = 147.6136016845703, best loss before current = 147.27532958984375
Best model weights: [array([[0.96592826],
       [1.1058626 ],
       [0.9715684 ]], dtype=float32)]
y_true > 0 = 237 of 330 (0.72)





{'all': {'mil': 3.2897072539029453, 'cross': 0.0, 'icp': 0.9363636363636364},
 'only_non_censored': {'mil': 3.2897072539029457,
  'cross': 0.0,
  'icp': 0.9409282700421941}}

# YS-based QR

In [7]:
def analyze_results_from_hpc(thetas_dir):
    """Evaluate the 5% / 95% quantile predictions stored in ``thetas_dir``.

    Reads ``theta_0.05.pkl`` and ``theta_0.95.pkl`` from ``thetas_dir``. From
    element ``[1]`` of each unpickled object it takes ``'ystar_pred'``; the
    ground truth ``'ystar_true'`` is taken from the 0.05 file only.
    NOTE(review): the 0.95 file's own ``'ystar_true'`` is never compared with
    it -- presumably both files describe the same samples; verify.

    Args:
        thetas_dir: directory containing the two pickle files.

    Returns:
        dict with two entries, each the result of ``_mil_icp_cross``:
        ``'all'`` for every sample and ``'only_non_censored'`` for the samples
        with ``y_true > 0``.
    """
    # SECURITY: pickle.load can execute arbitrary code; only point this at
    # result files you produced yourself.
    # `with` closes both files deterministically (the handles used to be
    # leaked by `pickle.load(open(...))`).
    with open(os.path.join(thetas_dir, 'theta_0.05.pkl'), 'rb') as q5_file:
        res_q5 = pickle.load(q5_file)
    with open(os.path.join(thetas_dir, 'theta_0.95.pkl'), 'rb') as q95_file:
        res_q95 = pickle.load(q95_file)

    qpred_5_all = np.array(res_q5[1]['ystar_pred']).flatten()
    qpred_95_all = np.array(res_q95[1]['ystar_pred']).flatten()
    y_true = np.array(res_q5[1]['ystar_true']).flatten()

    # Samples with a strictly positive target form the non-censored subset;
    # compute the mask once and reuse it for all three arrays.
    non_censored = y_true > 0

    return {
        'all': _mil_icp_cross(
            qpred_5=qpred_5_all, qpred_95=qpred_95_all, y_true=y_true),
        'only_non_censored': _mil_icp_cross(
            qpred_5=qpred_5_all[non_censored],
            qpred_95=qpred_95_all[non_censored],
            y_true=y_true[non_censored]),
    }

In [8]:
analyze_results_from_hpc(thetas_dir='./thetas/synth_ds_1/')

{'all': {'mil': 3.1550411658079334, 'cross': 0.0, 'icp': 0.8878787878787879},
 'only_non_censored': {'mil': 3.2352524436217673,
  'cross': 0.0,
  'icp': 0.8851063829787233}}

In [9]:
analyze_results_from_hpc(thetas_dir='./thetas/synth_ds_2/')

{'all': {'mil': 3.7423493673404056,
  'cross': 0.03333333333333333,
  'icp': 0.7787878787878788},
 'only_non_censored': {'mil': 4.480432671176063,
  'cross': 0.0,
  'icp': 0.9066666666666666}}

In [10]:
analyze_results_from_hpc(thetas_dir='./thetas/synth_ds_3/')

{'all': {'mil': 2.643803209495364,
  'cross': 0.006060606060606061,
  'icp': 0.7212121212121212},
 'only_non_censored': {'mil': 3.081279080251098,
  'cross': 0.0,
  'icp': 0.9071729957805907}}