In [91]:
from joblib import Parallel, delayed
import numpy as np
import scipy
import pickle
import json
from itertools import product

from datasets import make_datasets
from metrics import Results, normal_nll, rmse, mae, auc_rmse, auc_mae
from shallow_models import LinearRegression, BayesianLinearRegression, RFBaseline, RFUncertainty, GBTQuantile, XGBaseline, XGBLogLikelihood


import numpy as np
from scipy import stats, optimize
from collections import namedtuple

from sklearn.model_selection import cross_validate, ShuffleSplit, KFold, RepeatedKFold
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from datetime import datetime

from bokeh.io import output_notebook, show
from bokeh.plotting import Figure
from bokeh.palettes import Set3, Paired
from bokeh.models import Span

# Configure Bokeh to render figures inline in the notebook, so `show(...)` displays below the calling cell.
output_notebook()

In [84]:
# Instantiate one XGBLogLikelihood model for interactive use.
# NOTE(review): no later visible cell reads this instance, and the plotting loop
# rebinds the name `model` to a model class — confirm this cell is still needed.
model = XGBLogLikelihood()

In [85]:
# Build the benchmark datasets as a mapping from dataset name to its data; the
# plotting cell unpacks each entry as an (inputs, targets) pair.
# NOTE(review): `year=False` presumably leaves out a "year" dataset — confirm in `make_datasets`.
datasets = make_datasets(year=False)

In [86]:
def normal_nll_opt(actual, pred, std):
    """Return the mean normal negative log-likelihood after optimal recalibration.

    The residuals ``actual - pred`` are scored under a normal distribution whose
    location is a fitted constant offset ``x[0]`` and whose scale is the
    predicted ``std`` multiplied by a fitted constant factor ``x[1]``. Both
    constants are found with L-BFGS-B (numerical gradients), starting from
    "no recalibration" (offset 0, factor 1).

    Parameters
    ----------
    actual : array-like
        Observed target values.
    pred : array-like
        Predicted means, same length as ``actual``.
    std : array-like
        Predicted standard deviations, same length as ``actual``. Values at or
        below 1e-30 are clamped to 1e-30 on an internal copy; the caller's
        object is never modified.

    Returns
    -------
    float
        The minimized mean negative log-likelihood.
    """
    error = np.asarray(actual, dtype=float) - np.asarray(pred, dtype=float)
    # np.maximum allocates a new float array, so the caller's `std` is left
    # untouched and lists / integer arrays are handled as well.
    std = np.maximum(np.asarray(std, dtype=float), 1e-30)

    def mean_nll(x):
        # x[0]: additive bias correction, x[1]: multiplicative std correction.
        return -stats.norm.logpdf(error, loc=x[0], scale=x[1] * std).mean()

    _, f, _ = optimize.fmin_l_bfgs_b(
        mean_nll,
        np.array([0.0, 1.0]),
        bounds=[(None, None), (0, None)],
        approx_grad=True,
    )
    return f


def auc_rmse_norm(actual, pred, std):
    """Area under the normalized RMSE curve as the most uncertain samples are dropped.

    Samples are ordered by descending ``std`` (ties broken by descending
    error). For each ``i`` the RMSE of the samples remaining after removing
    the ``i`` highest-``std`` ones is divided by the RMSE of the full set
    (as computed by ``rmse``). The resulting curve is integrated with the
    trapezoidal rule over the removed fraction ``i / (n - 1)``.

    Parameters
    ----------
    actual : array-like
        Observed target values.
    pred : array-like
        Predicted means, same length as ``actual``.
    std : array-like
        Predicted uncertainties used only for ranking the samples.

    Returns
    -------
    float
        The area under the normalized RMSE curve.
    """
    base_rmse = rmse(actual, pred)
    error = np.array(actual) - np.array(pred)
    ordered = sorted(zip(std, error), reverse=True)
    n = len(ordered)

    # Extract the ordered squared errors once instead of re-unzipping the
    # remaining tail on every iteration.
    sq_err = np.array([err for _, err in ordered], dtype=float) ** 2
    # suffix_sums[i] == sq_err[i:].sum(); remaining[i] == number of samples left.
    suffix_sums = np.cumsum(sq_err[::-1])[::-1]
    remaining = np.arange(n, 0, -1)
    rmses = np.sqrt(suffix_sums / remaining) / base_rmse

    # NumPy 2.0 renamed `trapz` to `trapezoid`; use whichever is available.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
    return integrate(y=rmses, x=np.arange(n) / (n - 1))


def eval_dataset_model(d, X, y, model):
    """Fit ``model`` on standardized copies of ``X`` and ``y``.

    Parameters
    ----------
    d : object
        Not used by this function; kept for call-site compatibility.
    X : array-like
        Feature matrix passed to a ``StandardScaler`` and then to the model.
    y : numpy.ndarray
        Targets; reshaped to a single column for scaling and flattened again
        before fitting.
    model : callable
        Zero-argument factory (e.g. a class) returning an object with ``fit``.

    Returns
    -------
    tuple
        ``(fitted_model, fitted_feature_scaler, fitted_target_scaler)``.
    """
    reg = model()

    # Standardize the features.
    scaler_X = StandardScaler().fit(X)
    features_scaled = scaler_X.transform(X)

    # Standardize the targets; the scaler needs a 2-D column, the model a 1-D vector.
    target_column = y.reshape(-1, 1)
    scaler_y = StandardScaler().fit(target_column)
    targets_scaled = np.ravel(scaler_y.transform(target_column))

    reg.fit(features_scaled, targets_scaled)
    return reg, scaler_X, scaler_y

In [92]:
# Show the last target value (the sample held out by the plotting cell).
# NOTE(review): `y` is not assigned in any earlier visible cell — it is first set in
# the plotting cell further down — so this relies on leftover kernel state.
y[-1]

11.9

In [98]:
# List the available dataset names; any of them can be assigned to `ds` in the plotting cell.
datasets.keys()

odict_keys(['boston', 'concrete', 'energy', 'kin8nm', 'naval', 'power', 'protein', 'wine', 'yacht', 'make_regression', 'make_friedman1', 'make_friedman2', 'make_friedman3'])

In [109]:
# Fit every model on all but the last sample of one dataset, then plot each
# model's predictive normal density for that held-out sample, with the true
# target value marked by a vertical line.
ds = 'kin8nm'
x, y = datasets[ds]
# Unwrap pandas containers to plain arrays; each of x and y is handled on its
# own so an array/pandas mix is converted correctly.
x = getattr(x, 'values', x)
y = getattr(y, 'values', y)

models = [
    LinearRegression, 
    BayesianLinearRegression, 
    RFBaseline, 
    RFUncertainty, 
    GBTQuantile, 
    XGBaseline, 
    XGBLogLikelihood
]

f = Figure(title=ds)

for i, model in enumerate(models):
    # Train on everything except the last row; scalers are fitted on the same rows.
    reg, sx, sy = eval_dataset_model(ds, x[:-1], y[:-1], model)

    # NOTE(review): assumes `predict` returns (mean, std) in standardized target units — confirm.
    pred = reg.predict(sx.transform(x[-1:]))
    # Map the mean back to original units; the scaler expects a 2-D column.
    mu = np.ravel(sy.inverse_transform(np.reshape(pred[0], (-1, 1))))[0]
    # A standard deviation is only rescaled, never shifted: inverse_transform
    # would wrongly add the target mean to it.
    sigma = np.ravel(pred[1])[0] * sy.scale_[0]

    xx = np.linspace(mu-3*sigma, mu+3*sigma, 1000)
    pdf = stats.norm.pdf(xx, loc=mu, scale=sigma)
    # NOTE(review): newer Bokeh releases spell this keyword `legend_label=` — confirm against the installed version.
    f.line(xx, pdf, legend=model.__name__, line_width=2, color=Paired[10][i])

# Vertical marker at the true value of the held-out sample.
vline = Span(location=y[-1], dimension='height', line_color='black', line_width=2)
f.renderers.append(vline)
f.xaxis.axis_label = 'Value'
f.yaxis.axis_label = 'PDF'
show(f)