In [1]:
import time
import datetime
from collections import OrderedDict
import pandas as pd
import numpy as np
import scipy
from xgboost import XGBRegressor
import sklearn
from sklearn.model_selection import ShuffleSplit, KFold, RepeatedKFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.datasets import load_boston
from sklearn.base import BaseEstimator, RegressorMixin

In [2]:
datasets = OrderedDict()
results = OrderedDict()

In [3]:
datasets['boston'] = (load_boston()['data'], load_boston()['target'])

In [4]:
df = pd.read_csv('datasets/concrete_data.csv')
datasets['concrete'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [5]:
df = pd.read_csv('datasets/energy_efficiency.csv')
datasets['energy'] =  (df.iloc[:, :-2], df.iloc[:, -2])

In [6]:
df = pd.read_csv('datasets/kin8nm.csv')
datasets['kin8nm'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [7]:
df = pd.read_table('datasets/naval.txt', sep='\s+', header=None)
datasets['naval'] =  (df.iloc[:, :-2], df.iloc[:, -2])

In [8]:
df = pd.read_csv('datasets/power.csv')
datasets['power'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [9]:
df = pd.read_csv('datasets/protein.csv')
datasets['protein'] =  (df.iloc[:, 1:], df.iloc[:, 0])

In [10]:
df = pd.read_csv('datasets/wine.csv', sep=';')
datasets['wine'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [11]:
df = pd.read_table('datasets/yacht.txt', sep='\s+', header=None)
datasets['yacht'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [12]:
df = pd.read_table('datasets/year.txt', sep=',', header=None)
datasets['year'] =  (df.iloc[:, 1:], df.iloc[:, 0])

In [13]:
def ll(y, y_pred):
    mean, log_var = y_pred
    return scipy.stats.norm.logpdf(y, loc = mean, scale = np.exp(log_var/2)).mean()

ll_scorer = make_scorer(ll)

In [None]:
def ll_objective(y_true, y_pred):
    err = y_true
    log_var = y_pred
    grad = 1/(2*np.exp(log_var)) - 1/(2*np.exp(2*log_var))*(err**2)
    hess = -1/(2*np.exp(2*log_var)) + 2/(2*np.exp(3*log_var))*(err**2)
    return grad, hess

class XGBLogLikelihood(BaseEstimator, RegressorMixin):  
    
    def __init__(self, n_estimators=100):
        self.n_estimators = n_estimators

    def fit(self, X, y):
        self.xgb_mean = XGBRegressor(n_estimators=self.n_estimators)
        self.xgb_var = XGBRegressor(n_estimators=self.n_estimators, objective=ll_objective)
        self.xgb_mean.fit(X, y)
        mean = self.xgb_mean.predict(X)
        self.xgb_var.fit(X, y-mean)
        return self

    def predict(self, X, y=None):
        pred_mean = self.xgb_mean.predict(X)
        pred_var = self.xgb_var.predict(X)
        return pred_mean, pred_var

In [None]:
for d, (X,y) in datasets.items():
    reg = XGBRegressor(n_estimators=300)
    if d == 'year':
        cv = ShuffleSplit(3, test_size=0.1)
    elif d == 'protein':
        cv = KFold(n_splits=10)
    else:
        cv = RepeatedKFold(n_splits=10, n_repeats=4)

    rmse = np.sqrt(cross_val_score(reg, X, y, cv=cv, scoring=make_scorer(mean_squared_error)))

    ll = cross_val_score(XGBLogLikelihood(n_estimators=300), X, y, cv=cv, scoring=ll_scorer)
    
    results[d] = (datetime.datetime.now(), d, X.shape[0], X.shape[1],
                  rmse.mean(), scipy.stats.sem(rmse), ll.mean(), scipy.stats.sem(ll))
    
    print(*results[d])

2018-02-01 14:39:37.089934 boston 506 13 2.914846666181382 0.10458028706401429 -4.83869219344313 0.40332427532355103
2018-02-01 14:39:48.647246 concrete 1030 8 4.30891380114884 0.08532772668959696 -2.89459374503507 0.019764341387791703
2018-02-01 14:39:55.843361 energy 768 8 0.39438253219839836 0.00914544407567054 -1.2693399524903182 0.007587203896492379
2018-02-01 14:41:33.970550 kin8nm 8192 8 0.1633382681779371 0.0008638650162281974 -1.1769896145849943 7.333850022723467e-05
2018-02-01 14:44:17.551006 naval 11934 16 0.0018171791996022869 9.349778378912723e-06 -1.1689395560100184 1.126933218620689e-08
2018-02-01 14:45:19.749663 power 9568 4 3.552468877201535 0.036498881521130455 -3.0497946011488706 0.02160275544761731


In [17]:
for i, j in results.items():
    res = ['{:.2f}'.format(x) for x in j[-4:]]
    print(i, 'rmse', ' & ' + ' \\tpm '.join(res[0:2]))
    print(i, 'll', ' & ' + ' \\tpm '.join(res[2:]))

boston rmse  & 2.91 \tpm 0.10
boston ll  & -4.84 \tpm 0.40
concrete rmse  & 4.31 \tpm 0.09
concrete ll  & -2.89 \tpm 0.02
energy rmse  & 0.39 \tpm 0.01
energy ll  & -1.27 \tpm 0.01
kin8nm rmse  & 0.16 \tpm 0.00
kin8nm ll  & -1.18 \tpm 0.00
naval rmse  & 0.00 \tpm 0.00
naval ll  & -1.17 \tpm 0.00
power rmse  & 3.55 \tpm 0.04
power ll  & -3.05 \tpm 0.02
protein rmse  & 4.37 \tpm 0.02
protein ll  & -3.15 \tpm 0.00
wine rmse  & 0.61 \tpm 0.01
wine ll  & -1.11 \tpm 0.00
yacht rmse  & 0.53 \tpm 0.03
yacht ll  & -1.29 \tpm 0.01
year rmse  & 9.09 \tpm 0.07
year ll  & -4.68 \tpm 0.00
