In [1]:
import time
import datetime
from collections import OrderedDict
import pandas as pd
import numpy as np
import scipy
from scipy.stats import norm
from xgboost import XGBRegressor
import sklearn
from sklearn.model_selection import ShuffleSplit, KFold, RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer, mean_squared_error
from sklearn.datasets import load_boston
from sklearn.base import BaseEstimator, RegressorMixin

In [2]:
N_JOBS = 16
datasets = OrderedDict()
results = OrderedDict()
results2 = OrderedDict()
results_gs = OrderedDict()
results_gs2 = OrderedDict()
results_gs3 = OrderedDict()

In [3]:
datasets['boston'] = (load_boston()['data'], load_boston()['target'])

In [4]:
df = pd.read_csv('datasets/concrete_data.csv')
datasets['concrete'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [5]:
df = pd.read_csv('datasets/energy_efficiency.csv')
datasets['energy'] =  (df.iloc[:, :-2], df.iloc[:, -2])

In [6]:
df = pd.read_csv('datasets/kin8nm.csv')
datasets['kin8nm'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [7]:
df = pd.read_table('datasets/naval.txt', sep='\s+', header=None)
datasets['naval'] =  (df.iloc[:, :-2], df.iloc[:, -2])

In [8]:
df = pd.read_csv('datasets/power.csv')
datasets['power'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [9]:
df = pd.read_csv('datasets/protein.csv')
datasets['protein'] =  (df.iloc[:, 1:], df.iloc[:, 0])

In [10]:
df = pd.read_csv('datasets/wine.csv', sep=';')
datasets['wine'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [11]:
df = pd.read_table('datasets/yacht.txt', sep='\s+', header=None)
datasets['yacht'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [23]:
df = pd.read_table('datasets/year.txt', sep=',', header=None)
datasets['year'] =  (df.iloc[:, 1:], df.iloc[:, 0])

In [21]:
def normal_ll(y, y_pred):
    mean, log_var = y_pred
    return scipy.stats.norm.logpdf(y, loc = mean, scale = np.exp(log_var/2)).mean()

ll_scorer = make_scorer(normal_ll, greater_is_better=True)

class BaselineLL(BaseEstimator, RegressorMixin):      
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, subsample=1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.subsample = subsample

    def fit(self, X, y):
        self.xgb_mean = XGBRegressor(n_estimators=self.n_estimators, learning_rate=self.learning_rate, max_depth=self.max_depth, subsample=self.subsample)
        self.xgb_mean.fit(X, y)
        mean = self.xgb_mean.predict(X)
        self.mu, self.std = norm.fit(y-mean)
        return self

    def predict(self, X, y=None):
        pred_mean = self.xgb_mean.predict(X)#-self.mu
        #print(np.mean(self.mu), np.mean(self.mu/pred_mean))
        pred_var = np.sqrt(self.std)
        return pred_mean, pred_var

In [None]:
for d, (X,y) in reversed(datasets.items()):
    reg = XGBRegressor(n_estimators=300)
    if d == 'year':
        cv = ShuffleSplit(3, test_size=0.1)
    elif d == 'protein':
        cv = KFold(n_splits=10)
    else:
        cv = RepeatedKFold(n_splits=10, n_repeats=4)

    rmse = np.sqrt(cross_val_score(reg, X, y, cv=cv, scoring=make_scorer(mean_squared_error), n_jobs=N_JOBS))
    
    ll = cross_val_score(BaselineLL(n_estimators=300), X, y, cv=cv, scoring=ll_scorer, n_jobs=N_JOBS)
    
    results[d] = (datetime.datetime.now(), d, X.shape[0], X.shape[1],
                  rmse.mean(), scipy.stats.sem(rmse),
                  ll.mean(), scipy.stats.sem(ll))
    
    print(*results[d])

In [16]:
for i, j in results.items():
    res = ['{:.2f}'.format(x) for x in j[-4:]]
    print(i, 'rmse', ' & ' + ' \\tpm '.join(res[0:2]))
    print(i, 'll', ' & ' + ' \\tpm '.join(res[2:]))

boston rmse  & 2.94 \tpm 0.09
boston ll  & -3.33 \tpm 0.18
concrete rmse  & 4.32 \tpm 0.11
concrete ll  & -3.56 \tpm 0.07
energy rmse  & 0.40 \tpm 0.01
energy ll  & -1.25 \tpm 0.00
kin8nm rmse  & 0.16 \tpm 0.00
kin8nm ll  & -1.12 \tpm 0.00
naval rmse  & 0.00 \tpm 0.00
naval ll  & -0.94 \tpm 0.00
power rmse  & 3.55 \tpm 0.03
power ll  & -2.87 \tpm 0.02
protein rmse  & 4.37 \tpm 0.02
protein ll  & -3.17 \tpm 0.01
wine rmse  & 0.61 \tpm 0.01
wine ll  & -1.33 \tpm 0.00
yacht rmse  & 0.57 \tpm 0.04
yacht ll  & -1.25 \tpm 0.02


In [27]:
results

OrderedDict([('boston',
              (datetime.datetime(2018, 2, 26, 19, 21, 1, 777986),
               'boston',
               506,
               13,
               2.9573087801970432,
               0.11340998303447675,
               -3.2152817983060187,
               0.12498107408641214)),
             ('concrete',
              (datetime.datetime(2018, 2, 26, 19, 20, 59, 430078),
               'concrete',
               1030,
               8,
               4.299174794347391,
               0.09713329307022893,
               -3.58624786692869,
               0.07407400993637028)),
             ('energy',
              (datetime.datetime(2018, 2, 26, 19, 20, 57, 16869),
               'energy',
               768,
               8,
               0.392606850415966,
               0.010035053120325304,
               -1.246975549532089,
               0.0017984015710252798)),
             ('kin8nm',
              (datetime.datetime(2018, 2, 26, 19, 20, 55, 75123),
           

In [28]:
results

OrderedDict([('boston',
              (datetime.datetime(2018, 2, 26, 19, 21, 1, 777986),
               'boston',
               506,
               13,
               2.9573087801970432,
               0.11340998303447675,
               -3.2152817983060187,
               0.12498107408641214)),
             ('concrete',
              (datetime.datetime(2018, 2, 26, 19, 20, 59, 430078),
               'concrete',
               1030,
               8,
               4.299174794347391,
               0.09713329307022893,
               -3.58624786692869,
               0.07407400993637028)),
             ('energy',
              (datetime.datetime(2018, 2, 26, 19, 20, 57, 16869),
               'energy',
               768,
               8,
               0.392606850415966,
               0.010035053120325304,
               -1.246975549532089,
               0.0017984015710252798)),
             ('kin8nm',
              (datetime.datetime(2018, 2, 26, 19, 20, 55, 75123),
           