In [68]:
import time
import datetime
from collections import OrderedDict
import pandas as pd
import numpy as np
import scipy
from xgboost import XGBRegressor
import sklearn
from sklearn.model_selection import ShuffleSplit, KFold, RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.datasets import load_boston
from sklearn.base import BaseEstimator, RegressorMixin

In [69]:
datasets = OrderedDict()
results = OrderedDict()
results_gs = OrderedDict()

In [71]:
datasets['boston'] = (load_boston()['data'], load_boston()['target'])

In [72]:
df = pd.read_csv('datasets/concrete_data.csv')
datasets['concrete'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [73]:
df = pd.read_csv('datasets/energy_efficiency.csv')
datasets['energy'] =  (df.iloc[:, :-2], df.iloc[:, -2])

In [7]:
df = pd.read_csv('datasets/kin8nm.csv')
datasets['kin8nm'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [8]:
df = pd.read_table('datasets/naval.txt', sep='\s+', header=None)
datasets['naval'] =  (df.iloc[:, :-2], df.iloc[:, -2])

In [9]:
df = pd.read_csv('datasets/power.csv')
datasets['power'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [10]:
df = pd.read_csv('datasets/protein.csv')
datasets['protein'] =  (df.iloc[:, 1:], df.iloc[:, 0])

In [11]:
df = pd.read_csv('datasets/wine.csv', sep=';')
datasets['wine'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [12]:
df = pd.read_table('datasets/yacht.txt', sep='\s+', header=None)
datasets['yacht'] =  (df.iloc[:, :-1], df.iloc[:, -1])

In [13]:
#df = pd.read_table('datasets/year.txt', sep=',', header=None)
#datasets['year'] =  (df.iloc[:, 1:], df.iloc[:, 0])

In [43]:
def ll(y, y_pred):
    mean, log_var = y_pred
    return scipy.stats.norm.logpdf(y, loc = mean, scale = np.exp(log_var/2)).mean()

ll_scorer = make_scorer(ll, greater_is_better=True)

In [81]:
def ll_objective(y_true, y_pred):
    err = y_true
    log_var = y_pred
    #grad = 1/(2*np.exp(log_var)) - 1/(2*np.exp(2*log_var))*(err**2)
    grad = -1/(2*np.exp(log_var))*(1 - 1/np.exp(log_var)*(err**2))
    #hess = -1/(2*np.exp(2*log_var)) + 2/(2*np.exp(3*log_var))*(err**2)
    hess = 1/np.exp(1.5*log_var) - 2/np.exp(2.5*log_var)*(err**2)
    return grad, hess


class XGBLogLikelihood(BaseEstimator, RegressorMixin):  
    
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth

    def fit(self, X, y):
        self.xgb_mean = XGBRegressor(n_estimators=self.n_estimators, learning_rate=self.learning_rate, max_depth=self.max_depth)
        self.xgb_var = XGBRegressor(objective=ll_objective, n_estimators=self.n_estimators, learning_rate=self.learning_rate, max_depth=self.max_depth)
        self.xgb_mean.fit(X, y)
        mean = self.xgb_mean.predict(X)
        self.xgb_var.fit(X, y-mean)
        return self

    def predict(self, X, y=None):
        pred_mean = self.xgb_mean.predict(X)
        pred_var = self.xgb_var.predict(X)
        return pred_mean, pred_var

In [82]:
for d, (X,y) in datasets.items():
    reg = XGBRegressor(n_estimators=300)
    if d == 'year':
        cv = ShuffleSplit(3, test_size=0.1)
    elif d == 'protein':
        cv = KFold(n_splits=10)
    else:
        cv = RepeatedKFold(n_splits=10, n_repeats=4)

    rmse = np.sqrt(cross_val_score(reg, X, y, cv=cv, scoring=make_scorer(mean_squared_error)))
    
    ll = cross_val_score(XGBLogLikelihood(n_estimators=300), X, y, cv=cv, scoring=ll_scorer)
    
    results[d] = (datetime.datetime.now(), d, X.shape[0], X.shape[1],
                  rmse.mean(), scipy.stats.sem(rmse),
                  ll.mean(), scipy.stats.sem(ll))
    
    print(*results[d])

2018-02-19 02:27:06.555965 boston 506 13 2.922818975249791 0.08546700902214542 -3.052459720702976 0.007266909107266683
2018-02-19 02:27:23.557018 concrete 1030 8 4.292375906237041 0.09674479838640206 -3.2988783696115584 0.004579172070292657
2018-02-19 02:27:34.790497 energy 768 8 0.3949329765520758 0.009867768202915734 -3.113318796551477 0.00041282118616941496


In [None]:
for i, j in results.items():
    res = ['{:.2f}'.format(x) for x in j[-4:]]
    print(i, 'rmse', ' & ' + ' \\tpm '.join(res[0:2]))
    print(i, 'll', ' & ' + ' \\tpm '.join(res[2:]))

In [74]:
parameters = {'max_depth': [2,3,4], 
              'n_estimators':[100, 250, 500], 
              'learning_rate':[0.01, 0.1, 0.25]}

for d, (X,y) in datasets.items():
    if d == 'year':
        cv = ShuffleSplit(3, test_size=0.1)
    elif d == 'protein':
        cv = KFold(n_splits=10)
    else:
        cv = RepeatedKFold(n_splits=10, n_repeats=4)
    
    clf = GridSearchCV(XGBLogLikelihood(), parameters, scoring=ll_scorer, cv=cv)
    clf.fit(X, y)
        
    results_gs[d] = (datetime.datetime.now(), clf.best_score_, clf.best_params_)
    
    print(*results_gs[d])

KeyboardInterrupt: 