In [1]:
%load_ext autoreload
%matplotlib inline

In [88]:
from sklearn.model_selection import cross_validate, ShuffleSplit, KFold, RepeatedKFold
from sklearn.metrics import make_scorer
from datetime import datetime
from joblib import Parallel, delayed

In [72]:
%autoreload 2
from datasets import make_datasets

In [73]:
datasets = make_datasets(year=False)

In [74]:
%autoreload 2
from metrics import normal_nll, rmse, mae, auc_rmse, auc_mae

In [75]:
%autoreload 2
from shallow_models import LinearRegression, BayesianLinearRegression, GBTQuantile, XGBaseline, XGBLogLikelihood
models = LinearRegression, BayesianLinearRegression, GBTQuantile, XGBaseline, XGBLogLikelihood

In [86]:
results = []

Results = namedtuple('Results', 'datetime dataset model shape normal_nll rmse mae auc_rmse auc_mae')

for d, (X,y) in datasets.items():
    try:
        X = X.values
        y = y.values
    except AttributeError:
        pass
    
    if d == 'year':
        cv = ShuffleSplit(1, test_size=0.1)
    elif d == 'protein':
        cv = KFold(n_splits=3)
    elif d.startswith('make'):
        cv = KFold(n_splits=2)
    else:
        cv = RepeatedKFold(n_splits=5, n_repeats=1)
    
    for m in models:
        reg = m()
        cv_metrics = []
        for train_index, test_index in cv.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            reg.fit(X, y)
            pred_mean, pred_std = reg.predict(X)
            cv_metrics.append((
                normal_nll(y, pred_mean, pred_std),
                rmse(y, pred_mean),
                mae(y, pred_mean),
                auc_rmse(y, pred_mean, pred_std),
                auc_mae(y, pred_mean, pred_std)))

        metrics_mean = np.mean(cv_metrics, axis = 1)
        metrics_stderr = scipy.stats.sem(cv_metrics, axis = 1)
        
        r = Results(
                str(datetime.now()),
                d, 
                m.__name__,
                X.shape,
                *zip(metrics_mean, metrics_stderr)
            )
        results.append(r)
        print(r)

Results(datetime='2018-02-28 21:12:46.340390', dataset='boston', model='LinearRegression', shape=(506, 13), normal_nll=(3.869981139822862, 0.33501465337831454), rmse=(3.869981139822862, 0.33501465337831454), mae=(3.869981139822862, 0.33501465337831454), auc_rmse=(3.869981139822862, 0.33501465337831454), auc_mae=(3.869981139822862, 0.33501465337831454))
Results(datetime='2018-02-28 21:12:46.796980', dataset='boston', model='BayesianLinearRegression', shape=(506, 13), normal_nll=(3.987720908492345, 0.4216856005732957), rmse=(3.987720908492345, 0.4216856005732957), mae=(3.987720908492345, 0.4216856005732957), auc_rmse=(3.987720908492345, 0.4216856005732957), auc_mae=(3.987720908492345, 0.4216856005732957))
Results(datetime='2018-02-28 21:12:50.424099', dataset='boston', model='GBTQuantile', shape=(506, 13), normal_nll=(1.772707082286443, 0.37978723066883624), rmse=(1.747863360744369, 0.3814361604211815), mae=(1.7753589316199676, 0.3705406199510351), auc_rmse=(1.7606285817674057, 0.3639752

KeyboardInterrupt: 

In [None]:
def eval_dataset_model(d, X, y):
    try:
        X = X.values
        y = y.values
    except AttributeError:
        pass
    
    if d == 'year':
        cv = ShuffleSplit(1, test_size=0.1)
    elif d == 'protein':
        cv = KFold(n_splits=3)
    elif d.startswith('make'):
        cv = KFold(n_splits=2)
    else:
        cv = RepeatedKFold(n_splits=5, n_repeats=1)
    
    for m in models:
        reg = m()
        cv_metrics = []
        for train_index, test_index in cv.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            reg.fit(X, y)
            pred_mean, pred_std = reg.predict(X)
            cv_metrics.append((
                normal_nll(y, pred_mean, pred_std),
                rmse(y, pred_mean),
                mae(y, pred_mean),
                auc_rmse(y, pred_mean, pred_std),
                auc_mae(y, pred_mean, pred_std)))

        metrics_mean = np.mean(cv_metrics, axis = 1)
        metrics_stderr = scipy.stats.sem(cv_metrics, axis = 1)
        
        return Results(
                str(datetime.now()),
                d, 
                m.__name__,
                X.shape,
                *zip(metrics_mean, metrics_stderr)
            )

Parallel(n_jobs=16)(delayed(eval_dataset_model)(d, X, y) for d, (X, y) in datasets.items())