In [1]:
%%javascript
// Raise the notebook's output auto-scroll threshold to 9999 so that long cell
// outputs (e.g. the LaTeX tables printed below) are presumably shown in full
// rather than inside a scrolling box -- NOTE(review): classic-Notebook API, confirm on JupyterLab.
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [2]:
from matplotlib import pyplot
import numpy as np
import pandas
from matplotlib import pyplot as plt
from scipy.stats import rankdata

import sys
sys.path.append('../tasks')
from database_utils import Database

from data import ALL_REGRESSION_DATATSETS, ALL_CLASSIFICATION_DATATSETS
# Single lookup table covering both tasks; on a name clash the classification
# entry wins because it is merged last.
ALL_DATATSETS = {**ALL_REGRESSION_DATATSETS, **ALL_CLASSIFICATION_DATATSETS}

# Alphabetically ordered dataset names; this order becomes the row order of
# the result tables built further down.
regression_datasets = sorted(ALL_REGRESSION_DATATSETS.keys())

classification_datasets = sorted(ALL_CLASSIFICATION_DATATSETS.keys())


In [3]:
def rankarray(A):
    """Rank the entries of every row of ``A`` independently.

    Ranks are computed with ``scipy.stats.rankdata`` (ascending, ties share
    their average rank), so rank 1 is the smallest value in the row.

    NaN entries are treated as missing: they are left out of the ranking and
    their rank is NaN, so a NaN-aware reduction such as ``np.nanmean`` over
    the result ignores them. Calling ``rankdata`` on a row that contains NaN
    does not give that: depending on the SciPy version the NaNs are either
    ranked as if they were the largest values or the whole row becomes NaN.

    Args:
        A: iterable of 1-D rows (e.g. a 2-D array of shape
            ``(n_rows, n_columns)``), convertible to float.

    Returns:
        ``np.ndarray`` with one row of float ranks per row of ``A``. Rows
        without NaN get exactly ``rankdata(row)``.
    """
    ranks = []
    for a in A:
        a = np.asarray(a, dtype=float)
        row_ranks = np.full(a.shape, np.nan)
        present = ~np.isnan(a)
        # Rank only the observed entries; an all-NaN row stays all-NaN.
        if present.any():
            row_ranks[present] = rankdata(a[present])
        ranks.append(row_ranks)
    return np.array(ranks)


def read_regression_classification(fs, models_names, datasets, task,
                                   db_path='../results/results.db'):
    """Collect per-dataset, per-model results into printable tables.

    For every dataset and model the rows returned by
    ``db.read(task, fs, {'model': ..., 'dataset': ...})`` are averaged, one
    average per requested field. One table is built per field.

    Args:
        fs: sequence of result field names to read; the i-th value of each
            returned row is taken to belong to ``fs[i]``.
        models_names: sequence of ``[model, label]`` pairs; ``model`` is used
            in the database query, ``label`` is the table column name.
        datasets: iterable of dataset names (keys of ``ALL_DATATSETS``).
        task: passed to ``db.read`` as its first argument. The value
            ``'classification'`` additionally adds a ``K`` column.
        db_path: path handed to ``Database``; defaults to the location that
            was previously hard-coded.

    Returns:
        ``(results, fields)`` where ``fields`` is the ordered list of column
        names and ``results[f]`` is a dict with

        * ``'table'``: column name -> list of cells. Dataset rows hold the
          dataset name truncated to 10 characters, ``N``, ``D`` (and ``K``),
          then one cell per model: the mean formatted with 3 decimals, ``''``
          when the database returned no rows, or ``'nan'`` when the mean is
          not strictly inside (-1000, 1000). For fields whose name does not
          contain ``'unnormalized'`` three summary rows follow: ``'avg'``,
          ``'median'`` and ``'avg rank'``.
        * ``'vals'``: one list per dataset with the numeric means (NaN for
          missing or out-of-range results), in ``models_names`` order.

        ``'avg rank'`` is the column mean of ``rankarray(vals)``; those ranks
        are ascending, so whether a low or a high rank is better depends on
        the metric.

        If ``datasets`` is empty the tables are returned empty and no summary
        rows are added (previously this raised ``TypeError``).
    """
    # Means at or beyond this magnitude are reported as 'nan' instead of a number.
    max_abs_value = 1000

    is_classification = task == 'classification'
    model_columns = [m[1] for m in models_names]
    if is_classification:
        fields = ['dataset', 'N', 'D', 'K'] + model_columns
    else:
        fields = ['dataset', 'N', 'D'] + model_columns

    results = {}
    for f in fs:
        results[f] = {'table': {column: [] for column in fields}, 'vals': []}

    with Database(db_path) as db:

        for dataset in datasets:

            # The dataset description columns are repeated in every field's table.
            for f in fs:
                table = results[f]['table']
                info = ALL_DATATSETS[dataset]
                table['dataset'].append(dataset[:10])
                table['N'].append(info.N)
                table['D'].append(info.D)
                if is_classification:
                    table['K'].append(info.K)

            row = {f: [] for f in fs}
            for model, name in models_names:

                res = db.read(task, fs, {'model': model, 'dataset': dataset})

                if len(res) == 0:
                    # No stored result: blank cell in the table, NaN in the numbers.
                    for f in fs:
                        results[f]['table'][name].append('')
                        row[f].append(np.nan)
                    continue

                for i, f in enumerate(fs):
                    # Only the mean over the returned rows is reported (a
                    # standard-error variant was prototyped but is disabled).
                    m = np.average([float(l[i]) for l in res])
                    if -max_abs_value < m < max_abs_value:
                        cell = '{:.3f}'.format(m)
                        row[f].append(m)
                    else:
                        # Also reached when m is NaN, since both comparisons fail.
                        cell = 'nan'
                        row[f].append(np.nan)

                    results[f]['table'][name].append(cell)

            for f in fs:
                results[f]['vals'].append(row[f])

    for f in fs:
        if 'unnormalized' in f:
            continue
        if not results[f]['vals']:
            # No dataset rows: the reductions below would return scalars that
            # cannot be zipped with the model columns.
            continue

        vals = np.array(results[f]['vals'])
        summaries = [
            (np.nanmean(vals, 0), 'avg'),
            (np.nanmedian(vals, 0), 'median'),
            (np.nanmean(rankarray(vals), 0), 'avg rank'),
        ]

        table = results[f]['table']
        for stats, label in summaries:
            table['dataset'].append(label)
            table['N'].append('')
            table['D'].append('')
            if is_classification:
                table['K'].append('')
            for value, name in zip(stats, model_columns):
                table[name].append('{:.3f}'.format(value))

    return results, fields


In [4]:
# [model name used in the database query, column label in the tables].
# Commented-out entries are baselines currently left out of the tables.
models_names = [
    ['linear', 'lin'],
    ['variationally_sparse_gp', 'SVGP'],
    ['deep_gp_doubly_stochastic', 'DGP'],
    ['svm', 'svm'],
    ['knn', 'knn'],
    # ['naive_bayes', 'nb'],
    # ['decision_tree', 'dt'],
    # ['random_forest', 'rf'],
    ['gradient_boosting_machine', 'gbm'],
    # ['adaboost', 'ab'],
    ['mlp', 'mlp'],
]

# Regression metrics to tabulate.
fs = (
    'test_loglik',
    'test_rmse',
    'test_loglik_unnormalized',
    'test_rmse_unnormalized',
)

results, fields = read_regression_classification(
    fs, models_names, regression_datasets, 'regression'
)


In [5]:
# Regression test log-likelihood table, emitted as LaTeX source.
print(
    pandas.DataFrame(results['test_loglik']['table'], columns=fields)
    .to_latex(index=False)
)
# Regression test RMSE table, emitted as LaTeX source.
print(
    pandas.DataFrame(results['test_rmse']['table'], columns=fields)
    .to_latex(index=False)
)

\begin{tabular}{llllllllll}
\toprule
   dataset &      N &   D &     lin &    SVGP &     DGP &     svm &     knn &     gbm &     mlp \\
\midrule
    boston &    506 &  13 &  -0.545 &  -0.089 &  -0.160 &  -0.107 &  -0.296 &  -0.516 &  -0.123 \\
  concrete &   1030 &   8 &  -0.965 &  -0.387 &  -0.438 &  -0.716 &  -0.875 &  -0.421 &  -0.636 \\
    energy &    768 &   8 &  -0.074 &   1.205 &   1.216 &   0.135 &   0.218 &   1.746 &   0.071 \\
    kin8nm &   8192 &   8 &  -1.118 &  -0.355 &  -0.079 &  -0.245 &  -0.653 &  -1.002 &  -0.265 \\
     naval &  11934 &  12 &  -0.454 &   2.214 &   1.476 &   0.048 &   0.870 &  -0.092 &   1.283 \\
     power &   9568 &   4 &  -0.140 &  -0.029 &  -0.109 &  -0.038 &  -0.057 &  -0.027 &  -0.059 \\
   protein &  45730 &   9 &  -1.248 &  -1.107 &  -1.612 &  -1.131 &  -1.009 &  -1.149 &  -1.087 \\
   winered &   1599 &  11 &  -1.134 &  -1.071 &  -1.083 &  -1.066 &  -1.206 &  -1.077 &  -1.095 \\
 winewhite &   4898 &  12 &  -1.215 &  -1.136 &         &  -1.1

In [6]:
# Classification metrics to tabulate; note that `fs`, `results` and `fields`
# are rebound here.
fs = ['test_loglik', 'test_acc']
results, fields = read_regression_classification(
    fs=fs,
    models_names=models_names,
    datasets=classification_datasets,
    task='classification',
)


In [7]:
# Classification test log-likelihood table, emitted as LaTeX source.
print(
    pandas.DataFrame(results['test_loglik']['table'], columns=fields)
    .to_latex(index=False)
)

\begin{tabular}{lllllllllll}
\toprule
    dataset &       N &    D &    K &     lin &    SVGP &     DGP &     svm &     knn &     gbm &     mlp \\
\midrule
    abalone &    4177 &    9 &    3 &  -0.754 &         &         &  -0.730 &  -2.169 &  -0.654 &  -0.692 \\
 acute-infl &     120 &    7 &    2 &  -0.069 &  -0.009 &  -0.119 &  -0.019 &  -0.000 &  -0.000 &  -0.033 \\
 acute-neph &     120 &    7 &    2 &  -0.032 &  -0.005 &  -0.084 &  -0.020 &  -0.000 &  -0.000 &  -0.020 \\
      adult &   48842 &   15 &    2 &  -0.348 &  -0.321 &         &  -0.363 &  -1.109 &  -0.295 &  -0.318 \\
  annealing &     898 &   32 &    5 &  -0.479 &  -1.645 &         &  -0.509 &  -1.197 &  -0.134 &  -0.402 \\
 arrhythmia &     452 &  263 &   13 &  -1.369 &  -1.389 &         &  -1.213 &  -7.528 &  -1.612 &  -1.256 \\
 audiology- &     196 &   60 &   18 &  -1.181 &  -1.073 &  -1.218 &  -1.167 &  -2.101 &  -0.849 &  -0.753 \\
 balance-sc &     625 &    5 &    3 &  -0.178 &  -0.035 &  -0.057 &  -0.176 &  -0

In [8]:
# Classification test accuracy table, emitted as LaTeX source.
print(
    pandas.DataFrame(results['test_acc']['table'], columns=fields)
    .to_latex(index=False)
)


\begin{tabular}{lllllllllll}
\toprule
    dataset &       N &    D &    K &    lin &   SVGP &    DGP &    svm &    knn &    gbm &    mlp \\
\midrule
    abalone &    4177 &    9 &    3 &  0.627 &        &        &  0.667 &  0.624 &  0.696 &  0.660 \\
 acute-infl &     120 &    7 &    2 &  1.000 &  1.000 &  1.000 &  1.000 &  1.000 &  1.000 &  1.000 \\
 acute-neph &     120 &    7 &    2 &  1.000 &  1.000 &  1.000 &  1.000 &  1.000 &  1.000 &  1.000 \\
      adult &   48842 &   15 &    2 &  0.837 &  0.844 &        &  0.847 &  0.828 &  0.866 &  0.851 \\
  annealing &     898 &   32 &    5 &  0.767 &  0.800 &        &  0.789 &  0.811 &  0.944 &  0.778 \\
 arrhythmia &     452 &  263 &   13 &  0.609 &  0.696 &        &  0.652 &  0.609 &  0.761 &  0.717 \\
 audiology- &     196 &   60 &   18 &  0.750 &  0.700 &  0.700 &  0.650 &  0.550 &  0.850 &  0.750 \\
 balance-sc &     625 &    5 &    3 &  0.952 &  0.984 &  0.968 &  0.952 &  0.857 &  0.905 &  1.000 \\
   balloons &      16 &    5 &    2

In [9]:
# fields = ['dataset', 'N', 'D']

                
# colours = ['C{}'.format(i) for i in range(10)]

# fields = fields + [m[1] for m in models_names]
# results = {f:[] for f in fields}


# for dataset in regression_datasets:
    
#     fig, axs = plt.subplots(1, 2, figsize=(10, 5))

#     results['dataset'].append(dataset)
#     results['N'].append(ALL_REGRESSION_DATATSETS[dataset].N)
#     results['D'].append(ALL_REGRESSION_DATATSETS[dataset].D)

#     for (model, name), c in zip(models_names, colours):
#         with Database('../results/results.db') as db:
#             d = {'model':model, 'dataset':dataset}

#             res = db.read('active_learning_continuous', ['total_loglik', 'total_rmse'], d) 
#         if len(res)>0:
#             test_ll = res[0][0]
#             test_acc = res[0][1]

#             axs[0].plot(test_ll, label=model, color=c)
#             axs[1].plot(test_acc, label=model, color=c)
#     axs[0].set_ylim(-10, 10)
#     plt.title('{} {} {}'.format(dataset,
#                                    ALL_REGRESSION_DATATSETS[dataset].N,
#                                    ALL_REGRESSION_DATATSETS[dataset].D))
#     plt.legend()
#     plt.show()


In [10]:

# fields = ['dataset', 'N', 'D', 'K']

# models_names = [['linear', 'lin'],
#                 ['variationally_sparse_gp', 'SVGP'],
#                 ['deep_gp_doubly_stochastic','DGP'],
#                 ['svm', 'svm'],
#                 ['knn', 'knn'],
#                 ['naive_bayes', 'nb'],
#                 ['decision_tree', 'dt'],
#                 ['random_forest', 'rf'],
#                 ['gradient_boosting_machine', 'gbm'],
#                 ['adaboost', 'ab'],
#                 ['mlp', 'mlp'],
#                 ]
                
# colours = ['C{}'.format(i) for i in range(10)]

# fields = fields + [m[1] for m in models_names]
# results = {f:[] for f in fields}


# for dataset in classification_datasets[:4]:  # don't show them all...
    
#     fig, axs = plt.subplots(1, 2, figsize=(10, 5))

#     results['dataset'].append(dataset)
#     results['N'].append(ALL_CLASSIFICATION_DATATSETS[dataset].N)
#     results['D'].append(ALL_CLASSIFICATION_DATATSETS[dataset].D)
#     results['K'].append(ALL_CLASSIFICATION_DATATSETS[dataset].K)

#     for (model, name), c in zip(models_names, colours):
#         with Database('../results/results.db') as db:
#             d = {'model':model, 'dataset':dataset}

#             res = db.read('active_learning_discrete', ['test_loglik', 'total_acc'], d) 
#         if len(res)>0:
#             test_ll = res[0][0]
#             test_acc = res[0][1]

#             axs[0].plot(test_ll, label=model, color=c)
#             axs[1].plot(test_acc, label=model, color=c)

#     plt.title('{} {} {} {}'.format(dataset,
#                                    ALL_CLASSIFICATION_DATATSETS[dataset].N,
#                                    ALL_CLASSIFICATION_DATATSETS[dataset].D,
#                                    ALL_CLASSIFICATION_DATATSETS[dataset].K))
#     plt.legend()
#     plt.show()