In [129]:
import inspect
import numpy as np
import pandas as pd
from sklearn.tree import *
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import *

from Constants import *
from Utils import *
from Dataset import *

In [130]:
inspect.getmro(DecisionTreeClassifier)

(sklearn.tree.tree.DecisionTreeClassifier,
 sklearn.tree.tree.BaseDecisionTree,
 abc.NewBase,
 sklearn.base.BaseEstimator,
 sklearn.base.ClassifierMixin,
 object)

In [131]:
dataset = Dataset(combined_csv)
df = dataset.load()

In [132]:
def get_data(dataframe, x_lab, y_lab):
    return np.array(dataframe[x_lab]), np.array(dataframe[y_lab])

def learn(model, X, Y, scorers):
    Y_pr = cross_val_predict(model, X, Y, cv=10)
    errs = [scorer(Y, Y_pr, multioutput='raw_values') for scorer in scorers]
#     errs = [scorer(Y, Y_pr, multioutput='uniform_average') for scorer in scorers]
    return Y_pr, errs

In [133]:
# Define Y values to learn
Y_labels = Y_labels_default[1:] # All except 'inc', which is binary
# Y_labels = Y_labels_default
# Y_labels = ['dif', 'nrd', 'skt'] # Sketchability, general, scalar metrics
# Y_labels = ['dif', 'nrd', 'skt', 'vis'] # Visual, general, scalar metrics
# Y_labels = ['vis', 'phy', 'obj'] # Physical, general, scalar metrics

# Define X values to learn from
X_labels = X_labels_default

# Define models to evaluate
models = [
    MultiOutputRegressor(GradientBoostingRegressor()),
]

# Define scoring functions
scoring_funcs = [
    mean_squared_error,
    mean_absolute_error,
    explained_variance_score,
    mean_squared_log_error,
    r2_score
]

# Train Multi-task models on interval_size-sample increments of data, up to the whole dataset
interval_size = 10
ns_samples = np.arange(0, df.shape[0], interval_size)[1:]

# Take ns_eval_samples samples of random x for each increment, to get accurate results at low n
ns_eval_samples = [10] * len(ns_samples)

# Load data
X, Y = get_data(df, X_labels, Y_labels)

results = []
for i in range(len(models)):
    results += [[]]
    for j in range(len(ns_samples)):
        results[-1] += [[]]
        for k in range(ns_eval_samples[j]):
            indices = np.random.choice(X.shape[0], ns_samples[j])
            results[-1][-1] += [ learn(models[i], X[indices], Y[indices], scoring_funcs) ]

In [140]:
for i in range(len(models)):
    print("\t" + str(type(models[i].estimator)).replace('\n', ' '))
    res = [[np.mean([results[i][j][k][1][l]
                          for k in range(ns_eval_samples[j])], axis=0)
                          for j in range(len(ns_samples))]
                          for l in range(len(scoring_funcs))]
    for l in range(len(scoring_funcs)):
          pr(scoring_funcs[l].__name__)
          pr([[float("{:,}".format(round(r_, 3))) for r_ in list(r)] for r in res[l]])
    print()

	<class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>
'mean_squared_error'
[   [6.29, 6.263, 5.959, 4.572, 7.727, 10.636, 6.279],
    [6.405, 6.787, 7.583, 4.75, 7.73, 11.082, 9.405],
    [4.069, 4.526, 4.15, 3.682, 4.866, 7.448, 4.52],
    [3.361, 3.339, 3.511, 2.97, 3.545, 6.163, 3.442],
    [2.435, 2.548, 2.907, 2.385, 3.667, 4.474, 3.108]]
'mean_absolute_error'
[   [1.766, 1.775, 1.836, 1.617, 2.087, 2.528, 1.795],
    [1.642, 1.758, 1.95, 1.456, 1.885, 2.28, 2.122],
    [1.192, 1.342, 1.204, 1.106, 1.279, 1.705, 1.269],
    [0.956, 0.971, 1.015, 0.977, 1.002, 1.415, 1.012],
    [0.771, 0.802, 0.878, 0.809, 0.941, 1.105, 0.881]]
'explained_variance_score'
[   [-0.014, -0.081, 0.015, -0.017, -0.146, -0.159, 0.196],
    [0.006, -0.193, -0.18, -0.088, -0.329, -0.129, -0.15],
    [0.345, 0.134, 0.363, 0.141, 0.163, 0.193, 0.425],
    [0.465, 0.417, 0.453, 0.356, 0.371, 0.365, 0.564],
    [0.554, 0.51, 0.482, 0.457, 0.298, 0.532, 0.585]]
'mean_squared_log_error'
[   [0

In [135]:
for i in range(len(models)):
    print("\t" + str(type(models[i].estimator)).replace('\n', ' '))
    res = [[round(np.mean([np.mean(results[i][j][k][1][l])
                          for k in range(ns_eval_samples[j])]), 3)
                          for j in range(len(ns_samples))]
                          for l in range(len(scoring_funcs))]
    for l in range(len(scoring_funcs)):
        pr(scoring_funcs[l].__name__ + ": " + \
          "[ " + ''.join(["{:,}".format(float(r)) + ", " for r in res[l]])[:-2] + " ]")
    print()

	<class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>
'mean_squared_error: [ 6.818, 7.678, 4.752, 3.761, 3.075 ]'
'mean_absolute_error: [ 1.915, 1.87, 1.3, 1.05, 0.884 ]'
'explained_variance_score: [ -0.029, -0.152, 0.252, 0.427, 0.488 ]'
'mean_squared_log_error: [ 0.349, 0.333, 0.23, 0.181, 0.139 ]'
'r2_score: [ -0.066, -0.169, 0.244, 0.422, 0.482 ]'



In [123]:
print("Word: '" + df.index[11] + "'")
print("Prediction: " + str([round(y, 3) for y in results[0][-1][0][0][11]]))
print("Actual: " + str(Y[11]))

Word: 'bounces'
Prediction: [1.0, 5.0, 6.0, 6.0, 8.0, 6.5, 5.75, 4.5]
Actual: [ 1.    6.5   6.    8.5   9.    9.5   3.7   8.75]


In [100]:
np.mean([np.array([1,2]), np.array([3,4])], axis=0)

array([ 2.,  3.])