In [16]:
import inspect
import numpy as np
import pandas as pd
from sklearn.tree import *
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

from Constants import *
from Utils import *
from Dataset import *
from DataRequests import *
from WordRepLibrary import *

In [17]:
inspect.getmro(DecisionTreeClassifier)

(sklearn.tree.tree.DecisionTreeClassifier,
 sklearn.tree.tree.BaseDecisionTree,
 abc.NewBase,
 sklearn.base.BaseEstimator,
 sklearn.base.ClassifierMixin,
 object)

In [18]:
dataset = Dataset(combined_csv)
df = dataset.load()

In [19]:
def get_data(dataframe, x_lab, y_lab):
    return np.array(dataframe[x_lab]), np.array(dataframe[y_lab])

def learn(model, X, Y, scorers):
    Y_pr = cross_val_predict(model, X, Y, cv=10)
    errs = [scorer(Y, Y_pr, multioutput='raw_values') for scorer in scorers]
    return Y_pr, errs

In [20]:
# Define Y values to learn
Y_labels = Y_labels_default[1:] # All except 'inc', which is binary
# Y_labels = Y_labels_default
# Y_labels = ['dif', 'nrd', 'skt'] # Sketchability
# Y_labels = ['dif', 'nrd', 'skt', 'vis'] # Visuality
# Y_labels = ['vis', 'phy', 'obj'] # Physicality

# Define X values to learn from
X_labels = X_labels_default

# Define models to evaluate
models = [
    Pipeline(steps=[
        ("scale", RobustScaler()),
        ("model", MultiOutputRegressor(GradientBoostingRegressor())),
    ]),
]

# Define scoring functions
scoring_funcs = [
    mean_absolute_error,
    r2_score
#     mean_squared_error,
#     mean_squared_log_error,
#     explained_variance_score,
]

# Train Multi-task models on interval_size-sample increments of data, up to the whole dataset
interval_size = 18
ns_samples = np.arange(0, df.shape[0], interval_size)[1:]

# Take ns_eval_samples samples of random x for each increment, to get accurate results at low n
ns_eval_samples = [10] * len(ns_samples)

# Load data
X, Y = get_data(df, X_labels, Y_labels)

# For debug: convert x data into random numbers
X = np.random.random(X.shape)

In [5]:
# Perform cross validation
results = []
for i in range(len(models)):
    results += [[]]
    for j in range(len(ns_samples)):
        results[-1] += [[]]
        for k in range(ns_eval_samples[j]):
            indices = np.random.choice(X.shape[0], ns_samples[j])
            results[-1][-1] += [ learn(models[i], X[indices], Y[indices], scoring_funcs) ]

In [6]:
# Print errors
for i in range(len(models)):
    print("\t" + str(type(models[i].estimator)).replace('\n', ' '))
    res = [[np.mean([results[i][j][k][1][l]
                          for k in range(ns_eval_samples[j])], axis=0)
                          for j in range(len(ns_samples))]
                          for l in range(len(scoring_funcs))]
    for l in range(len(scoring_funcs)):
          pr(scoring_funcs[l].__name__)
          pr([[float("{:,}".format(round(r_, 3))) for r_ in list(r)] for r in res[l]])
    print()

	<class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>
'mean_absolute_error'
[   [1.772, 1.659, 1.935, 1.644, 1.963, 2.113, 2.169],
    [1.854, 1.728, 2.166, 1.687, 1.689, 2.413, 2.224],
    [1.389, 1.323, 1.44, 1.184, 1.565, 1.935, 1.476],
    [1.063, 0.929, 1.071, 0.933, 1.088, 1.401, 1.119],
    [0.792, 0.794, 0.916, 0.839, 0.876, 1.118, 0.802]]
'r2_score'
[   [-0.019, -0.114, -0.019, -0.289, -0.364, -0.033, -0.077],
    [-0.316, -0.124, -0.433, -0.202, -0.052, -0.149, -0.307],
    [0.253, 0.176, 0.222, 0.243, 0.011, 0.076, 0.299],
    [0.314, 0.446, 0.376, 0.385, 0.307, 0.326, 0.446],
    [0.574, 0.551, 0.532, 0.374, 0.461, 0.545, 0.684]]



In [7]:
# Print average errors
for i in range(len(models)):
    print("\t" + str(type(models[i].estimator)).replace('\n', ' '))
    res = [[round(np.mean([np.mean(results[i][j][k][1][l])
                          for k in range(ns_eval_samples[j])]), 3)
                          for j in range(len(ns_samples))]
                          for l in range(len(scoring_funcs))]
    for l in range(len(scoring_funcs)):
        pr(scoring_funcs[l].__name__ + ": " + \
          "[ " + ''.join(["{:,}".format(float(r)) + ", " for r in res[l]])[:-2] + " ]")
    print()

	<class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>
'mean_absolute_error: [ 1.894, 1.966, 1.473, 1.086, 0.877 ]'
'r2_score: [ -0.131, -0.226, 0.183, 0.372, 0.532 ]'



In [21]:
# Predict for some random words
n_pred = 100
ngrams_db = load_ngram_counts()            # Load ngram counts database
library = WordRepLibrary(lib_paths[-1])    # Load word representation database


Loading ngram counts database... 5899064 entries
Loading ngram counts database took 12.817402124404907 seconds.
Loading word representation library 'glove.42B.300d.txt'
Loading word representation library took 501.7303628921509 seconds.



NotImplementedError: ix is not iterable

In [26]:
for i in range(n_pred):
    word, word_i, x = library.get_new_word()
    print((ngrams_db.loc(word)))
    x += list(ngrams_db.loc(word)) + [ google_search_count(word) ]
    print(len(x), X.shape)
    sys.exit()
#     y_pred = 

<pandas.core.indexing._LocIndexer object at 0x111aff358>


NotImplementedError: ix is not iterable

In [8]:
print("Word: '" + df.index[11] + "'")
print("Prediction: " + str([round(y, 3) for y in results[0][-1][0][0][11]]))
print("Actual: " + str(Y[11]))

Word: 'bounces'
Prediction: [2.0339999999999998, 6.9260000000000002, 3.774, 3.343, 6.6349999999999998, 8.5410000000000004, 2.1179999999999999]
Actual: [ 6.5   6.    8.5   9.    9.5   3.7   8.75]


In [7]:
models[-1]

NameError: name 'models' is not defined

In [None]:
ns_samples = np.arange(0, df.shape[0], 18)[1:]
