In [1]:
import inspect
import numpy as np
import pandas as pd
from sklearn.tree import *
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

from Constants import *
from Utils import *
from Dataset import *
from DataRequests import *
from WordRepLibrary import *

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Fonz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
dataset = Dataset(combined_csv)
df = dataset.load()

In [4]:
def get_data(dataframe, x_lab, y_lab):
    return np.array(dataframe[x_lab]), np.array(dataframe[y_lab])

def learn(model, X, Y, scorers):
    Y_pr = cross_val_predict(model, X, Y, cv=10)
    errs = [scorer(Y, Y_pr, multioutput='raw_values') for scorer in scorers]
    return Y_pr, errs

In [5]:
# Define Y values to learn
Y_labels = Y_labels_default[1:] # All except 'inc', which is binary
# Y_labels = Y_labels_default
# Y_labels = ['dif', 'nrd', 'skt'] # Sketchability
# Y_labels = ['dif', 'nrd', 'skt', 'vis'] # Visuality
# Y_labels = ['vis', 'phy', 'obj'] # Physicality

# Define X values to learn from
X_labels = X_labels_default

# Define models to evaluate
models = [
    Pipeline(steps=[
        ("scale", RobustScaler()),
        ("model", MultiOutputRegressor(GradientBoostingRegressor())),
    ]),
]

# Define scoring functions
scoring_funcs = [
    mean_absolute_error,
    r2_score
#     mean_squared_error,
#     mean_squared_log_error,
#     explained_variance_score,
]

# Train Multi-task models on interval_size-sample increments of data, up to the whole dataset
interval_size = 18
ns_samples = np.arange(0, df.shape[0], interval_size)[1:]

# Take ns_eval_samples samples of random x for each increment, to get accurate results at low n
ns_eval_samples = [10] * len(ns_samples)

# Load data
X, Y = get_data(df, X_labels, Y_labels)

# For debug: convert x data into random numbers
# X = np.random.random(X.shape)

In [6]:
# Perform cross validation
results = []
for i in range(len(models)):
    results += [[]]
    for j in range(len(ns_samples)):
        results[-1] += [[]]
        for k in range(ns_eval_samples[j]):
            indices = np.random.choice(X.shape[0], ns_samples[j])
            results[-1][-1] += [ learn(models[i], X[indices], Y[indices], scoring_funcs) ]

In [9]:
# Print errors
for i in range(len(models)):
    print("\t" + str(type(models[i].steps[-1][1].estimator)).replace('\n', ' '))
    res = [[np.mean([results[i][j][k][1][l]
                          for k in range(ns_eval_samples[j])], axis=0)
                          for j in range(len(ns_samples))]
                          for l in range(len(scoring_funcs))]
    for l in range(len(scoring_funcs)):
          pr(scoring_funcs[l].__name__)
          pr([format_numlist(r, 3) for r in res[l]])
    print()

	<class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>
'mean_absolute_error'
[   [1.92, 1.793, 1.914, 1.729, 1.637, 2.643, 1.836],
    [1.237, 1.122, 1.213, 1.073, 1.268, 1.693, 1.408]]
'r2_score'
[   [-0.12, -0.195, -0.23, -0.492, -0.311, -0.392, 0.104],
    [0.185, 0.214, 0.351, 0.203, 0.193, 0.164, 0.263]]



In [10]:
# Print average errors
for i in range(len(models)):
    print("\t" + str(type(models[i].steps[-1][1].estimator)).replace('\n', ' '))
    res = [[round(np.mean([np.mean(results[i][j][k][1][l])
                          for k in range(ns_eval_samples[j])]), 3)
                          for j in range(len(ns_samples))]
                          for l in range(len(scoring_funcs))]
    for l in range(len(scoring_funcs)):
        pr(scoring_funcs[l].__name__ + ": " + \
           str(format_numlist(res[l], 3)))
    print()

	<class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>
'mean_absolute_error: [1.925, 1.288]'
'r2_score: [-0.234, 0.225]'



In [None]:
# Inspect errors
print("Word: '" + df.index[11] + "'")
print("Prediction: " + str([round(y, 3) for y in results[0][-1][0][0][11]]))
print("Actual: " + str(Y[11]))

In [11]:
# Load data for new word prediction
ngrams_db = load_ngram_counts()             # Load ngram counts database
wrep_lib = WordRepLibrary(lib_paths[-1])    # Load word representation database


Loading ngram counts database... 5899064 entries
Loading ngram counts database took 4.8438801765441895 seconds.
Loading word representation library 'glove.42B.300d.txt'
Loading word representation library took 97.02606701850891 seconds.



In [16]:
# Predict for some random new words
n_pred = 100
i_seen = map(int, list(df["lib_i"]))
model = models[-1]
model.fit(X, Y)
print("         ", Y_labels)
for i in range(n_pred):
    word, word_i, x = wrep_lib.get_new_word(i_seen)
    while word not in ngrams_db.index or not valid_word(word, verbose=False):
        word, word_i, x = wrep_lib.get_new_word()
    x += list(ngrams_db.loc[word]) + [ google_search_count(word) ]
    y_pred = model.predict(np.atleast_2d(x))
    print(word + ": " + str(format_numlist(y_pred[0], 3)))

          ['dif', 'nrd', 'skt', 'vis', 'phy', 'obj', 'com']
alaskans: [3.73, 4.321, 1.941, 5.43, 5.848, 8.015, 1.988]
hyperbilirubinemia: [1.686, 3.164, 3.995, 6.919, 6.753, 2.726, 1.19]
allies: [4.887, 5.133, 5.439, 6.328, 8.146, 6.931, 5.532]
loggings: [1.903, 4.138, 4.321, 4.901, 4.741, 1.513, 1.387]
astute: [2.87, 6.658, 3.775, 7.071, 5.445, 6.764, 3.471]
faster: [5.064, 4.403, 7.141, 7.123, 7.624, 2.671, 6.413]
plowed: [6.461, 6.285, 6.452, 6.362, 8.309, 7.31, 5.935]
ancistrodon: [1.419, 3.095, 1.518, 6.468, 8.181, 5.347, 0.951]
stoppers: [5.706, 5.543, 5.084, 5.898, 6.923, 4.983, 4.457]
eatings: [1.899, 3.204, 3.356, 6.023, 8.035, 3.211, 2.119]
communion: [4.67, 2.661, 4.983, 3.734, 6.433, 2.117, 6.073]
prevent: [5.875, 5.216, 5.758, 7.711, 8.276, 3.422, 5.141]
spotless: [4.827, 5.954, 6.728, 4.075, 5.043, 8.334, 4.703]
uscb: [1.792, 3.314, 4.26, 6.674, 6.474, 5.412, 1.114]
firethorn: [3.08, 5.301, 2.02, 6.057, 7.791, 7.92, 1.092]
surmounting: [2.057, 3.25, 1.886, 5.034, 6.696, 5