In [None]:
# Turn on IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from torch.autograd import Variable

In [None]:
# Make modules in the local ``code`` directory importable (the later
# ``utils`` / ``FImp`` imports presumably resolve there — TODO confirm).
# Guarded so that re-running this cell does not add duplicate entries.
if 'code' not in sys.path:
    sys.path.append('code')

## Load data

In [None]:
# Sentences: one per line, with the line terminator removed.
with open("data/2020-09-08_sentences.txt") as sfile:
    sents = [line.rstrip("\n") for line in sfile]

# Embeddings: each line is a ';'-separated row of floats.
with open("data/2020-09-14_embeddings_mean.csv") as efile:
    embs = [
        [float(cell) for cell in row.rstrip("\n").split(";")]
        for row in efile
    ]

# Properties: each line is a ';'-separated row of integers.
# NOTE(review): rows of the three files are presumably aligned by line
# number (same sentence per row) — confirm against the data pipeline.
with open("data/2020-09-22_properties.csv") as pfile:
    props = [
        [int(cell) for cell in row.rstrip("\n").split(";")]
        for row in pfile
    ]

In [None]:
# Inputs are the property vectors; regression targets are the embeddings.
X = np.array(props)
y = np.array(embs)
print(X.shape, y.shape)

# Network layer widths, read off the second axis of each array.
inputdim, outputdim = X.shape[1], y.shape[1]
for _dim in (inputdim, outputdim):
    print(_dim)

## MLP for regression

### Define model

In [None]:
from utils import *

In [None]:
# Seed PyTorch before building the network so that weight initialisation (and
# therefore the whole training run) is reproducible, matching the fixed seed
# already used for the train/test split below.
SEED = 42
torch.manual_seed(SEED)

net = Net(inputdim, outputdim)     # define the network (Net presumably comes from the utils star-import)
optimizer = torch.optim.SGD(net.parameters(), lr=0.2)
loss_func = torch.nn.MSELoss()  # mean squared error, the regression loss

# Hold out 20% of the samples; properties are the inputs, embeddings the targets.
X_train, X_test, y_train, y_test = train_test_split(props, embs, test_size=0.20, random_state=SEED)

# Convert the nested Python lists to float32 tensors for the network.
X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train)
X_test = torch.FloatTensor(X_test)
y_test = torch.FloatTensor(y_test)

### Train model

In [None]:
# Full-batch training: every step uses the entire training set.
N_STEPS = 2000
for _step in range(N_STEPS):
    optimizer.zero_grad()                      # drop gradients left from the previous step
    prediction = net(X_train)                  # forward pass
    loss = loss_func(prediction, y_train)      # argument order: (network output, target)
    loss.backward()                            # backpropagate
    optimizer.step()                           # update the weights

### Load saved model

In [None]:
# Build a fresh network with the same dimensions and pass it to my_load_model
# together with the name "test" (presumably this restores previously saved
# weights into it — confirm in utils).
# NOTE(review): the `net` trained above is not saved anywhere in the visible
# cells, so `model` does not necessarily correspond to this training run.
emptymodel = Net(inputdim, outputdim)
model = my_load_model("test", emptymodel)

In [None]:
# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    preds = model(X_test)

## RF for regression

In [None]:
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

### Define and train model

In [None]:
# Random forest with 200 trees. The seed is fixed so that the fitted forest,
# its predictions and its feature importances are the same on every run.
model_rf = RandomForestRegressor(n_estimators=200, random_state=42)
# Fit on the training split (properties -> embeddings).
model_rf.fit(X_train, y_train)

## Evaluate predictions

In [None]:
# Ascending copy of the forest's feature importances.
l = model_rf.feature_importances_.copy()
l.sort()

In [None]:
# Feature names, in the column order of the property vectors
# (presumably the column order of the properties file — TODO confirm).
# Fix: a comma was missing after 'nb_root', which silently merged it with
# 'width' into 'nb_rootwidth' and shifted every following label by one.
names = ['Same-Unit', 'elaboration-additional-e', 'List', 'elaboration-object-attribute-e',
                'condition', 'attribution', 'elaboration-additional', 'circumstance',
                'purpose', 'elaboration-general-specific', 'restatement-e', 'reason',
                'elaboration-part-whole-e', 'rhetorical-question', 'manner', 'concession',
                'consequence-n', 'temporal-same-time', 'restatement', 'elaboration-object-attribute',
                'antithesis', 'consequence-s', 'definition', 'Contrast', 'result', 'means', 'attribution-n',
                'interpretation-s', 'temporal-after', 'example-e', 'result-e', 'explanation-argumentative',
                'elaboration-general-specific-e', 'elaboration-set-member-e', 'attribution-e',
                'evaluation-s', 'circumstance-e', 'background', 'evidence', 'comment-e', 'purpose-e',
                'interpretation-n', 'condition-e', 'temporal-before', 'comment', 'elaboration-part-whole',
                'comparison', 'contingency', 'hypothetical', 'analogy-e', 'evaluation-s-e',
                'elaboration-set-member', 'temporal-after-e', 'comparison-e', 'means-e', 'example',
                'evidence-e', 'cause', 'preference', 'enablement', 'question-answer-s',
                'explanation-argumentative-e', 'definition-e', 'antithesis-e', 'summary-n',
                'analogy', 'interpretation-s-e', 'concession-e', 'consequence-s-e', 'manner-e',
                'preference-e', 'reason-e', 'temporal-same-time-e', 'consequence-n-e',
                'enablement-e', 'evaluation-n', 'temporal-before-e', 'otherwise', 'question-answer-n',
                'conclusion', 'dr_exist', 'nb_distinct_rel', 'nb_nuc', 'nb_sat', 'nb_root',
                'width', "height", "multinuc"]

feat_imp = model_rf.feature_importances_

# zip() stops at the shorter input, so a label/importance count mismatch would
# otherwise go unnoticed and mislabel the ranking.
assert len(names) == len(feat_imp), (
    f"{len(names)} feature names for {len(feat_imp)} feature importances"
)

# Rank features from least to most important (ties are ordered by name).
ranked = sorted(zip(feat_imp, names))
sVals = [imp for imp, _ in ranked]
Z = [label for _, label in ranked]
for feat, name in ranked:
    print(str(feat)+"  "+name)

In [None]:
# Importance curve: feature importances sorted in ascending order.
l = np.sort(model_rf.feature_importances_)

fig, ax = plt.subplots()
ax.plot(l)
# Label the figure so that it can be read on its own.
ax.set(
    title="Random forest feature importances (sorted)",
    xlabel="Feature rank (ascending importance)",
    ylabel="Importance",
)
plt.show()

In [None]:
# Random-forest predictions for the held-out test inputs.
preds_rf = model_rf.predict(X_test)

In [None]:
from scipy.spatial import distance

In [None]:
from FImp import rsqq, rsq, mse

In [None]:
# Per-sample comparison of the random-forest predictions with the ground truth.
cos_dists = []
rsq_dists = []
mse_dists = []

# Fix: the pairs used to be zipped as (preds_rf, y_test), which bound the
# predictions to `true` and the ground truth to `pred`. That is harmless for
# symmetric measures but wrong for an order-sensitive one such as R².
# NOTE(review): the (true, pred) argument order of rsqq/mse is inferred from
# these variable names — confirm against FImp.
y_test_np = np.asarray(y_test)  # plain NumPy rows, same type as preds_rf
for true, pred in zip(y_test_np, preds_rf):
    cos_dists.append(distance.cosine(true, pred))
    rsq_dists.append(rsqq(true, pred))
    mse_dists.append(mse(true, pred))

In [None]:
# Smallest, average and largest per-sample rsqq value.
for _summarise in (np.min, np.mean, np.max):
    print(_summarise(rsq_dists))

In [None]:
# Smallest, average and largest per-sample rsqq value
# (same summary as the preceding cell).
for _summarise in (np.min, np.mean, np.max):
    print(_summarise(rsq_dists))

In [None]:
# Smallest, average and largest per-sample cosine distance.
for _summarise in (np.min, np.mean, np.max):
    print(_summarise(cos_dists))

In [None]:
# Smallest, average and largest per-sample MSE.
for _summarise in (np.min, np.mean, np.max):
    print(_summarise(mse_dists))

In [None]:
# Side-by-side view of the first test sample: RF prediction vs. true value.
# The loop variables are deliberately not called x / y, so that the dataset
# array `y` created earlier in the notebook is not overwritten.
for rf_val, true_val in zip(preds_rf[0], y_test[0]):
    print(str(rf_val)+"   "+str(float(true_val)))

In [None]:
# Recompute the per-sample rsqq values for the random-forest predictions.
# cos_dists is intentionally left untouched here: resetting it without
# refilling it would discard the cosine distances computed earlier.
rsq_dists = []

# Fix: zip ground truth first so that `true` / `pred` hold what their names
# say (they were swapped, which matters for an order-sensitive score).
# NOTE(review): rsqq's (true, pred) argument order is inferred from these
# variable names — confirm against FImp.
y_test_np = np.asarray(y_test)  # plain NumPy rows, same type as preds_rf
for true, pred in zip(y_test_np, preds_rf):
    rsq_dists.append(rsqq(true, pred))


print(np.min(rsq_dists))
print(np.mean(rsq_dists))
print(np.max(rsq_dists))

In [None]:
from FImp import *

In [None]:
# MLP predictions for the test inputs. This is inference only, so gradient
# tracking is disabled.
with torch.no_grad():
    pred_X = net(X_test)

In [None]:
# Convert the predictions to nested Python lists. The guard keeps this cell
# safe to re-run: a list has no .tolist(), so a second run used to fail.
if not isinstance(pred_X, list):
    pred_X = pred_X.tolist()

In [None]:
# Accumulator for the per-sample MSE values of the MLP predictions.
mean_mse = []

In [None]:
# Per-sample MSE between the true embeddings and the MLP predictions.
# Start from an empty list so that re-running this cell does not append a
# second copy of every value.
mean_mse = []
for true_vec, pred_vec in zip(y_test, pred_X):
    # Fix: the components were cast with int(), which truncated every
    # real-valued embedding component toward zero before the error was
    # computed. Compare them as floats instead.
    true_vals = [float(v) for v in true_vec]
    pred_vals = [float(v) for v in pred_vec]
    mean_mse.append(mse(true_vals, pred_vals))

In [None]:
# Accumulator for the per-sample MAE values of the MLP predictions.
mean_mae = []

In [None]:
# Per-sample MAE between the true embeddings and the MLP predictions.
# Start from an empty list so that re-running this cell does not append a
# second copy of every value.
mean_mae = []
for true_vec, pred_vec in zip(y_test, pred_X):
    # Fix: the components were cast with int(), which truncated every
    # real-valued embedding component toward zero before the error was
    # computed. Compare them as floats instead.
    true_vals = [float(v) for v in true_vec]
    pred_vals = [float(v) for v in pred_vec]
    mean_mae.append(mae(true_vals, pred_vals))

In [None]:
# Average of the per-sample MAE values collected in mean_mae.
print(np.mean(mean_mae))

In [None]:
# Square root of the average MAE.
# NOTE(review): a square root is normally taken of a mean *squared* error
# (giving RMSE); confirm that this value is really wanted for MAE.
np.sqrt(np.mean(mean_mae))

In [None]:
# Average of the per-sample MSE values collected in mean_mse.
print(np.mean(mean_mse))

In [None]:
# Square root of the average per-sample MSE (a root-mean-square error).
np.sqrt(np.mean(mean_mse))

In [None]:
# Inspect a single test sample (index 100): MLP prediction vs. ground truth.
dbg_pred = pred_X[100]
dbg_true = y_test[100]


print(mse(dbg_pred, dbg_true))
print("===============")
# The loop variables are not called x / y, so that the dataset array `y` is
# not overwritten. True values are printed as plain floats rather than as
# `tensor(...)` reprs, matching the earlier side-by-side print.
for pred_val, true_val in zip(dbg_pred, dbg_true):
    print(str(pred_val)+ "\t"+ str(float(true_val)))