# Random Forest

### Requirements

In [1]:
import numpy as np
import pandas as pd

from scipy.stats import spearmanr, pearsonr
from pathlib import Path

data_dir = Path("/home/iailab36/iser/data")
model_dir = Path("/home/iailab36/iser/models")

SEEDS = [1337, 42, 87]
COREX_HIDDEN = 64
VEC_FEAT = 10_000

In [2]:
# Load benchmark dataset
train_data = pd.read_feather(data_dir / "stsbenchmark" / "sts-train.feather")
val_data = pd.read_feather(data_dir / "stsbenchmark" / "sts-dev.feather")
test_data = pd.read_feather(data_dir / "stsbenchmark" / "sts-test.feather")

train_data = pd.concat([train_data, val_data]).reset_index(drop=True)

## Pre-compute features

In [3]:
from utils import CoreXProbsFactory, SyntaxFactory

features_train = []
features_val = []

  return torch._C._cuda_getDeviceCount() > 0


### topic model features

In [4]:
get_topic_probs = CoreXProbsFactory(
    vectorizer_path=model_dir / f"sts_vec={VEC_FEAT}",
    corex_name=f"corex_n_hidden={COREX_HIDDEN}_iter=7",
)

In [5]:
# compute topic probabilities
topic_probs_train_1 = get_topic_probs(train_data.s1)
topic_probs_train_2 = get_topic_probs(train_data.s2)
topic_probs_test_1 = get_topic_probs(test_data.s1)
topic_probs_test_2 = get_topic_probs(test_data.s2)
# concatenate topics of the two sentences
topic_probs_train = np.concatenate([topic_probs_train_1, topic_probs_train_2], axis=1)
topic_probs_test = np.concatenate([topic_probs_test_1, topic_probs_test_2], axis=1)
# add to features list
features_train.append(topic_probs_train)
features_val.append(topic_probs_test)

### syntax features

In [6]:
get_syntax_deps = SyntaxFactory()
# compute syntax tokens
syntax_train_1 = get_syntax_deps(train_data.s1)
syntax_train_2 = get_syntax_deps(train_data.s2)
syntax_test_1 = get_syntax_deps(test_data.s1)
syntax_test_2 = get_syntax_deps(test_data.s2)
# mask matching syntax
syntax_train = (syntax_train_1 == syntax_train_2).astype(int)
syntax_test = (syntax_test_1 == syntax_test_2).astype(int)
# append to features list
features_train.append(syntax_train)
features_val.append(syntax_test)

## Training without data augmentation

In [7]:
# create input vectors
X_train = np.concatenate(features_train, axis=1)
X_test = np.concatenate(features_val, axis=1)
# create targets
y_train = train_data.score
y_val = test_data.score
print("X_train:", X_train.shape)

X_train: (7030, 131)


In [8]:
print()
print()
print(f"train configuration: COREX_HIDDEN={COREX_HIDDEN}, VEC_FEAT={VEC_FEAT}")
print()



train configuration: COREX_HIDDEN=64, VEC_FEAT=10000



In [9]:
print("without augmentation")
print()

without augmentation



In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

spearman_train = np.empty((len(SEEDS,)))
spearman_test = np.empty((len(SEEDS,)))

rnd_forest = lambda seed: RandomForestRegressor(criterion="squared_error", n_estimators=100, random_state=seed)
dec_tree = lambda seed: DecisionTreeRegressor(random_state=seed)
mlp = lambda seed: MLPRegressor((512, 256, 128), random_state=seed)

for model_cls in [dec_tree]:
    print(model_cls(0).__class__.__name__)

    for i, seed in enumerate(SEEDS):

        model = model_cls(seed)
        model.fit(X_train, y_train)

        # evaluate model
        spearman_train[i] = spearmanr(model.predict(X_train), y_train)[0]
        spearman_test[i] = spearmanr(model.predict(X_test), y_val)[0]

        print(f"SpearmanRank-train: {spearman_train[i]:.4f},\t SpearmanRank-test: {spearman_test[i]:.4f}")
    
    print(f"Mean & Std for {model.__class__.__name__}")
    print(f"SpearmanRank-train: mean={spearman_train.mean():.4f}, std={spearman_train.std():.4f}")
    print(f"SpearmanRank-test: mean={spearman_test.mean():.4f}, std={spearman_test.std():.4f}")
    print()

DecisionTreeRegressor
SpearmanRank-train: 0.9997,	 SpearmanRank-test: 0.2726
SpearmanRank-train: 0.9997,	 SpearmanRank-test: 0.2859
SpearmanRank-train: 0.9997,	 SpearmanRank-test: 0.2795
Mean & Std for <class 'sklearn.tree._classes.DecisionTreeRegressor'>
SpearmanRank-train: mean=0.9997, std=0.0000
SpearmanRank-test: mean=0.2793, std=0.0054



## Training with data augmentation

In [13]:
print("with augmentation")

with augmentation


In [14]:
# load augmentation dataset
aug_data = pd.read_feather("df_augment.feather")

features_aug = []

In [15]:
# get topics of the augmented sentences
topic_probs_augmented = np.concatenate([
    topic_probs_train_1[aug_data.idx1],
    topic_probs_train_2[aug_data.idx2]
], axis=1)
features_aug.append(topic_probs_augmented)

In [16]:
# syntax features
syntax_aug = (syntax_train_1[aug_data.idx1] == syntax_train_2[aug_data.idx2]).astype(int)
features_aug.append(syntax_aug)

In [17]:
# create inputs / targets of augmented dataset
X_aug = np.concatenate(features_aug, axis=1)
y_aug = aug_data.score
print(f"#augmented: {y_aug.shape[0]}")

X_train_w_aug = np.concatenate([X_train, X_aug])
y_train_w_aug = np.concatenate([y_train, y_aug])
print(f"#(train+augmented): {y_aug.shape[0]}")
print()

#augmented: 6307
#(train+augmented): 6307



In [18]:
spearman_train = np.empty((len(SEEDS,)))
spearman_test = np.empty((len(SEEDS,)))

for model_cls in [dec_tree]:
    print(model_cls(0).__class__.__name__)

    for i, seed in enumerate(SEEDS):

        model = model_cls(seed)
        model.fit(X_train_w_aug, y_train_w_aug)

        # evaluate model
        spearman_train[i] = spearmanr(model.predict(X_train), y_train)[0]
        spearman_test[i] = spearmanr(model.predict(X_test), y_val)[0]

        print(f"SpearmanRank-train: {spearman_train[i]:.4f},\t SpearmanRank-test: {spearman_test[i]:.4f}")
    
    print(f"Mean & Std for {model.__class__.__name__}")
    print(f"SpearmanRank-train: mean={spearman_train.mean():.4f}, std={spearman_train.std():.4f}")
    print(f"SpearmanRank-test: mean={spearman_test.mean():.4f}, std={spearman_test.std():.4f}")
    print()

DecisionTreeRegressor
SpearmanRank-train: 0.9946,	 SpearmanRank-test: 0.2482
SpearmanRank-train: 0.9946,	 SpearmanRank-test: 0.2385
SpearmanRank-train: 0.9946,	 SpearmanRank-test: 0.2393
Mean & Std for DecisionTreeRegressor
SpearmanRank-train: mean=0.9946, std=0.0000
SpearmanRank-test: mean=0.2420, std=0.0044



## Qualitative analysis

In [None]:
# unit vectors
x1 = topic_probs_train_1 / np.linalg.norm(topic_probs_train_1, axis=1)[:, None]
x2 = topic_probs_train_2 / np.linalg.norm(topic_probs_train_2, axis=1)[:, None]

y_naiv = (x1[:, None, ...] @ x2[..., None]).squeeze()

In [None]:
from scipy.stats import wasserstein_distance
from sklearn.preprocessing import MinMaxScaler

# softmax
x1 = np.exp(topic_probs_train_1)/np.exp(topic_probs_train_1).sum(0)
x2 = np.exp(topic_probs_train_2)/np.exp(topic_probs_train_2).sum(0)

dists = np.array([wasserstein_distance(x1[i], x2[i]) for i in range(x1.shape[0])])

mms = MinMaxScaler()
y_naiv = mms.fit_transform(dists.reshape(-1, 1)).squeeze()

In [None]:
spearman_val = spearmanr(y_naiv, y_val)[0]
pearson_val = pearsonr(y_naiv, y_val)[0]
print(f"SpearmanRank-val: {spearman_val:.4f}")
print(f"PearsonRank-val: {pearson_val:.4f}")

In [None]:
y_pred = model.predict(X_test)
spearman_val = spearmanr(y_pred, y_val)[0]
pearson_val = pearsonr(y_pred, y_val)[0]
print(f"SpearmanRank-val: {spearman_val:.4f}")
print(f"PearsonRank-val: {pearson_val:.4f}")

In [None]:
from utils import preprocess

df = test_data[["s1", "s2"]].copy()
df["s1_processed"] = df.s1.apply(preprocess)
df["s2_processed"] = df.s2.apply(preprocess)
df["y_true"] = y_val
df["y_pred"] = y_pred
df["y_naiv"] = y_naiv
df[["root", "nsubj", "dobj"]] = [pd.Series(s) for s in syntax_test]

In [None]:
df[(df.y_true - df.y_pred).abs() > 0.4]

In [None]:
"ad" in get_topic_probs.vectorizer.get_feature_names()

In [None]:
get_syntax_deps(test_data.s1)[7]

In [None]:
df[(df.y_true - df.y_pred) < 0.1]

### Naive cosine similarity

In [None]:
# import spacy
# nlp = spacy.load("en_core_web_lg", exclude=["ner"])

In [None]:
# df["tok2vec"] = df[["s1", "s2"]].apply(lambda row: nlp(row.s1).similarity(nlp(row.s2)), axis=1)
# print(f"SpearmanRank-val: {spearmanr(df.tok2vec, df.y_true)[0]:.4f}")

In [None]:
# doc = nlp("Blue and red plane in mid-air flight.")
# print("\t".join([token.dep_ for token in doc]))
# print("\t".join([token.lemma_ for token in doc]))