# Random Forest

### Requirements

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.ensemble import RandomForestRegressor
from utils import corex_probs_factory, preprocess_factory, syntax_factory

# Data and model locations. The defaults are the original workstation paths; set the
# ISER_DATA_DIR / ISER_MODEL_DIR environment variables to run the notebook on another
# machine without editing this cell.
data_dir = Path(os.environ.get("ISER_DATA_DIR", "/home/iailab36/iser/data"))
model_dir = Path(os.environ.get("ISER_MODEL_DIR", "/home/iailab36/iser/models"))

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# STS benchmark splits: the train split is fitted on, the dev split is used for validation.
train_data = pd.read_feather(data_dir.joinpath("stsbenchmark", "sts-train.feather"))
val_data = pd.read_feather(data_dir.joinpath("stsbenchmark", "sts-dev.feather"))

## Pre-compute features

In [3]:
# Each feature cell below appends one 2-D block per split to these lists; the blocks are
# concatenated column-wise right before training.
# NOTE: re-running a feature cell appends its block a second time, so re-run this cell
# first to reset the lists.
features_train, features_val = [], []

### Topic model features

In [4]:
# Build the topic-probability function from the stored CorEx model and its vectorizer.
corex_path = model_dir / "corex-sts-128"
vectorizer_path = model_dir / "vectorizer-sts-10000"
get_topic_probs = corex_probs_factory(corex_path=corex_path, vectorizer_path=vectorizer_path)

In [5]:
# Topic probabilities for each sentence slot (s1, s2) of each split.
topic_probs_train_1, topic_probs_train_2 = (
    get_topic_probs(train_data.s1),
    get_topic_probs(train_data.s2),
)
topic_probs_val_1, topic_probs_val_2 = (
    get_topic_probs(val_data.s1),
    get_topic_probs(val_data.s2),
)
# A sentence pair is represented by the topics of s1 followed by the topics of s2.
topic_probs_train = np.concatenate((topic_probs_train_1, topic_probs_train_2), axis=1)
topic_probs_val = np.concatenate((topic_probs_val_1, topic_probs_val_2), axis=1)
# Register the topic block with the shared feature lists.
features_train.append(topic_probs_train)
features_val.append(topic_probs_val)

### Syntax features

In [6]:
get_syntax_deps = syntax_factory()

# Syntax representation of every sentence, per split and sentence slot.
syntax_train_1, syntax_train_2 = (
    get_syntax_deps(train_data.s1),
    get_syntax_deps(train_data.s2),
)
syntax_val_1, syntax_val_2 = (
    get_syntax_deps(val_data.s1),
    get_syntax_deps(val_data.s2),
)
# 1 where both sentences of a pair agree on a syntax entry, 0 otherwise.
matches_train = syntax_train_1 == syntax_train_2
matches_val = syntax_val_1 == syntax_val_2
syntax_train = matches_train.astype(int)
syntax_val = matches_val.astype(int)
# Register the syntax block with the shared feature lists.
features_train.append(syntax_train)
features_val.append(syntax_val)

## Training without data augmentation

In [7]:
# Stack all registered feature blocks column-wise into one input matrix per split.
X_train, X_val = (
    np.concatenate(blocks, axis=1) for blocks in (features_train, features_val)
)
# Regression targets come from the `score` column of each split.
y_train, y_val = train_data.score, val_data.score
print("X_train:", X_train.shape)

X_train: (5552, 256)


In [8]:
# Random forest regressor with a fixed seed.
# Alternative regressors kept for reference:
# model = DecisionTreeRegressor(random_state=1337)
# model = MLPRegressor((1024, 512, 256, 128))
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    criterion="squared_error",
    random_state=1337,
)
model.fit(X_train, y_train)
# Spearman rank correlation between predictions and targets, on both splits.
pred_train = model.predict(X_train)
pred_val = model.predict(X_val)
spearman_train = spearmanr(pred_train, y_train)[0]
spearman_val = spearmanr(pred_val, y_val)[0]
print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-val: {spearman_val:.4f}")

SpearmanRank-train: 0.9681,	 SpearmanRank-val: 0.5079


Only STS-Benchmark:
```python
spearmanrank_wo_syntax = 0.4593
spearmanrank_w_syntax = 0.4541
```

STS-B + NLI:
```python
spearmanrank_wo_syntax = 0.4586
spearmanrank_w_syntax = 0.4614
```

## Training with data augmentation

In [9]:
# # load augmentation dataset
# # NOTE(review): `benchmark_dir` is not defined anywhere in this notebook; presumably it
# # should be `data_dir / "stsbenchmark"` - confirm where df_augment.feather lives before
# # re-enabling this cell.
# aug_data = pd.read_feather(benchmark_dir / "df_augment.feather")

# features_aug = []

In [10]:
# # get topics of the augmented sentences
# topic_probs_augmented = np.concatenate([
#     topic_probs_train_1[aug_data.idx1],
#     topic_probs_train_2[aug_data.idx2]
# ], axis=1)
# features_aug.append(topic_probs_augmented)

In [11]:
# syntax_aug = (syntax_train_1[aug_data.idx1] == syntax_train_2[aug_data.idx2]).astype(int)
# features_aug.append(syntax_aug)

In [12]:
# # create inputs / targets of augmented dataset
# X_aug = np.concatenate(features_aug, axis=1)
# y_aug = aug_data.score
# print(f"#augmented: {y_aug.shape[0]}")

In [13]:
# X_train_w_aug = np.concatenate([X_train, X_aug])
# y_train_w_aug = np.concatenate([y_train, y_aug])
# # NOTE(review): the label says train+augmented, but the value printed is the augmented
# # count only (y_aug); y_train_w_aug.shape[0] looks like the intended value - confirm.
# print(f"#(train+augmented): {y_aug.shape[0]}")

In [14]:
# # train model
# model = RandomForestRegressor(criterion="squared_error", n_estimators=100, max_depth=15, random_state=1337)
# model.fit(X_train_w_aug, y_train_w_aug)
# # evaluate model
# spearman_train = spearmanr(model.predict(X_train_w_aug), y_train_w_aug)[0]
# # NOTE(review): the validation result is stored in `spearman_test`, but the print below
# # reads `spearman_val`, so it would report the value left over from the run without
# # augmentation. Use one name for both before re-enabling this cell.
# spearman_test = spearmanr(model.predict(X_val), y_val)[0]
# print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-val: {spearman_val:.4f}")

```python
SpearmanRank = 0.4677
np.random.seed(??)
f = lambda x: np.exp(3*x) * 2  # sampling function - #1228

SpearmanRank = 0.4557
np.random.seed(42)
f = lambda x: np.exp(3*x) * 2  # sampling function - #1228

SpearmanRank = 0.4374
np.random.seed(1337)
f = lambda x: np.exp(4*x) * 2  # sampling function - #2655
```

## Qualitative analysis

In [15]:
# Predictions on the validation inputs, taken from whichever `model` was fitted last.
y_pred = model.predict(X_val)

In [16]:
# Preprocessing callable; it is applied to each sentence when the analysis frame is built.
preprocess = preprocess_factory()

In [17]:
# Analysis frame: raw and preprocessed sentences next to the target and predicted scores.
df = val_data[["s1", "s2"]].copy()
df["s1_processed"] = df.s1.apply(preprocess)
df["s2_processed"] = df.s2.apply(preprocess)
df["y_true"] = y_val
df["y_pred"] = y_pred
# Per-pair syntax match flags, one column per compared entry. The 2-D block is assigned
# directly rather than wrapping every row in its own pd.Series first.
# NOTE(review): assumes syntax_val has exactly three columns, in the order
# root, nsubj, dobj - confirm against syntax_factory.
df[["root", "nsubj", "dobj"]] = np.asarray(syntax_val)

NameError: name 'syntax_val' is not defined

In [None]:
# Pairs whose target score exceeds the prediction by more than 0.4.
df.loc[df.y_true.sub(df.y_pred).gt(0.4)]

In [None]:
# Syntax representation of entry 7 of the validation s1 sentences. It is read from the
# result already computed in the syntax-feature cell instead of re-running
# get_syntax_deps over the whole validation split.
syntax_val_1[7]

In [None]:
# Pairs whose signed error (target minus prediction) is below 0.1.
# NOTE(review): the difference is signed, so this also selects every over-prediction,
# however large. If only close predictions are wanted, the absolute error is presumably
# what was intended - confirm.
df.loc[df.y_true.sub(df.y_pred).lt(0.1)]

### Naive cosine similarity

In [None]:
# NOTE(review): late import - consider moving `import spacy` to the imports cell at the
# top so all dependencies are visible in one place.
import spacy
# Load the en_core_web_lg pipeline with the NER component excluded.
nlp = spacy.load("en_core_web_lg", exclude=["ner"])

In [None]:
# Baseline: spaCy document similarity of each validation pair, scored against y_true with
# the same Spearman rank correlation used for the forest.
df["tok2vec"] = [nlp(s1).similarity(nlp(s2)) for s1, s2 in zip(df.s1, df.s2)]
print(f"SpearmanRank-val: {spearmanr(df.tok2vec, df.y_true)[0]:.4f}")

In [None]:
# Print the dependency labels and the lemmas of one example sentence, tab-separated.
doc = nlp("Blue and red plane in mid-air flight.")
deps = [token.dep_ for token in doc]
lemmas = [token.lemma_ for token in doc]
print(*deps, sep="\t")
print(*lemmas, sep="\t")