# Random Forest

### Requirements

In [25]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.ensemble import RandomForestRegressor

from sentence_similarity.data.datasets import STSB
from sentence_similarity.data.features import CoreXFeatures

# Local folders: `data_dir` holds the datasets, `model_dir` the stored topic model
# (and its preprocessing config) that the feature extractor is built from.
data_dir = Path("data")
model_dir = Path("models", "corex_nli_stsb")

In [3]:
# Load the "train" and "dev" partitions of the benchmark dataset (in that order)
train_data, val_data = (
    STSB(data_dir, partition=split) for split in ("train", "dev")
)

## Pre-compute features

In [18]:
# One list per partition; each feature section below appends its feature matrix here.
features_train, features_val = [], []

### Topic model features

In [22]:
# Build the topic-feature extractor from the model stored in `model_dir`.
corex_feat = CoreXFeatures(model_dir)
# `input_size` is reported as the number of topics
# (presumably also the per-sentence feature width — TODO confirm).
print(f"#topics: {corex_feat.input_size}")

#topics: 50


In [None]:
# Topic probabilities for the first and second sentence of every pair, per partition
topic_probs_train_1, topic_probs_train_2 = corex_feat(train_data.s1), corex_feat(train_data.s2)
topic_probs_val_1, topic_probs_val_2 = corex_feat(val_data.s1), corex_feat(val_data.s2)

# Put the two sentences' topic vectors side by side (axis=1): one row per pair
topic_probs_train = np.concatenate((topic_probs_train_1, topic_probs_train_2), axis=1)
topic_probs_val = np.concatenate((topic_probs_val_1, topic_probs_val_2), axis=1)

# Register the matrices as feature blocks.
# NOTE: running this cell twice appends twice — reset the feature lists before re-running.
features_train += [topic_probs_train]
features_val += [topic_probs_val]

### Syntax features

In [21]:
# Disabled experiment: syntax-match features (kept for reference, not executed).
# NOTE(review): `SyntaxFeatures` is not among the imports visible in this notebook, and
# the augmented inputs in the data-augmentation section are built from topic features
# only — re-enabling this cell as-is would need both addressed.
# syntax_feat = SyntaxFeatures()
# # compute syntax tokens
# syntax_train_1 = syntax_feat(train_data.s1)
# syntax_train_2 = syntax_feat(train_data.s2)
# syntax_val_1 = syntax_feat(val_data.s1)
# syntax_val_2 = syntax_feat(val_data.s2)
# # mask matching syntax
# syntax_train = (syntax_train_1 == syntax_train_2).astype(int)
# syntax_val = (syntax_val_1 == syntax_val_2).astype(int)
# # append to features list
# features_train.append(syntax_train)
# features_val.append(syntax_val)

## Training without data augmentation

In [None]:
# Inputs: all registered feature blocks joined column-wise, one row per sentence pair
X_train, X_val = (
    np.concatenate(blocks, axis=1) for blocks in (features_train, features_val)
)
# Targets: the `score` of each partition
y_train, y_val = train_data.score, val_data.score

In [None]:
# Fit a random forest on the training pairs (fixed seed so runs are repeatable)
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    criterion="squared_error",
    random_state=1337,
).fit(X_train, y_train)

# Spearman rank correlation between predictions and targets, for train and validation
spearman_train, spearman_val = (
    spearmanr(model.predict(inputs), targets)[0]
    for inputs, targets in ((X_train, y_train), (X_val, y_val))
)
print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-val: {spearman_val:.4f}")

Only STS-Benchmark:
```python
spearmanrank_wo_syntax = 0.4593
spearmanrank_w_syntax = 0.4541
```

STS-B + NLI:
```python
spearmanrank_wo_syntax = 0.4586
spearmanrank_w_syntax = 0.4614
```

## Training with data augmentation

In [None]:
# Load the augmentation dataset from `data_dir`.
# The following cells read its `idx1`, `idx2` and `score` columns.
augmentation_data = pd.read_feather(data_dir / "df_augment.feather")

In [None]:
# Look up the pre-computed train topic vectors for each augmented pair:
# `idx1` picks rows of the first-sentence matrix, `idx2` rows of the second-sentence matrix,
# and the two selections are joined column-wise.
topic_probs_augmented = np.concatenate(
    (
        topic_probs_train_1[augmentation_data["idx1"]],
        topic_probs_train_2[augmentation_data["idx2"]],
    ),
    axis=1,
)

# Inputs / targets of the augmented dataset (topic features only)
X_augmented, y_augmented = topic_probs_augmented, augmentation_data["score"]
print(f"#augmented: {y_augmented.shape[0]}")

In [None]:
# Stack the augmented pairs below the original training pairs (row-wise).
X_train_w_augment = np.concatenate([X_train, X_augmented])
y_train_w_augment = np.concatenate([y_train, y_augmented])
# Report the size of the combined set, i.e. original + augmented samples.
print(f"#(train+augmented): {y_train_w_augment.shape[0]}")

In [None]:
# Fit a fresh forest (same hyper-parameters and seed as the non-augmented run)
# on the training set extended with the augmented pairs.
model = RandomForestRegressor(criterion="squared_error", n_estimators=100, max_depth=15, random_state=1337)
model.fit(X_train_w_augment, y_train_w_augment)
# Evaluate: train score on the extended set, validation score on the dev partition.
spearman_train = spearmanr(model.predict(X_train_w_augment), y_train_w_augment)[0]
# Store the validation score under the name that is printed below, so the report
# shows this model's result rather than a value left over from an earlier cell.
spearman_val = spearmanr(model.predict(X_val), y_val)[0]
print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-val: {spearman_val:.4f}")

```python
SpearmanRank = 0.4677
np.random.seed(??)
f = lambda x: np.exp(3*x) * 2  # sampling function - #1228

SpearmanRank = 0.4557
np.random.seed(42)
f = lambda x: np.exp(3*x) * 2  # sampling function - #1228

SpearmanRank = 0.4374
np.random.seed(1337)
f = lambda x: np.exp(4*x) * 2  # sampling function - #2655
```

## Qualitative analysis

In [None]:
# Validation-set predictions of whichever `model` was fitted last
# (the augmentation-trained forest when the notebook is run top to bottom).
y_pred = model.predict(X_val)

In [1]:
from sentence_similarity.data.preprocess import Pipeline, PipelineConfig

In [None]:
# Load the pipeline configuration stored in the model folder and build the pipeline from it.
config = PipelineConfig.load(model_dir / "pipeline.cfg")
pipeline = Pipeline(config)

In [None]:
# Side-by-side view per validation pair: raw sentences, their pipeline output,
# the target score and the model prediction.
df = val_data.df[["s1", "s2"]].assign(
    s1_processed=lambda frame: pipeline(frame.s1),
    s2_processed=lambda frame: pipeline(frame.s2),
    y_true=y_val,
    y_pred=y_pred,
)
# df[["root", "nsubj", "dobj"]] = [pd.Series(s) for s in syntax_val]

In [None]:
# Pairs where the model predicts a higher score than the target
df.loc[df["y_pred"] > df["y_true"]]

In [None]:
# Show the pair with the highest predicted score, transposed back into a one-row frame
df.iloc[df.y_pred.argmax()].to_frame().T

In [None]:
# Pairs where the prediction falls short of the target by more than 0.4
df.loc[(df["y_true"] - df["y_pred"]) > 0.4]

### Naive cosine similarity

In [None]:
import spacy
# Load the "en_core_web_lg" spaCy pipeline; "ner" is passed in `exclude`, so that component is left out.
nlp = spacy.load("en_core_web_lg", exclude=["ner"])

In [None]:
# Baseline without any trained regressor: spaCy's `similarity` between the two raw
# sentences of each pair, computed row by row and stored in the "tok2vec" column.
df["tok2vec"] = df[["s1", "s2"]].apply(lambda row: nlp(row.s1).similarity(nlp(row.s2)), axis=1)
# Rank correlation of that baseline with the target scores
print(f"SpearmanRank-val: {spearmanr(df.tok2vec, df.y_true)[0]:.4f}")

In [None]:
# Inspect spaCy's analysis of one example sentence:
# first line the dependency labels, second line the lemmas, both tab-separated.
doc = nlp("Blue and red plane in mid-air flight.")
print(*(token.dep_ for token in doc), sep="\t")
print(*(token.lemma_ for token in doc), sep="\t")