# Random Forest

### Requirements

In [58]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from torchmetrics import Accuracy, SpearmanCorrcoef
from tqdm.notebook import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import spearmanr

from sentence_similarity.data import (CoreXFeatures, LDAFeatures, PreprocessingModule,
                                      STSBenchmark, SyntaxFeatures)
from sentence_similarity.data import PipelineConfig, Pipeline
from sentence_similarity.corex import train_corex_model

data_dir = Path('data')
assert data_dir.exists()
output_dir = Path('data/output')
output_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Load benchmark dataset
train_data = STSBenchmark(data_dir, partition="train")
test_data = STSBenchmark(data_dir, partition="test")

## Preprocessing, feature computation

In [60]:
# train corex topic model
config = PipelineConfig(
    filtered_pos_tags=[],
    use_lemmas=True,
    remove_stop_words=True,
    remove_numbers=False,
    remove_symbols=False,
    remove_punctuation=False,
)
pipeline = Pipeline(config)
# train_corex_model(data_dir, config, train_data)

In [4]:
# features for the model
corex_feat = CoreXFeatures(data_dir)
syntax_feat = SyntaxFeatures()
print(f"#topics: {corex_feat.input_size}")

#topics: 50


In [5]:
# compute topic probabilities
topic_probs_train_1 = corex_feat(train_data.s1)
topic_probs_train_2 = corex_feat(train_data.s2)
topic_probs_test_1 = corex_feat(test_data.s1)
topic_probs_test_2 = corex_feat(test_data.s2)
# concatenate topics of the two sentences
topic_probs_train = np.concatenate([topic_probs_train_1, topic_probs_train_2], axis=1)
topic_probs_test = np.concatenate([topic_probs_test_1, topic_probs_test_2], axis=1)

Preprocessing: 100%|████████████████████████████████| 5552/5552 [00:03<00:00, 1582.72it/s]
Preprocessing: 100%|████████████████████████████████| 5552/5552 [00:02<00:00, 1926.65it/s]
Preprocessing: 100%|████████████████████████████████| 1095/1095 [00:00<00:00, 1822.85it/s]
Preprocessing: 100%|████████████████████████████████| 1095/1095 [00:00<00:00, 1927.22it/s]


In [6]:
# compute syntax tokens
syntax_train_1 = syntax_feat(train_data.s1)
syntax_train_2 = syntax_feat(train_data.s2)
syntax_test_1 = syntax_feat(test_data.s1)
syntax_test_2 = syntax_feat(test_data.s2)
# mask matching syntax
syntax_train = (syntax_train_1 == syntax_train_2).astype(int)
syntax_test = (syntax_test_1 == syntax_test_2).astype(int)

## Training without data augmentation

In [7]:
# create input vectors
X_train = topic_probs_train  # np.concatenate([topic_probs_train, syntax_train], axis=1)
X_test = topic_probs_test  # np.concatenate([topic_probs_test, syntax_test], axis=1)
# create targets
y_train = train_data.score
y_test = test_data.score

In [44]:
# train model
model = RandomForestRegressor(criterion="squared_error", n_estimators=100, random_state=1337)
model.fit(X_train, y_train)
# evaluate model
spearman_train = spearmanr(model.predict(X_train), y_train)[0]
spearman_test = spearmanr(model.predict(X_test), y_test)[0]
print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-test: {spearman_test:.4f}")

SpearmanRank-train: 0.9774,	 SpearmanRank-test: 0.4593


```python
Spearmanrank_wo_syntax = 0.4593
Spearmanrank_w_syntax = 0.4541
```

## Training with data augmentation

In [37]:
# load augmentation dataset
augmentation_data = pd.read_feather(data_dir / "df_augment.feather")

In [38]:
# get topics of the augmented sentences
topic_probs_augmented = np.concatenate([
    topic_probs_train_1[augmentation_data.idx1],
    topic_probs_train_2[augmentation_data.idx2]
], axis=1)

# get dependencies of the augmented sentences
syntax_augmented = syntax_train_1[augmentation_data.idx1] == syntax_train_2[augmentation_data.idx2]

# create inputs / targets of augmented dataset
X_augmented = topic_probs_augmented  # np.concatenate([topic_probs_augmented, syntax_augmented], axis=1)
y_augmented = augmentation_data.score
print(f"#augmented: {y_augmented.shape[0]}")

#augmented: 1228


In [39]:
X_train_w_augment = np.concatenate([X_train, X_augmented])
y_train_w_augment = np.concatenate([y_train, y_augmented])
print(f"#(train+augmented): {y_augmented.shape[0]}")

#(train+augmented): 1228


In [45]:
# train model
model = RandomForestRegressor(criterion="squared_error", n_estimators=100, random_state=1337)
model.fit(X_train_w_augment, y_train_w_augment)
# evaluate model
spearman_train = spearmanr(model.predict(X_train_w_augment), y_train_w_augment)[0]
spearman_test = spearmanr(model.predict(X_test), y_test)[0]
print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-test: {spearman_test:.4f}")

SpearmanRank-train: 0.9689,	 SpearmanRank-test: 0.4557


```python
SpearmanRank = 0.4677
np.random.seed(??)
f = lambda x: np.exp(3*x) * 2  # sampling function - #1228

SpearmanRank = 0.4557
np.random.seed(42)
f = lambda x: np.exp(3*x) * 2  # sampling function - #1228

SpearmanRank = 0.4374
np.random.seed(1337)
f = lambda x: np.exp(4*x) * 2  # sampling function - #2655
```

## Qualitative analysis

In [41]:
# get predictions
y_pred = model.predict(X_test)

In [64]:
df = test_data.df[["s1", "s2"]].copy()
df["s1_processed"] = pipeline(df.s1)
df["s2_processed"] = pipeline(df.s2)
df["y_true"] = y_test
df["y_pred"] = y_pred
df[["root", "nsubj", "dobj"]] = [pd.Series(s) for s in syntax_test]

Preprocessing: 100%|████████████████████████████████| 1095/1095 [00:00<00:00, 2124.94it/s]
Preprocessing: 100%|████████████████████████████████| 1095/1095 [00:00<00:00, 2140.75it/s]


In [65]:
df[(df.y_true - df.y_pred) > 0.5]

Unnamed: 0,s1,s2,s1_processed,s2_processed,y_true,y_pred,root,nsubj,dobj
290,Blue and red plane in mid-air flight.,a blue and red airplane while in flight.,blue red plane mid-air flight.,blue red airplane flight.,0.96,0.456282,1,1,1
327,Three goats are being rounded up by a dog.,Three goats are chased by a dog.,goat round dog.,goat chase dog.,0.92,0.409502,1,1,1
331,A blue bird standing on a lawn.,Blue bird standing on green grass.,blue bird stand lawn.,blue bird stand green grass.,0.92,0.367149,0,0,1
429,A man and a dog on rocks on a beach.,A man and dog on a rocky seashore.,man dog rock beach.,man dog rocky seashore.,0.92,0.417146,1,1,1
482,A man in a black suit is surfing along a crash...,A surfer wearing a black wet suit is riding a ...,man black suit surf crash wave.,surfer wear black wet suit ride white wave ocean.,0.92,0.418285,1,1,1
864,"""The economy, nonetheless, has yet to exhibit ...",But the economy hasn't shown signs of sustaina...,"""economy, nonetheless, exhibit sustainable gro...",economy show sign sustainable growth.,0.95,0.436345,1,1,1
898,Gu Kailai murder trial ends in China,Gu Kailai's Murder Trial Ends in China,Gu Kailai murder trial end China,Gu KailaiMurder Trial Ends China,1.0,0.463862,0,0,1
1075,Protests continue in tense Ukraine capital,Protests Continue In Ukraine's Capital,protest continue tense Ukraine capital,protest continue Ukrainecapital,1.0,0.44197,1,1,1


### Naive cosine similarity

In [66]:
import spacy
nlp = spacy.load("en_core_web_lg", exclude=["ner"])

In [70]:
df["tok2vec"] = df[["s1", "s2"]].apply(lambda row: nlp(row.s1).similarity(nlp(row.s2)), axis=1)
print(f"SpearmanRank-test: {spearmanr(df.tok2vec, df.y_true)[0]:.4f}")

SpearmanRank-test: 0.5218


In [73]:
doc = nlp("Blue and red plane in mid-air flight.")
print("\t".join([token.dep_ for token in doc]))
print("\t".join([token.lemma_ for token in doc]))

amod	cc	conj	ROOT	prep	amod	amod	compound	pobj	punct
blue	and	red	plane	in	mid	-	air	flight	.
