# Random Forest

### Requirements

In [67]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import spearmanr, pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from utils import CoreXProbsFactory, preprocess, SyntaxFactory

data_dir = Path("/home/iailab36/iser/data")
model_dir = Path("/home/iailab36/iser/models")

In [2]:
# Load benchmark dataset
train_data = pd.read_feather(data_dir / "stsbenchmark" / "sts-train.feather")
val_data = pd.read_feather(data_dir / "stsbenchmark" / "sts-dev.feather")

## Pre-compute features

In [60]:
features_train = []
features_val = []

### topic model features

In [71]:
get_topic_probs = CoreXProbsFactory(
    vectorizer_path=model_dir / "sts_vec=10000",
    corex_name="corex_n_hidden=128",
)

In [77]:
# compute topic probabilities
topic_probs_train_1 = get_topic_probs(train_data.s1)
topic_probs_train_2 = get_topic_probs(train_data.s2)
topic_probs_val_1 = get_topic_probs(val_data.s1)
topic_probs_val_2 = get_topic_probs(val_data.s2)
# concatenate topics of the two sentences
topic_probs_train = np.concatenate([topic_probs_train_1, topic_probs_train_2], axis=1)
topic_probs_val = np.concatenate([topic_probs_val_1, topic_probs_val_2], axis=1)
# add to features list
features_train.append(topic_probs_train)
features_val.append(topic_probs_val)

# topic_probs_train_1 = topic_probs_train_1 / np.linalg.norm(topic_probs_train_1, axis=1)[:, None]
# topic_probs_train_2 = topic_probs_train_2 / np.linalg.norm(topic_probs_train_2, axis=1)[:, None]

# topic_probs_val_1 = topic_probs_val_1 / np.linalg.norm(topic_probs_val_1, axis=1)[:, None]
# topic_probs_val_2 = topic_probs_val_2 / np.linalg.norm(topic_probs_val_2, axis=1)[:, None]

### syntax features

In [78]:
get_syntax_deps = SyntaxFactory()
# compute syntax tokens
syntax_train_1 = get_syntax_deps(train_data.s1)
syntax_train_2 = get_syntax_deps(train_data.s2)
syntax_val_1 = get_syntax_deps(val_data.s1)
syntax_val_2 = get_syntax_deps(val_data.s2)
# mask matching syntax
syntax_train = (syntax_train_1 == syntax_train_2).astype(int)
syntax_val = (syntax_val_1 == syntax_val_2).astype(int)
# append to features list
features_train.append(syntax_train)
features_val.append(syntax_val)

## Training without data augmentation

In [79]:
# create input vectors
X_train = np.concatenate(features_train, axis=1)
X_val = np.concatenate(features_val, axis=1)
# create targets
y_train = train_data.score
y_val = val_data.score
print("X_train:", X_train.shape)

X_train: (5552, 865)


In [80]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)

In [81]:
# train model
model = RandomForestRegressor(criterion="squared_error", n_estimators=100, random_state=1337)
# model = DecisionTreeRegressor(random_state=1337)
# model = MLPRegressor((512, 256, 128))
model.fit(X_train, y_train)
# evaluate model
spearman_train = spearmanr(model.predict(X_train), y_train)[0]
spearman_val = spearmanr(model.predict(X_val), y_val)[0]
print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-val: {spearman_val:.4f}")

SpearmanRank-train: 0.9791,	 SpearmanRank-val: 0.4904


Only STS-Benchmark:
```python
spearmanrank_wo_syntax = 0.4593
spearmanrank_w_syntax = 0.4541
```

STS-B + NLI:
```python
spearmanrank_wo_syntax = 0.4586
spearmanrank_w_syntax = 0.4614
```

## Training with data augmentation

In [None]:
# # load augmentation dataset
# aug_data = pd.read_feather(benchmark_dir / "df_augment.feather")

# features_aug = []

In [None]:
# # get topics of the augmented sentences
# topic_probs_augmented = np.concatenate([
#     topic_probs_train_1[aug_data.idx1],
#     topic_probs_train_2[aug_data.idx2]
# ], axis=1)
# features_aug.append(topic_probs_augmented)

In [None]:
# syntax_aug = (syntax_train_1[aug_data.idx1] == syntax_train_2[aug_data.idx2]).astype(int)
# features_aug.append(syntax_aug)

In [None]:
# # create inputs / targets of augmented dataset
# X_aug = np.concatenate(features_aug, axis=1)
# y_aug = aug_data.score
# print(f"#augmented: {y_aug.shape[0]}")

In [None]:
# X_train_w_aug = np.concatenate([X_train, X_aug])
# y_train_w_aug = np.concatenate([y_train, y_aug])
# print(f"#(train+augmented): {y_aug.shape[0]}")

In [None]:
# # train model
# model = RandomForestRegressor(criterion="squared_error", n_estimators=100, max_depth=15, random_state=1337)
# model.fit(X_train_w_aug, y_train_w_aug)
# # evaluate model
# spearman_train = spearmanr(model.predict(X_train_w_aug), y_train_w_aug)[0]
# spearman_test = spearmanr(model.predict(X_val), y_val)[0]
# print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-val: {spearman_val:.4f}")

```python
SpearmanRank = 0.4677
np.random.seed(??)
f = lambda x: np.exp(3*x) * 2  # sampling function - #1228

SpearmanRank = 0.4557
np.random.seed(42)
f = lambda x: np.exp(3*x) * 2  # sampling function - #1228

SpearmanRank = 0.4374
np.random.seed(1337)
f = lambda x: np.exp(4*x) * 2  # sampling function - #2655
```

## Qualitative analysis

In [19]:
topic_probs_train_1 = np.exp(topic_probs_train_1)/np.exp(topic_probs_train_1).sum(0)
topic_probs_train_2 = np.exp(topic_probs_train_2)/np.exp(topic_probs_train_2).sum(0)

In [20]:
x1 = topic_probs_val_1 / np.linalg.norm(topic_probs_val_1, axis=1)[:, None]
x2 = topic_probs_val_2 / np.linalg.norm(topic_probs_val_2, axis=1)[:, None]

In [23]:
from scipy.stats import wasserstein_distance



0.005397021006452231

In [32]:
dists = np.array([wasserstein_distance(topic_probs_val_1[i], topic_probs_val_2[i]) for i in range(len(topic_probs_val_1))])

In [37]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
y_naiv = mms.fit_transform(dists.reshape(-1, 1)).squeeze()

In [53]:
y_naiv = (x1[:, None, ...] @ x2[..., None]).squeeze()

spearman_val = spearmanr(y_naiv, y_val)[0]
pearson_val = pearsonr(y_naiv, y_val)[0]
print(f"SpearmanRank-val: {spearman_val:.4f}")
print(f"PearsonRank-val: {pearson_val:.4f}")

SpearmanRank-val: 0.3802
PearsonRank-val: 0.2768


In [54]:
y_pred = model.predict(X_val)
spearman_val = spearmanr(y_pred, y_val)[0]
pearson_val = pearsonr(y_pred, y_val)[0]
print(f"SpearmanRank-val: {spearman_val:.4f}")
print(f"PearsonRank-val: {pearson_val:.4f}")

SpearmanRank-val: 0.4984
PearsonRank-val: 0.5085


In [57]:
df = val_data[["s1", "s2"]].copy()
df["s1_processed"] = df.s1.apply(preprocess)
df["s2_processed"] = df.s2.apply(preprocess)
df["y_true"] = y_val
df["y_pred"] = y_pred
df["y_naiv"] = y_naiv
df[["root", "nsubj", "dobj"]] = [pd.Series(s) for s in syntax_val]

In [58]:
df[(df.y_true - df.y_pred).abs() > 0.4]

Unnamed: 0,s1,s2,s1_processed,s2_processed,y_true,y_pred,y_naiv,root,nsubj,dobj
8,Three men are playing guitars.,Three men are on stage playing guitars.,three men play guitar .,three men stage play guitar .,0.75,0.253675,0.998823,0,1,1
13,The man cut down a tree with an axe.,A man chops down a tree with an axe.,man cut tree axe .,man chop tree axe .,1.00,0.587711,0.999436,0,1,1
31,A band is performing on a stage.,A band is playing onstage.,band perform stage .,band play onstag .,1.00,0.513099,0.979967,0,1,1
34,A man sitting on the floor plays a guitar.,A man sitting on the floor in a room is strumm...,man sit floor play guitar .,man sit floor room strum guitar .,0.96,0.360893,0.998714,0,1,1
36,A man plays the violin.,A man is playing violin.,man play violin .,man play violin .,1.00,0.551297,1.000000,1,1,1
...,...,...,...,...,...,...,...,...,...,...
1456,All eyes on new Pak army chief Raheel Sharif,Flood alerts in France as river levels rise,eye new pak armi chief raheel sharif,flood alert franc river level rise,0.00,0.452157,0.952359,0,0,1
1468,Thai government says elections Feb. 2,Philippines government says peace talks with c...,thai govern say elect feb. num,philippin govern say peac talk communist fail,0.00,0.581147,0.895034,1,0,1
1469,Colorado Governor Visits School Shooting Victim,Colorado governor visits school shooting victim,colorado governor visit school shoot victim,colorado governor visit school shoot victim,1.00,0.492724,1.000000,0,0,1
1475,Algeria president gets therapy after stroke: s...,Bulgarian president tries to break election st...,algeria presid get therapi stroke : state media,bulgarian presid tri break elect stalem,0.08,0.509560,0.967273,0,1,1


In [30]:
"ad" in get_topic_probs.vectorizer.get_feature_names()



True

In [None]:
get_syntax_deps(val_data.s1)[7]

In [87]:
df[(df.y_true - df.y_pred) < 0.1]

Unnamed: 0,s1,s2,s1_processed,s2_processed,y_true,y_pred,y_naiv,root,nsubj,dobj
3,A woman is playing the guitar.,A man is playing guitar.,woman play guitar .,man play guitar .,0.4800,0.491335,0.999893,1,0,1
4,A woman is playing the flute.,A man is playing a flute.,woman play flute .,man play flute .,0.5500,0.591738,0.999916,1,0,1
5,A woman is cutting an onion.,A man is cutting onions.,woman cut onion .,man cut onion .,0.5230,0.664033,0.999944,1,0,1
7,A woman is carrying a boy.,A woman is carrying her baby.,woman carri boy .,woman carri babi .,0.4666,0.448435,0.998882,1,1,1
10,People are playing cricket.,Men are playing cricket.,peopl play cricket .,men play cricket .,0.6400,0.652883,0.999716,1,0,1
...,...,...,...,...,...,...,...,...,...,...
1468,Thai government says elections Feb. 2,Philippines government says peace talks with c...,thai govern say elect feb. num,philippin govern say peac talk communist fail,0.0000,0.499054,0.895034,1,0,1
1470,Pakistani Taliban chief Hakimullah Mehsud kill...,"Pakistani Taliban 'elect new chief', vow 'unpr...",pakistani taliban chief hakimullah mehsud kill...,"pakistani taliban elect new chief , vow unprec...",0.2400,0.571863,0.939900,0,0,1
1471,House blaze kills 7 in northern Pakistan,US drone strike kills 8 in NW Pakistan,hous blaze kill num northern pakistan,us drone strike kill num nw pakistan,0.2800,0.468497,0.991205,1,0,1
1475,Algeria president gets therapy after stroke: s...,Bulgarian president tries to break election st...,algeria presid get therapi stroke : state media,bulgarian presid tri break elect stalem,0.0800,0.449823,0.967273,0,1,1


### Naive cosine similarity

In [None]:
import spacy
nlp = spacy.load("en_core_web_lg", exclude=["ner"])

In [None]:
df["tok2vec"] = df[["s1", "s2"]].apply(lambda row: nlp(row.s1).similarity(nlp(row.s2)), axis=1)
print(f"SpearmanRank-val: {spearmanr(df.tok2vec, df.y_true)[0]:.4f}")

In [None]:
doc = nlp("Blue and red plane in mid-air flight.")
print("\t".join([token.dep_ for token in doc]))
print("\t".join([token.lemma_ for token in doc]))