# Random Forest

### Requirements

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

from utils import (CoreXProbsFactory, LDAProbs, SyntaxFactory, preprocess,
                   tokenize)

# Directory that contains the `stsbenchmark/` feather files, relative to the
# notebook's working directory.
data_dir = Path(".")
# Directory holding the fitted vectorizer / topic models. Set the
# ISER_MODEL_DIR environment variable to point at a different location; the
# fallback is the previously hard-coded path so existing setups keep working.
model_dir = Path(os.environ.get("ISER_MODEL_DIR", "/home/iailab36/iser/models"))

SEED = 1337  # seed for the NumPy permutation and the estimators' random_state
COREX_HIDDEN = 64  # interpolated into the CoreX model name ("corex_n_hidden=...")
VEC_FEAT = 10_000  # interpolated into the vectorizer file name ("sts_vec=...")

In [2]:
# Read the three STS benchmark splits from <data_dir>/stsbenchmark.
# NOTE(review): train and dev are read from the "-sbert" files while test is
# read from the plain file — confirm this asymmetry is intended.
train_data = pd.read_feather(data_dir.joinpath("stsbenchmark", "sts-train-sbert.feather"))
val_data = pd.read_feather(data_dir.joinpath("stsbenchmark", "sts-dev-sbert.feather"))
test_data = pd.read_feather(data_dir.joinpath("stsbenchmark", "sts-test.feather"))

# Fold the dev split into the training frame and renumber the rows 0..n-1.
train_data = pd.concat([train_data, val_data], ignore_index=True)

In [3]:
def random_forest(SEED):
    """Build an unfitted random forest regressor (100 trees, squared-error criterion).

    Args:
        SEED: Value forwarded as ``random_state``. The upper-case name is kept
            from the former lambda so keyword calls remain valid; inside this
            function it shadows the notebook-level ``SEED``.

    Returns:
        A new ``RandomForestRegressor`` instance.
    """
    return RandomForestRegressor(criterion="squared_error", n_estimators=100, random_state=SEED)


def decision_tree(SEED):
    """Build an unfitted decision tree regressor limited to depth 5.

    Args:
        SEED: Value forwarded as ``random_state``.

    Returns:
        A new ``DecisionTreeRegressor`` instance.
    """
    return DecisionTreeRegressor(random_state=SEED, max_depth=5)


def mlp(SEED):
    """Build an unfitted MLP regressor; ``(512, 256)`` is passed as the first argument.

    Args:
        SEED: Value forwarded as ``random_state``.

    Returns:
        A new ``MLPRegressor`` instance.
    """
    return MLPRegressor((512, 256), random_state=SEED)

## Pre-compute features

In [4]:
# Per-split lists of feature blocks; each list is joined column-wise later on.
# `features_val` receives the blocks computed from `test_data`.
features_train, features_val = [], []

### topic model features

In [5]:
# Callable used below on the sentence columns to obtain topic probabilities.
# It is configured with the saved vectorizer path and the CoreX model name.
get_topic_probs = CoreXProbsFactory(
    corex_name=f"corex_n_hidden={COREX_HIDDEN}_iter=7",
    vectorizer_path=model_dir.joinpath(f"sts_vec={VEC_FEAT}"),
)

# Alternative topic model (disabled):
# get_topic_probs = LDAProbs(model_dir / f"sts_lda_hidden={COREX_HIDDEN}")

In [6]:
# Topic probabilities for each sentence column (s1, s2) of the train and test frames.
topic_probs_train_1, topic_probs_train_2 = (
    get_topic_probs(train_data[column]) for column in ("s1", "s2")
)
topic_probs_test_1, topic_probs_test_2 = (
    get_topic_probs(test_data[column]) for column in ("s1", "s2")
)
# One row per sentence pair: the s1 block followed by the s2 block.
topic_probs_train = np.concatenate((topic_probs_train_1, topic_probs_train_2), axis=1)
topic_probs_test = np.concatenate((topic_probs_test_1, topic_probs_test_2), axis=1)
# Register the blocks. NOTE: these appends accumulate, so re-running this cell
# without re-creating the lists first adds the same block a second time.
features_train.append(topic_probs_train)
features_val.append(topic_probs_test)

### syntax features

In [7]:
# get_syntax_deps = SyntaxFactory()
# # compute syntax tokens
# syntax_train_1 = get_syntax_deps(train_data.s1)
# syntax_train_2 = get_syntax_deps(train_data.s2)
# syntax_test_1 = get_syntax_deps(test_data.s1)
# syntax_test_2 = get_syntax_deps(test_data.s2)
# # mask matching syntax
# syntax_train = (syntax_train_1 == syntax_train_2).astype(int)
# syntax_test = (syntax_test_1 == syntax_test_2).astype(int)
# # append to features list
# features_train.append(syntax_train)
# features_val.append(syntax_test)

## Training without data augmentation

In [8]:
# Join the registered feature blocks column-wise into one matrix per split.
X_train, X_test = (
    np.concatenate(blocks, axis=1) for blocks in (features_train, features_val)
)
# Targets are the `score` columns of the corresponding frames.
y_train, y_test = train_data["score"], test_data["score"]
print("X_train:", X_train.shape)

X_train: (7030, 128)


In [9]:
# Shuffle the training rows with a fixed seed before fitting.
np.random.seed(SEED)
perm = np.random.permutation(X_train.shape[0])
X_train_ = X_train[perm]
# `perm` holds row positions, so select positionally. Plain `y_train[perm]` is
# a label lookup on the Series and is only correct while its index is 0..n-1.
y_train_ = y_train.iloc[perm]

model = random_forest(SEED)
model.fit(X_train_, y_train_)

# Evaluate with Spearman rank correlation on the (unshuffled) training rows
# and on the test rows.
spearman_train = spearmanr(model.predict(X_train), y_train)[0]
spearman_test = spearmanr(model.predict(X_test), y_test)[0]

print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-test: {spearman_test:.4f}")

SpearmanRank-train: 0.9807,	 SpearmanRank-test: 0.5302


## Training with data augmentation

In [10]:
# Augmentation frame, read from <data_dir>/stsbenchmark/df_augment.feather.
aug_data = pd.read_feather(data_dir.joinpath("stsbenchmark", "df_augment.feather"))

# Feature blocks for the augmented pairs are collected here.
features_aug = []

In [11]:
# Reuse the training topic probabilities: rows selected by `idx1` form the
# first-sentence block, rows selected by `idx2` the second-sentence block.
topic_probs_augmented = np.concatenate(
    (topic_probs_train_1[aug_data["idx1"]], topic_probs_train_2[aug_data["idx2"]]),
    axis=1,
)
features_aug.append(topic_probs_augmented)

In [12]:
# # syntax features
# syntax_aug = (syntax_train_1[aug_data.idx1] == syntax_train_2[aug_data.idx2]).astype(int)
# features_aug.append(syntax_aug)

In [13]:
# Feature matrix and targets for the augmented pairs.
X_aug = np.concatenate(features_aug, axis=1)
y_aug = aug_data["score"]
print(f"#augmented: {y_aug.shape[0]}")

# Stack the augmented rows underneath the original training rows.
X_train_w_aug = np.concatenate((X_train, X_aug), axis=0)
y_train_w_aug = np.concatenate((y_train, y_aug), axis=0)
print(f"#(train+augmented): {y_train_w_aug.shape[0]}")

#augmented: 1634
#(train+augmented): 8664


In [14]:
# np.random.seed(SEED)
# perm = np.random.permutation(X_train_w_aug.shape[0])
# X_train_w_aug_ = X_train_w_aug[perm]
# y_train_w_aug_ = y_train_w_aug[perm]

# model = random_forest(SEED)
# model.fit(X_train_w_aug_, y_train_w_aug_)

# # evaluate model
# spearman_train = spearmanr(model.predict(X_train), y_train)[0]
# spearman_test = spearmanr(model.predict(X_test), y_test)[0]

# print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-test: {spearman_test:.4f}")

## Qualitative analysis

In [15]:
# # unit vectors
# x1 = topic_probs_train_1 / np.linalg.norm(topic_probs_train_1, axis=1)[:, None]
# x2 = topic_probs_train_2 / np.linalg.norm(topic_probs_train_2, axis=1)[:, None]

# y_naiv = (x1[:, None, ...] @ x2[..., None]).squeeze()

In [16]:
# Disabled experiment: min-max scaled Wasserstein distance between the topic
# vectors of the two sentences, stored as `y_naiv`.
# from scipy.stats import wasserstein_distance
# from sklearn.preprocessing import MinMaxScaler

# # softmax
# # NOTE(review): `.sum(0)` normalizes each column across all rows, while the
# # distance below compares row i of x1 with row i of x2. A per-row softmax
# # would need `.sum(1, keepdims=True)` — confirm before re-enabling.
# x1 = np.exp(topic_probs_train_1)/np.exp(topic_probs_train_1).sum(0)
# x2 = np.exp(topic_probs_train_2)/np.exp(topic_probs_train_2).sum(0)

# dists = np.array([wasserstein_distance(x1[i], x2[i]) for i in range(x1.shape[0])])

# mms = MinMaxScaler()
# y_naiv = mms.fit_transform(dists.reshape(-1, 1)).squeeze()

In [17]:
def _token_overlap(tokens1, tokens2):
    """Score how many entries of ``tokens1`` also occur in ``tokens2``.

    The match count is divided by the length of the longer sequence.

    Args:
        tokens1: Sized iterable of tokens for the first sentence.
        tokens2: Sized container of tokens for the second sentence.

    Returns:
        The overlap ratio as a float. Two empty sequences score 0.0 instead of
        dividing by zero, which previously produced NaN and would turn the
        rank correlation computed from these scores into NaN as well.
    """
    longest = max(len(tokens1), len(tokens2))
    if longest == 0:
        return 0.0
    return sum(token in tokens2 for token in tokens1) / longest


# Tokenize both sentences of every test pair.
processed_1 = test_data.s1.apply(tokenize)
processed_2 = test_data.s2.apply(tokenize)

# Naive baseline: one token-overlap score per test pair.
y_naiv = np.array(
    [_token_overlap(tokens1, tokens2) for tokens1, tokens2 in zip(processed_1, processed_2)],
    dtype=float,
)

In [19]:
# Rank correlation of the token-overlap baseline with the gold scores.
# NOTE(review): the printed label says "val" although `y_test` is built from
# the test frame; the label is kept unchanged to match the recorded output.
spearman_val, _pvalue = spearmanr(y_naiv, y_test)
print(f"SpearmanRank-val: {spearman_val:.4f}")

SpearmanRank-val: 0.6387


In [20]:
# Predictions of the fitted model on the test features, and their rank
# correlation with the gold scores.
# NOTE(review): the printed label says "val" although the data is the test
# split; the label is kept unchanged to match the recorded output.
y_pred = model.predict(X_test)
spearman_val, _pvalue = spearmanr(y_pred, y_test)
print(f"SpearmanRank-val: {spearman_val:.4f}")

SpearmanRank-val: 0.5302


In [21]:
# Side-by-side table of the test pairs: raw sentences, their preprocessed
# form, gold score, model prediction and token-overlap baseline.
df = test_data[["s1", "s2"]].assign(
    s1_processed=test_data["s1"].apply(preprocess),
    s2_processed=test_data["s2"].apply(preprocess),
    y_true=y_test,
    y_pred=y_pred,
    y_naiv=y_naiv,
)
# df[["root", "nsubj", "dobj"]] = [pd.Series(s) for s in syntax_test]

In [22]:
# Show the pairs whose prediction differs from the gold score by more than 0.4.
df.loc[(df["y_true"] - df["y_pred"]).abs().gt(0.4)]

Unnamed: 0,s1,s2,s1_processed,s2_processed,y_true,y_pred,y_naiv
38,A woman is dancing in the rain.,A woman dances in the rain out side.,woman danc rain .,woman danc rain side .,1.00,0.545929,0.800000
61,A train is moving.,A man is doing yoga.,train move .,man yoga .,0.00,0.482292,0.333333
65,A man is holding a leaf.,A monkey is fighting a man.,man hold leaf .,monkey fight man .,0.00,0.482234,0.500000
66,A woman is peeling shrimp.,A man is squeezing water.,woman peel shrimp .,man squeez water .,0.04,0.541840,0.250000
72,A woman opens a window.,A man is crawling.,woman open window .,man crawl .,0.00,0.488378,0.250000
...,...,...,...,...,...,...,...
1065,5 nations meet on haze,Putin's marriage at an end,num nation meet haze,putin marriag end,0.00,0.470302,0.000000
1072,Prominent AIDS researchers killed in Malaysian...,Top AIDS Researcher Killed in Malaysia Plane C...,promin aid research kill malaysian plane crash,top aid research kill malaysia plane crash,1.00,0.478477,0.714286
1075,Protests continue in tense Ukraine capital,Protests Continue In Ukraine's Capital,protest continu tens ukrain capit,protest continu ukrain capit,1.00,0.535061,0.800000
1089,"3 dead, 4 missing in central China constructio...","One dead, 8 missing in Vietnam boat accident","num dead , num miss central china construct accid","one dead , num miss vietnam boat accid",0.08,0.596987,0.666667


In [None]:
# "ad" in get_topic_probs.vectorizer.get_feature_names()

### Naive cosine similarity

In [None]:
# import spacy
# nlp = spacy.load("en_core_web_lg", exclude=["ner"])

In [None]:
# df["tok2vec"] = df[["s1", "s2"]].apply(lambda row: nlp(row.s1).similarity(nlp(row.s2)), axis=1)
# print(f"SpearmanRank-val: {spearmanr(df.tok2vec, df.y_true)[0]:.4f}")

In [None]:
# doc = nlp("Blue and red plane in mid-air flight.")
# print("\t".join([token.dep_ for token in doc]))
# print("\t".join([token.lemma_ for token in doc]))