# Random Forest

### Requirements

In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import torch
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer

from utils import (CoreXProbsFactory, LDAProbs, SyntaxFactory, preprocess,
                   tokenize)

# Root of the benchmark data, resolved relative to the working directory.
data_dir = Path("data")
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere unless the trained vectorizer / CoreX models are placed here.
model_dir = Path("/home/iailab36/iser/models")

# Seed handed to np.random.seed and to the regressor factories further down.
SEED = 1337
# Interpolated into the CoreX model name ("corex_n_hidden=..._iter=7").
COREX_HIDDEN = 128
# Interpolated into the vectorizer path ("sts_vec=...").
VEC_FEAT = 10_000

In [3]:
# Read the three STS benchmark splits from their feather files.
sts_dir = data_dir / "stsbenchmark"
train_data = pd.read_feather(sts_dir / "sts-train.feather")
val_data = pd.read_feather(sts_dir / "sts-dev.feather")
test_data = pd.read_feather(sts_dir / "sts-test.feather")

# Fold the dev split into the training split and renumber rows from 0,
# so the notebook trains on train+dev and evaluates on test only.
train_data = pd.concat([train_data, val_data]).reset_index(drop=True)

## Pre-compute features

In [4]:
# Accumulators for per-feature matrices; each later feature cell appends one
# matrix per split. Note that `features_val` is filled from the *test* split.
features_train, features_val = [], []

### topic model features

In [5]:
# Build the callable that is applied to the s1/s2 sentence columns below to
# obtain their topic probabilities.
vectorizer_path = model_dir / f"sts_vec={VEC_FEAT}"
corex_name = f"corex_n_hidden={COREX_HIDDEN}_iter=7"
get_topic_probs = CoreXProbsFactory(vectorizer_path=vectorizer_path, corex_name=corex_name)

# Alternative topic model (disabled):
# get_topic_probs = LDAProbs(model_dir / f"sts_lda_hidden={COREX_HIDDEN}")

In [10]:
# Topic probabilities for each sentence column: train split first, then test.
topic_probs_train_1, topic_probs_train_2 = (
    get_topic_probs(sentences) for sentences in (train_data.s1, train_data.s2)
)
topic_probs_test_1, topic_probs_test_2 = (
    get_topic_probs(sentences) for sentences in (test_data.s1, test_data.s2)
)

# One row per sentence pair: [topics(s1) | topics(s2)].
topic_probs_train = np.concatenate((topic_probs_train_1, topic_probs_train_2), axis=1)
topic_probs_test = np.concatenate((topic_probs_test_1, topic_probs_test_2), axis=1)

# Register the feature block for both splits.
# (Appending is not idempotent: re-running this cell adds the block again.)
features_train.append(topic_probs_train)
features_val.append(topic_probs_test)

### syntax features

In [11]:
# get_syntax_deps = SyntaxFactory()
# # compute syntax tokens
# syntax_train_1 = get_syntax_deps(train_data.s1)
# syntax_train_2 = get_syntax_deps(train_data.s2)
# syntax_test_1 = get_syntax_deps(test_data.s1)
# syntax_test_2 = get_syntax_deps(test_data.s2)
# # mask matching syntax
# syntax_train = (syntax_train_1 == syntax_train_2).astype(int)
# syntax_test = (syntax_test_1 == syntax_test_2).astype(int)
# # append to features list
# features_train.append(syntax_train)
# features_val.append(syntax_test)

## Training without data augmentation

In [12]:
# Targets: the gold scores of each split.
y_train, y_test = train_data.score, test_data.score
# Inputs: all registered feature blocks side by side, one row per sentence pair.
X_train, X_test = (
    np.concatenate(blocks, axis=1) for blocks in (features_train, features_val)
)
print("X_train:", X_train.shape)

X_train: (7030, 256)


In [21]:
# Factories returning a fresh, unfitted regressor. They are plain functions
# rather than lambdas bound to names, so they carry a real __name__ and a
# docstring. The parameter keeps its original name `SEED` so existing
# positional and keyword calls behave exactly as before.


def random_forest(SEED):
    """Return a 100-tree RandomForestRegressor (squared-error criterion) seeded with SEED."""
    return RandomForestRegressor(criterion="squared_error", n_estimators=100, random_state=SEED)


def decision_tree(SEED):
    """Return a DecisionTreeRegressor limited to depth 5, seeded with SEED."""
    return DecisionTreeRegressor(random_state=SEED, max_depth=5)


def mlp(SEED):
    """Return an MLPRegressor with hidden layers (512, 256), seeded with SEED."""
    return MLPRegressor((512, 256), random_state=SEED)


def knn(SEED):
    """Return a 5-neighbour, uniformly weighted KNeighborsRegressor.

    SEED is accepted only so that all four factories share one call
    signature; it is not forwarded to the estimator.
    """
    return KNeighborsRegressor(n_neighbors=5, weights="uniform")

In [22]:
# Shuffle the training pairs with a fixed seed so the run is repeatable.
np.random.seed(SEED)
perm = np.random.permutation(X_train.shape[0])
X_train_ = X_train[perm]
# y_train is a pandas Series: plain `y_train[perm]` selects by index *label*,
# which only matches the positional X_train[perm] while the index is exactly
# 0..n-1. `.iloc` is positional and keeps inputs and targets aligned regardless.
y_train_ = y_train.iloc[perm]

model = knn(SEED)
model.fit(X_train_, y_train_)

# evaluate model (on the unshuffled train matrix and on the held-out test split)
spearman_train = spearmanr(model.predict(X_train), y_train)[0]
spearman_test = spearmanr(model.predict(X_test), y_test)[0]

print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-test: {spearman_test:.4f}")

SpearmanRank-train: 0.6221,	 SpearmanRank-test: 0.3505


## Training with data augmentation

In [41]:
# Feature blocks for the augmented pairs are collected here, mirroring
# features_train / features_val.
features_aug = []

# Augmentation table; later cells read its idx1, idx2 and score columns.
aug_data = pd.read_feather(data_dir.joinpath("stsbenchmark", "df_augment.feather"))

In [42]:
# Reuse the training-set topic probabilities: each augmented pair combines
# row idx1 of the s1 topics with row idx2 of the s2 topics.
aug_pair_topics = [
    topic_probs_train_1[aug_data.idx1],
    topic_probs_train_2[aug_data.idx2],
]
topic_probs_augmented = np.concatenate(aug_pair_topics, axis=1)
# (Appending is not idempotent: re-running this cell adds the block again.)
features_aug.append(topic_probs_augmented)

In [43]:
# Number of rows in the augmentation table.
len(aug_data)

438

In [44]:
# # syntax features
# syntax_aug = (syntax_train_1[aug_data.idx1] == syntax_train_2[aug_data.idx2]).astype(int)
# features_aug.append(syntax_aug)

In [45]:
# Targets and inputs of the augmented pairs.
y_aug = aug_data.score
X_aug = np.concatenate(features_aug, axis=1)
print(f"#augmented: {y_aug.shape[0]}")

# Stack the augmented rows underneath the original training rows.
X_train_w_aug, y_train_w_aug = (
    np.concatenate(parts) for parts in ([X_train, X_aug], [y_train, y_aug])
)
print(f"#(train+augmented): {y_train_w_aug.shape[0]}")

#augmented: 438
#(train+augmented): 7468


In [None]:
# Seeded shuffle of the combined (train + augmented) rows.
np.random.seed(SEED)
perm = np.random.permutation(len(X_train_w_aug))
X_train_w_aug_, y_train_w_aug_ = X_train_w_aug[perm], y_train_w_aug[perm]

# Fit a fresh random forest on the shuffled, augmented data.
model = random_forest(SEED)
model.fit(X_train_w_aug_, y_train_w_aug_)

# Score on the original (non-augmented) train rows and on the test split.
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
spearman_train = spearmanr(train_predictions, y_train)[0]
spearman_test = spearmanr(test_predictions, y_test)[0]

print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-test: {spearman_test:.4f}")

## Qualitative analysis

In [None]:
# # unit vectors
# x1 = topic_probs_train_1 / np.linalg.norm(topic_probs_train_1, axis=1)[:, None]
# x2 = topic_probs_train_2 / np.linalg.norm(topic_probs_train_2, axis=1)[:, None]

# y_naiv = (x1[:, None, ...] @ x2[..., None]).squeeze()

In [None]:
# from scipy.stats import wasserstein_distance
# from sklearn.preprocessing import MinMaxScaler

# # softmax
# x1 = np.exp(topic_probs_train_1)/np.exp(topic_probs_train_1).sum(0)
# x2 = np.exp(topic_probs_train_2)/np.exp(topic_probs_train_2).sum(0)

# dists = np.array([wasserstein_distance(x1[i], x2[i]) for i in range(x1.shape[0])])

# mms = MinMaxScaler()
# y_naiv = mms.fit_transform(dists.reshape(-1, 1)).squeeze()

In [30]:
def _unit_rows(counts):
    """Scale each row of `counts` to unit L2 norm, leaving all-zero rows as zeros.

    A plain `counts / norm` divides by zero for a sentence with no
    in-vocabulary token and produces NaN similarity scores for that pair.
    Dividing such rows by 1 instead gives them a cosine similarity of 0.
    """
    norms = np.linalg.norm(counts, axis=1)[:, None]
    return counts / np.where(norms == 0, 1, norms)


# Second argument handed to preprocess(); presumably toggles stop-word
# removal (TODO confirm against utils.preprocess).
sw = True

# Bag-of-words vocabulary is learned from the training sentences only.
# vectorizer = TfidfVectorizer()
vectorizer = CountVectorizer()
vectorizer.fit(pd.concat([train_data.s1, train_data.s2]).apply(lambda s: preprocess(s, sw)))
bow1 = vectorizer.transform(test_data.s1.apply(lambda s: preprocess(s, sw))).toarray()
bow2 = vectorizer.transform(test_data.s2.apply(lambda s: preprocess(s, sw))).toarray()

bow1 = _unit_rows(bow1)
bow2 = _unit_rows(bow2)

# Row-wise dot product of unit vectors = cosine similarity per sentence pair.
y_naiv = (bow1[:, None, ...] @ bow2[..., None]).squeeze()

# NOTE: the score is computed on the test split although the label says "val".
spearman_val = spearmanr(y_naiv, y_test)[0]
print(f"Naive SpearmanRank-val: {spearman_val:.4f}")

Naive SpearmanRank-val: 0.6867


In [39]:
# Worked example: cosine similarity between two hand-written token strings.
bow1 = vectorizer.transform([preprocess("man play keyboard piano", sw)]).toarray()
bow2 = vectorizer.transform([preprocess("man play music keyboard guitar", sw)]).toarray()

bow1 = bow1 / np.linalg.norm(bow1, axis=1)[:, None]
bow2 = bow2 / np.linalg.norm(bow2, axis=1)[:, None]

# Store the scalar under its own name: `y_naiv` holds the per-pair test-set
# scores that the scatter plot and the analysis DataFrame further down still
# read, and overwriting it here breaks a top-to-bottom run.
sim_normalized_example = (bow1[:, None, ...] @ bow2[..., None]).squeeze()
sim_normalized_example

array(0.67082039)

In [40]:
# Show what preprocess() returns for a single raw sentence.
preprocess("a woman is climbing a cliff", sw)

'woman climb cliff'

In [37]:
# Worked example without normalisation: the dot product of the raw count
# vectors, i.e. the (count-weighted) number of shared vocabulary tokens.
bow1 = vectorizer.transform([preprocess("man play keyboard piano", sw)]).toarray()
bow2 = vectorizer.transform([preprocess("man play music keyboard here", sw)]).toarray()

# bow1 = bow1 / np.linalg.norm(bow1, axis=1)[:, None]
# bow2 = bow2 / np.linalg.norm(bow2, axis=1)[:, None]

# Store the scalar under its own name: `y_naiv` holds the per-pair test-set
# scores that the scatter plot and the analysis DataFrame further down still
# read, and overwriting it here breaks a top-to-bottom run.
sim_raw_example = (bow1[:, None, ...] @ bow2[..., None]).squeeze()
sim_raw_example

array(3)

In [36]:
# Inspect the L2 norm of the first example's count vector, as a column.
bow1 = vectorizer.transform([preprocess("man play keyboard piano", sw)]).toarray()
# bow2 is rebuilt as well but is not used by the expression below.
bow2 = vectorizer.transform([preprocess("man play music keyboard here", sw)]).toarray()
np.linalg.norm(bow1, axis=1).reshape(-1, 1)

array([[2.]])

In [None]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"using device: {device}")

# load sbert model
sbert = SentenceTransformer("stsb-mpnet-base-v2")

# compute sentence embeddings with sbert
# encode() is documented for a string or a list of strings. Passing the pandas
# Series directly makes it index the Series by integer label, which only
# works while the index happens to be 0..n-1, so hand over plain lists.
emb_1 = sbert.encode(test_data.s1.tolist(), convert_to_tensor=True, device=device)
emb_2 = sbert.encode(test_data.s2.tolist(), convert_to_tensor=True, device=device)

# compute similarity scores via cosine similarity:
# unit-normalise, then take the row-wise dot product -> shape (n, 1)
emb_1 = F.normalize(emb_1)
emb_2 = F.normalize(emb_2)
y_pred_sbert = (emb_1.unsqueeze(1) @ emb_2.unsqueeze(2)).squeeze(1).cpu().numpy()
# scale scores to [0, 1] (monotone, so the rank correlation below is unaffected)
y_pred_sbert = MinMaxScaler().fit_transform(y_pred_sbert).squeeze()

# NOTE: the score is computed on the test split although the label says "val".
spearman_val = spearmanr(y_pred_sbert, y_test)[0]
print(f"S-BERT SpearmanRank-val: {spearman_val:.4f}")

In [None]:
# Predicted similarity against gold score for both baselines, side by side.
# The figure is bound to `fig` instead of `_`, which IPython uses for the last
# cell output, and both panels get axis labels so the figure reads on its own.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
ax[0].scatter(y_naiv, y_test)
ax[0].set_title("Naive")
ax[0].set_xlabel("bag-of-words cosine similarity (y_naiv)")
ax[0].set_ylabel("gold score (y_test)")
ax[1].scatter(y_pred_sbert, y_test)
ax[1].set_title("S-BERT")
ax[1].set_xlabel("scaled S-BERT cosine similarity (y_pred_sbert)")
ax[1].set_ylabel("gold score (y_test)")
plt.show()

In [20]:
# Predict test scores with whichever regressor was last assigned to `model`
# (the KNN or the random-forest cell above, depending on execution order).
y_pred = model.predict(X_test)
# Rank correlation with the gold test scores; labelled "val" in the output.
spearman_val = spearmanr(y_pred, y_test)[0]
print(f"Topic Modeling SpearmanRank-val: {spearman_val:.4f}")

Topic Modeling SpearmanRank-val: 0.2019


In [21]:
# Per-pair comparison table: raw and preprocessed sentences, gold score, the
# regressor prediction and the bag-of-words cosine score.
# preprocess is called with its default second argument here, whereas the
# bag-of-words cell passes `sw` explicitly (TODO confirm the two agree).
df = test_data[["s1", "s2"]].assign(
    s1_processed=lambda frame: frame.s1.apply(preprocess),
    s2_processed=lambda frame: frame.s2.apply(preprocess),
    y_true=y_test,
    y_pred=y_pred,
    y_naiv=y_naiv,
)
# df[["root", "nsubj", "dobj"]] = [pd.Series(s) for s in syntax_test]

In [22]:
# Pairs where the bag-of-words score is off from the gold score by more than 0.4.
naive_error = (df.y_true - df.y_naiv).abs()
df[naive_error > 0.4]

Unnamed: 0,s1,s2,s1_processed,s2_processed,y_true,y_pred,y_naiv
25,A man is dancing.,A man and woman is dancing.,man danc,man woman danc,0.4,0.462516,0.871299
70,A skunk is looking here and there.,A skunk looks at the camera.,skunk look,skunk look camera,0.48,0.512813,0.886614
104,An oriental lady is cutting a carrot into thin...,A woman is slicing a carrot.,orient ladi cut carrot thin piec,woman slice carrot,0.72,0.512813,0.304851
117,A man is cycling.,A boy is riding a bicycle.,man cycl,boy ride bicycl,0.5,0.512813,0.0
142,A woman is climbing a cliff.,A woman is climbing a rock face.,woman climb cliff,woman climb rock face,0.9,0.462516,0.455806
158,A man is playing a musical keyboard.,A man is playing a keyboard piano.,man play music keyboard,man play keyboard piano,1.0,0.512813,0.598335
172,A woman is dicing some peeled potatoes cut int...,A woman is chopping a peeled potato into slices.,woman dice peel potato cut thick strip,woman chop peel potato slice,0.8,0.462516,0.387875
174,A cat cleans itself.,A cat is licking itself.,cat clean,cat lick,0.76,0.462516,0.316246
252,A man sleeps with a baby in his lap.,A man asleep in a chair holding a baby.,man sleep babi lap,man asleep chair hold babi,0.8,0.462516,0.246742
265,Elderly woman sitting on red patterned couch w...,Two women sitting on couch posing for camera.,elder woman sit red pattern couch arm around y...,two women sit couch pose camera,0.72,0.512813,0.273072


In [None]:
# "ad" in get_topic_probs.vectorizer.get_feature_names()

### Naive cosine similarity

In [None]:
# import spacy
# nlp = spacy.load("en_core_web_lg", exclude=["ner"])

In [None]:
# df["tok2vec"] = df[["s1", "s2"]].apply(lambda row: nlp(row.s1).similarity(nlp(row.s2)), axis=1)
# print(f"SpearmanRank-val: {spearmanr(df.tok2vec, df.y_true)[0]:.4f}")

In [None]:
# doc = nlp("Blue and red plane in mid-air flight.")
# print("\t".join([token.dep_ for token in doc]))
# print("\t".join([token.lemma_ for token in doc]))