# Random Forest

### Requirements

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

from utils import (CoreXProbsFactory, LDAProbs, SyntaxFactory, preprocess,
                   tokenize)

def resolve_dir(env_var: str, default: str) -> Path:
    """Return the directory named by ``env_var``, or ``default`` if it is unset or empty."""
    return Path(os.environ.get(env_var) or default)


# Locations of the datasets and trained models. The previous absolute paths are
# kept as defaults; set ISER_DATA_DIR / ISER_MODEL_DIR to run on another machine.
data_dir = resolve_dir("ISER_DATA_DIR", "/home/iailab36/iser/data")
model_dir = resolve_dir("ISER_MODEL_DIR", "/home/iailab36/iser/models")

SEED = 1337  # seed used for numpy and the sklearn estimators in this notebook
COREX_HIDDEN = 64  # appears in the CorEx / LDA model names
VEC_FEAT = 10_000  # appears in the vectorizer file name

In [2]:
# Read the STS benchmark splits from feather files.
sts_dir = data_dir / "stsbenchmark"
train_data = pd.read_feather(sts_dir / "sts-train-sbert.feather")
val_data = pd.read_feather(sts_dir / "sts-dev-sbert.feather")
test_data = pd.read_feather(sts_dir / "sts-test.feather")

# Fold the dev split into the training frame and renumber the rows 0..n-1.
train_data = pd.concat([train_data, val_data], ignore_index=True)

FileNotFoundError: [Errno 2] No such file or directory: '/home/iailab36/iser/data/stsbenchmark/sts-train-sbert.feather'

In [None]:
# Display the training frame (train and dev splits concatenated in the loading cell).
train_data

In [None]:
def random_forest(SEED):
    """Build a 100-tree random forest regressor (squared-error criterion) seeded with ``SEED``."""
    return RandomForestRegressor(criterion="squared_error", n_estimators=100, random_state=SEED)


def decision_tree(SEED):
    """Build a decision tree regressor limited to depth 5, seeded with ``SEED``."""
    return DecisionTreeRegressor(random_state=SEED, max_depth=5)


def mlp(SEED):
    """Build an MLP regressor with hidden layers of 512 and 256 units, seeded with ``SEED``."""
    # Pass the layer sizes by keyword so the call does not depend on the
    # positional order of MLPRegressor's parameters.
    return MLPRegressor(hidden_layer_sizes=(512, 256), random_state=SEED)

## Pre-compute features

In [None]:
# Accumulators for feature matrices: one array per feature family is appended
# to each list, and the lists are concatenated column-wise before training.
# NOTE: despite its name, `features_val` collects the *test* split features.
features_train, features_val = [], []

### topic model features

In [None]:
# Topic-probability extractor, configured with the saved vectorizer path and
# the CorEx model name.
vectorizer_path = model_dir / f"sts_vec={VEC_FEAT}"
corex_name = f"corex_n_hidden={COREX_HIDDEN}_iter=7"
get_topic_probs = CoreXProbsFactory(vectorizer_path=vectorizer_path, corex_name=corex_name)

# Alternative LDA-based extractor, currently disabled:
# get_topic_probs = LDAProbs(model_dir / f"sts_lda_hidden={COREX_HIDDEN}")

In [None]:
# Topic probabilities for each sentence column (train pairs first, then test pairs).
topic_probs_train_1, topic_probs_train_2 = (
    get_topic_probs(sentences) for sentences in (train_data.s1, train_data.s2)
)
topic_probs_test_1, topic_probs_test_2 = (
    get_topic_probs(sentences) for sentences in (test_data.s1, test_data.s2)
)

# One row per sentence pair: [topics(s1) | topics(s2)].
topic_probs_train = np.concatenate((topic_probs_train_1, topic_probs_train_2), axis=1)
topic_probs_test = np.concatenate((topic_probs_test_1, topic_probs_test_2), axis=1)

# Register with the feature lists. Re-running this cell appends a second copy,
# so re-create the lists first when re-executing.
features_train.append(topic_probs_train)
features_val.append(topic_probs_test)

### syntax features

In [None]:
# get_syntax_deps = SyntaxFactory()
# # compute syntax tokens
# syntax_train_1 = get_syntax_deps(train_data.s1)
# syntax_train_2 = get_syntax_deps(train_data.s2)
# syntax_test_1 = get_syntax_deps(test_data.s1)
# syntax_test_2 = get_syntax_deps(test_data.s2)
# # mask matching syntax
# syntax_train = (syntax_train_1 == syntax_train_2).astype(int)
# syntax_test = (syntax_test_1 == syntax_test_2).astype(int)
# # append to features list
# features_train.append(syntax_train)
# features_val.append(syntax_test)

## Training without data augmentation

In [None]:
# Stack every collected feature family column-wise into one matrix per split.
X_train, X_test = (
    np.concatenate(parts, axis=1) for parts in (features_train, features_val)
)
# Targets are the `score` columns of the respective frames.
y_train, y_test = train_data.score, test_data.score
print("X_train:", X_train.shape)

In [None]:
# Shuffle the training rows with a fixed seed before fitting.
np.random.seed(SEED)
perm = np.random.permutation(X_train.shape[0])
X_train_ = X_train[perm]
# `perm` holds row positions, so index the target Series positionally; plain
# `y_train[perm]` is label-based and only correct when the index is 0..n-1.
y_train_ = y_train.iloc[perm]

model = random_forest(SEED)
model.fit(X_train_, y_train_)

# Evaluate with the Spearman rank correlation on the unshuffled train and test data.
spearman_train = spearmanr(model.predict(X_train), y_train)[0]
spearman_test = spearmanr(model.predict(X_test), y_test)[0]

print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-test: {spearman_test:.4f}")

## Training with data augmentation

In [None]:
# Read the augmentation frame and start its own feature list.
aug_path = data_dir.joinpath("stsbenchmark", "df_augment.feather")
aug_data = pd.read_feather(aug_path)

features_aug = []

In [None]:
# Topic features of the augmented pairs: reuse the training topic vectors,
# selecting rows through the idx1 / idx2 columns of the augmentation frame.
aug_topics_s1 = topic_probs_train_1[aug_data.idx1]
aug_topics_s2 = topic_probs_train_2[aug_data.idx2]
topic_probs_augmented = np.concatenate((aug_topics_s1, aug_topics_s2), axis=1)
features_aug.append(topic_probs_augmented)

In [None]:
# # syntax features
# syntax_aug = (syntax_train_1[aug_data.idx1] == syntax_train_2[aug_data.idx2]).astype(int)
# features_aug.append(syntax_aug)

In [None]:
# Inputs and targets of the augmented dataset.
X_aug = np.concatenate(features_aug, axis=1)
y_aug = aug_data.score
print(f"#augmented: {y_aug.shape[0]}")

# Append the augmented samples after the original training samples.
X_train_w_aug, y_train_w_aug = (
    np.concatenate(parts, axis=0) for parts in ((X_train, X_aug), (y_train, y_aug))
)
print(f"#(train+augmented): {y_train_w_aug.shape[0]}")

In [None]:
# np.random.seed(SEED)
# perm = np.random.permutation(X_train_w_aug.shape[0])
# X_train_w_aug_ = X_train_w_aug[perm]
# y_train_w_aug_ = y_train_w_aug[perm]

# model = random_forest(SEED)
# model.fit(X_train_w_aug_, y_train_w_aug_)

# # evaluate model
# spearman_train = spearmanr(model.predict(X_train), y_train)[0]
# spearman_test = spearmanr(model.predict(X_test), y_test)[0]

# print(f"SpearmanRank-train: {spearman_train:.4f},\t SpearmanRank-test: {spearman_test:.4f}")

## Qualitative analysis

In [None]:
# # unit vectors
# x1 = topic_probs_train_1 / np.linalg.norm(topic_probs_train_1, axis=1)[:, None]
# x2 = topic_probs_train_2 / np.linalg.norm(topic_probs_train_2, axis=1)[:, None]

# y_naiv = (x1[:, None, ...] @ x2[..., None]).squeeze()

In [None]:
# from scipy.stats import wasserstein_distance
# from sklearn.preprocessing import MinMaxScaler

# # softmax
# x1 = np.exp(topic_probs_train_1)/np.exp(topic_probs_train_1).sum(0)
# x2 = np.exp(topic_probs_train_2)/np.exp(topic_probs_train_2).sum(0)

# dists = np.array([wasserstein_distance(x1[i], x2[i]) for i in range(x1.shape[0])])

# mms = MinMaxScaler()
# y_naiv = mms.fit_transform(dists.reshape(-1, 1)).squeeze()

In [3]:
from sklearn.model_selection import train_test_split

# Load the QQP pairs and make a stratified 80/20 split over row positions.
df_qqp = pd.read_feather(data_dir / "qqp.feather")
indices = np.arange(len(df_qqp))
# A fixed random_state makes the split, and every number derived from it,
# reproducible across kernel restarts.
train_indices, test_indices = train_test_split(
    indices,
    stratify=df_qqp.score,
    test_size=0.2,
    train_size=0.8,
    random_state=SEED,
)

# NOTE: this rebinds `train_data` / `test_data`, replacing the STS frames loaded earlier.
train_data = df_qqp.iloc[train_indices]
test_data = df_qqp.iloc[test_indices]

y_test = test_data.score

print(len(df_qqp))
print(len(train_data))
print(len(test_data))

404287
323429
80858


In [10]:
# Look up the s1 text(s) paired with one specific s2 question.
is_target_s2 = df_qqp.s2 == "What are different fields for computer science?"
df_qqp.loc[is_target_s2, "s1"].values

array(['What are different fields in computer science and which is better?'],
      dtype=object)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tqdm.notebook import tqdm

sw = True
tqdm.pandas()  # provides the `progress_apply` used below

# Fit the bag-of-words vocabulary on both preprocessed training columns.
train_corpus = pd.concat([train_data.s1, train_data.s2]).progress_apply(preprocess)
vectorizer = CountVectorizer()
vectorizer.fit(train_corpus)

# Vectorize each preprocessed test column with that vocabulary.
bow1, bow2 = (
    vectorizer.transform(column.progress_apply(preprocess))
    for column in (test_data.s1, test_data.s2)
)

  0%|          | 0/646858 [00:00<?, ?it/s]

  0%|          | 0/80858 [00:00<?, ?it/s]

  0%|          | 0/80858 [00:00<?, ?it/s]

In [28]:
from sklearn.preprocessing import normalize
import scipy.sparse as ss

def l2_normalize_dense(matrix) -> np.ndarray:
    """Return ``matrix`` with rows scaled by ``normalize`` as a dense ndarray.

    Accepts a scipy sparse matrix or a dense array, so this cell can be re-run
    after ``bow1`` / ``bow2`` have already been densified (calling ``.toarray()``
    on an ndarray would raise ``AttributeError``).
    """
    unit_rows = normalize(matrix, axis=1)
    return unit_rows.toarray() if ss.issparse(unit_rows) else np.asarray(unit_rows)


# The results are dense ndarrays, not csr_matrix objects.
bow1: np.ndarray = l2_normalize_dense(bow1)
bow2: np.ndarray = l2_normalize_dense(bow2)

In [29]:
# Row-wise dot product of the two matrices via a batched matmul: a singleton
# axis is inserted after the first axis of bow1 and at the end of bow2, and the
# singleton axes of the product are squeezed away afterwards.
lhs_rows = bow1[:, None, ...]
rhs_cols = bow2[..., None]
score = np.matmul(lhs_rows, rhs_cols).squeeze()

In [30]:
# Binarize the similarity score at a fixed cut-off.
SIMILARITY_THRESHOLD = 0.845
y_naiv = (score > SIMILARITY_THRESHOLD).astype(int)

# Fraction of test labels that the thresholded score reproduces.
n_matches = (y_test == y_naiv).sum()
n_matches / y_test.shape[0]

0.6287194835390437

In [7]:
# Frame for inspection: raw and preprocessed test sentences next to the true
# labels and the thresholded baseline predictions.
df = test_data[["s1", "s2"]].assign(
    s1_processed=lambda frame: frame.s1.apply(preprocess),
    s2_processed=lambda frame: frame.s2.apply(preprocess),
    y_true=y_test,
    # y_pred=y_pred,
    y_naiv=y_naiv,
)
# df[["root", "nsubj", "dobj"]] = [pd.Series(s) for s in syntax_test]

In [8]:
# Rows where the true label and the baseline prediction differ by more than 0.4.
label_gap = (df.y_true - df.y_naiv).abs()
df[label_gap > 0.4]

Unnamed: 0,s1,s2,s1_processed,s2_processed,y_true,y_naiv
166532,Why are people on this site so obsessed with IQ?,Why are most Quora users so obsessed with ques...,whi are peopl on this site so obsess with iq,whi are most quora user so obsess with questio...,1,0
307615,Can your soul cause your body to explode?,Is it true that the soul can cause the body to...,can your soul caus your bodi to explod,is it true that the soul can caus the bodi to ...,1,0
171311,What are different fields in computer science ...,What are different fields for computer science?,what are differ field in comput scienc and whi...,what are differ field for comput scienc,1,0
242855,How does it feel to have an IITian girlfriend?,How does it feel to have an IITian as a girlfr...,how doe it feel to have an iitian girlfriend,how doe it feel to have an iitian as a girlfriend,0,1
187843,Have you ever had sex with your best friend?,Did you have sex with your best friend?,have you ever had sex with your best friend,did you have sex with your best friend,1,0
...,...,...,...,...,...,...
352489,What are career option and job opportunities f...,How are job opportunities in Germany for an In...,what are career option and job opportun for me...,how are job opportun in germani for an indian ...,1,0
227989,Which is the best novel that you have ever rea...,Novels: What are some of the best novels that ...,which is the best novel that you have ever rea...,novel : what are some of the best novel that y...,1,0
22684,"From your perspective, what is the purpose of ...",What could be the basic purpose of life?,from your perspect what is the purpos of life,what could be the basic purpos of life,1,0
111516,Is nitrogen good for tires?,What are some of the advantages of filling nit...,is nitrogen good for tire,what are some of the advantag of fill nitrogen...,1,0


In [None]:
# "ad" in get_topic_probs.vectorizer.get_feature_names()

### Naive cosine similarity

In [None]:
# import spacy
# nlp = spacy.load("en_core_web_lg", exclude=["ner"])

In [None]:
# df["tok2vec"] = df[["s1", "s2"]].apply(lambda row: nlp(row.s1).similarity(nlp(row.s2)), axis=1)
# print(f"SpearmanRank-val: {spearmanr(df.tok2vec, df.y_true)[0]:.4f}")

In [None]:
# doc = nlp("Blue and red plane in mid-air flight.")
# print("\t".join([token.dep_ for token in doc]))
# print("\t".join([token.lemma_ for token in doc]))