In [1]:
from pathlib import Path
import pandas as pd

from datasets import load_dataset
# Take shard 0 of 9 from the "train" split of the `en` configuration and
# convert it to a pandas DataFrame.
df_train = (
    load_dataset("stsb_multi_mt", name="en", split="train")
    .shard(num_shards=9, index=0)
    .to_pandas()
)
# Same for the "test" split, keeping shard 0 of 20.
df_test = (
    load_dataset("stsb_multi_mt", name="en", split="test")
    .shard(num_shards=20, index=0)
    .to_pandas()
)

Reusing dataset stsb_multi_mt (/root/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
Reusing dataset stsb_multi_mt (/root/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)


In [2]:
# Only the value of a cell's final expression is echoed, so this first
# length is computed but does not appear in the cell output.
len(df_test)
# Row count of the training subset; this is the single number displayed.
len(df_train)

639

In [3]:
import spacy 
import string

# Load the `en_core_web_sm` pipeline with the parser and NER components disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# spaCy's English stop-word collection; tokens found in it are dropped by
# `spacy_tokenizer`.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# NOTE: this is a plain `str`, so `token in punctuations` is a substring test
# (which is why a multi-character entry such as '...' can be matched).
punctuations = string.punctuation + '...¡¿'

def spacy_tokenizer(sentence):
    """Turn ``sentence`` into a list of normalized, filtered token strings.

    The sentence is run through the module-level ``nlp`` pipeline. Each token
    is replaced by its lower-cased, stripped lemma, except when the lemma is
    the ``"-PRON-"`` placeholder, in which case the token's lower-cased
    surface form is used instead. Tokens that are stop words, or that occur
    inside the ``punctuations`` string, are discarded.

    Args:
        sentence: Text to tokenize.

    Returns:
        list[str]: The surviving normalized tokens, in document order.
    """
    kept = []
    for token in nlp(sentence):
        if token.lemma_ == "-PRON-":
            normalized = token.lower_
        else:
            normalized = token.lemma_.lower().strip()

        # `punctuations` is a str, so the second test is a substring check.
        if normalized in stop_words or normalized in punctuations:
            continue
        kept.append(normalized)
    return kept

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# A custom tokenizer is supplied, so the default `token_pattern` regex is never
# used. Passing None states that explicitly and avoids scikit-learn's
# "'token_pattern' will not be used since 'tokenizer' is not None" warning.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=spacy_tokenizer,
    token_pattern=None,
)
# Fit the vocabulary and IDF weights on both sentence columns of the training
# subset (test sentences are deliberately not included in the fit).
vectorizer = vectorizer.fit(df_train['sentence1'].to_list() + df_train['sentence2'].to_list())



In [5]:
def dataset_preprocessing(dataset):
    """Attach TF-IDF vectors for both sentence columns to ``dataset`` in place.

    For every row, ``sentence1`` and ``sentence2`` are each passed, one text
    at a time, through the module-level ``vectorizer`` and the result of
    ``vectorizer.transform([text])`` is stored in ``lemm_sentence1`` /
    ``lemm_sentence2`` respectively.

    Args:
        dataset: DataFrame with ``sentence1`` and ``sentence2`` columns. It is
            modified in place.

    Returns:
        None.
    """
    def _vectorize(text):
        # Wrap in a list: `transform` takes an iterable of documents.
        return vectorizer.transform([text])

    column_pairs = (
        ('sentence1', 'lemm_sentence1'),
        ('sentence2', 'lemm_sentence2'),
    )
    for source, target in column_pairs:
        dataset[target] = dataset[source].map(_vectorize)

In [6]:
# Add the vectorized sentence columns to both frames; each frame is mutated
# in place by `dataset_preprocessing`.
for frame in (df_train, df_test):
    dataset_preprocessing(frame)

In [7]:
from sklearn.svm import SVR
from scipy.sparse import vstack, hstack

# Support vector regressor with an RBF kernel and C=100.
svr = SVR(kernel='rbf', C=100)

# One feature row per sentence pair: the two stored sentence vectors are
# concatenated side by side, then all rows are stacked into a single matrix.
X = vstack([
    hstack([first, second])
    for first, second in zip(df_train['lemm_sentence1'], df_train['lemm_sentence2'])
])

# Regress the gold similarity score on the paired sentence vectors.
svr = svr.fit(X, df_train['similarity_score'])

In [8]:
# Build the test feature matrix the same way as the training one: concatenate
# each pair's two sentence vectors, then stack the rows.
X_test = vstack([
    hstack([first, second])
    for first, second in zip(df_test['lemm_sentence1'], df_test['lemm_sentence2'])
])

In [9]:
# NOTE: despite the `_train` suffix, these are the model's predictions for the
# test feature matrix `X_test`; the name is kept because the evaluation cell
# reads it.
predictions_train = svr.predict(X_test)

In [10]:
import scipy.stats as stats

# Gold similarity scores of the test subset, materialized once for both metrics.
gold_scores = df_test['similarity_score'].to_list()

# Linear (Pearson) correlation between predictions and gold scores, with its p-value.
pearson_corr, ppvalue = stats.pearsonr(predictions_train, gold_scores)
print("Pearson correlation:", pearson_corr, ppvalue)

# Rank (Spearman) correlation between predictions and gold scores, with its p-value.
spearman_corr, spvalue = stats.spearmanr(predictions_train, gold_scores)
print("Spearman correlation:", spearman_corr, spvalue)

Pearson correlation: 0.338355430030839 0.004460210633697859
Spearman correlation: 0.284726265427442 0.017729800801653713
