# Evaluation based on SVM with BOW and TF-IDF

### Loading the train, validation and test sets

In [1]:
from pathlib import Path
import pandas as pd

# Read the train / validation / test splits (tab-separated files under ../sts)
# into one DataFrame each, in that order.
df_train, df_valid, df_test = (
    pd.read_table(split_path)
    for split_path in (
        Path('../sts/train.tsv'),
        Path('../sts/valid.tsv'),
        Path('../sts/test.tsv'),
    )
)

### Loading the spaCy model and implementing the tokenizer

In [2]:
import spacy 
import string

# Import the Spanish stop-word set explicitly. Reaching it through the attribute
# chain `spacy.lang.es.stop_words` only works once something else has already
# imported that submodule, which is an import side effect we should not rely on.
from spacy.lang.es.stop_words import STOP_WORDS

# Spanish pipeline with the parser and NER components switched off.
nlp = spacy.load("es_core_news_sm", disable = ['parser', 'ner'])

# NOTE: this is a plain string, so `token in punctuations` is a *substring*
# test: it matches single punctuation characters, the literal '...', and also
# the empty string.
punctuations = string.punctuation + '...¡¿'
stop_words = STOP_WORDS

def spacy_tokenizer(sentence):
    """Turn a sentence into a list of normalised tokens.

    Every token produced by ``nlp`` is mapped to its lower-cased, stripped
    lemma, or to its lower-cased surface form when the lemma is the
    ``"-PRON-"`` placeholder. Entries found in ``stop_words`` or in
    ``punctuations`` are then discarded.

    Note: the ``punctuations`` check uses ``in``; when ``punctuations`` is a
    string this is a substring test, which also discards empty strings.

    Parameters
    ----------
    sentence : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        The surviving normalised tokens, in their original order.
    """
    kept = []
    for token in nlp(sentence):
        if token.lemma_ == "-PRON-":
            text = token.lower_
        else:
            text = token.lemma_.lower().strip()
        # Guard clause: skip stop words and punctuation.
        if text in stop_words or text in punctuations:
            continue
        kept.append(text)
    return kept

### Fitting Bag of Words and TF-IDF on the training set

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Training corpus: every sentence1 followed by every sentence2 of the training split.
train_corpus = [*df_train['sentence1'], *df_train['sentence2']]

# Word-level TF-IDF model that delegates tokenisation to `spacy_tokenizer`,
# fitted on the training corpus only.
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, analyzer='word').fit(train_corpus)



### Transforming the training and test sets

In [4]:

def dataset_preprocessing(dataset):
    """Attach the vector of every sentence to ``dataset`` (modified in place).

    For ``sentence1`` and ``sentence2`` a new column (``lemm_sentence1`` /
    ``lemm_sentence2``) is added whose cells each hold the one-row matrix that
    the module-level ``vectorizer`` produces for that sentence.

    Each column is vectorised with a single ``transform`` call and the result
    is then split into rows, instead of calling ``transform`` once per
    sentence.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``sentence1`` and ``sentence2`` columns.

    Returns
    -------
    None
    """
    for source, target in (('sentence1', 'lemm_sentence1'),
                           ('sentence2', 'lemm_sentence2')):
        matrix = vectorizer.transform(dataset[source])
        # Slice (rather than index) so that every cell stays a 2-D, one-row matrix,
        # exactly like the result of transforming a single sentence.
        dataset[target] = [matrix[i:i + 1] for i in range(matrix.shape[0])]

In [5]:
# Vectorise the sentence pairs of the training and test splits (in place).
for split_frame in (df_train, df_test):
    dataset_preprocessing(split_frame)

### Training the SVM regressor (SVR)

In [6]:
from sklearn.svm import SVR
from scipy.sparse import vstack, hstack

# One feature row per pair: the vector of sentence1 followed by the vector of sentence2.
train_pair_rows = [
    hstack([left_vec, right_vec])
    for left_vec, right_vec in zip(df_train['lemm_sentence1'], df_train['lemm_sentence2'])
]
X_train = vstack(train_pair_rows)
Y_train = df_train['label']

# RBF-kernel support vector regressor fitted on the training pairs.
svr = SVR(C=100, kernel='rbf').fit(X_train, Y_train)

### Predictions on the test set

In [7]:
# Same feature layout as the training matrix: sentence1 vector followed by sentence2 vector.
test_pair_rows = [
    hstack([left_vec, right_vec])
    for left_vec, right_vec in zip(df_test['lemm_sentence1'], df_test['lemm_sentence2'])
]
X_test = vstack(test_pair_rows)

In [8]:
# Predict a label for every row of the test feature matrix with the fitted regressor.
predictions = svr.predict(X_test)

### Evaluation metrics

In [9]:
import scipy.stats as stats

# Gold labels of the test split, extracted once and shared by both metrics.
gold_labels = df_test.label.to_list()

# Pearson correlation (coefficient, p-value) between predictions and gold labels.
pearson_corr, ppvalue = stats.pearsonr(predictions, gold_labels)
# Spearman rank correlation (coefficient, p-value) on the same data.
spearman_corr, spvalue = stats.spearmanr(predictions, gold_labels)

print("Pearson correlation:", pearson_corr, ppvalue)
print("Spearman correlation:", spearman_corr, spvalue)

Pearson correlation: 0.8578232202724745 9.448269909510543e-21
Spearman correlation: 0.8657416550643943 1.6262502575553458e-21


In [None]:
# Results of another run (pasted output, kept for reference):
# Pearson correlation: 0.8579662484049169 9.161386974495875e-21
# Spearman correlation: 0.8683865482175525 8.811753257446161e-22