# <span style="font-family:Courier New; color:#CCCCCC">**Text Similarity**</span>

## <span style="font-family:Courier New; color:#336666">**Load Data and Imports**</span>

In [None]:
"""
%pip install datasets
%pip install 
%pip install -U spacy
!python3 -m spacy download ca_core_news_md
!python3 -m spacy download ca_core_news_trf
%pip install spacy-transformers
%pip install scipy
%pip install tensorflow
%pip install transformers
%pip install pandas
"""

In [None]:
#Requisites
import os
from typing import Tuple, List

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from gensim.models import KeyedVectors,TfidfModel
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
import numpy as np
import spacy
import spacy_transformers
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from gensim.matutils import corpus2csc
import scipy
from scipy.stats import pearsonr,spearmanr
import tensorflow as tf
import pandas as pd

In [3]:
# DATA
# Load the "projecte-aina/sts-ca" dataset. The cells below read its "train",
# "validation" and "test" splits and the "sentence1", "sentence2" and "label" fields.
# NOTE(review): the recorded output warns that `trust_remote_code=True` will become
# mandatory for this dataset — confirm against the installed `datasets` version.
dataset = load_dataset("projecte-aina/sts-ca")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


## <span style="font-family:Courier New; color:#336666">**Preprocessing and dataframes creation**</span>

To preprocess the data for the non-contextual embedding models, we establish a stopword treatment and tokenise with `simple_preprocess`.

In [4]:
# Catalan stopwords (lowercase). `preprocess` drops every token found in this set
# when it is called with stop=True.
STOPWORDS_CAT ={
    "a", "abans", "algun", "alguna", "algunes", "alguns", "altre", "amb", "ambdós", "anar", 
    "ans", "aquell", "aquelles", "aquells", "aquí", "bastant", "bé", "cada", "com", 
    "consegueixo", "conseguim", "conseguir", "consigueix", "consigueixen", "consigueixes", 
    "dalt", "de", "des", "dins", "el", "elles", "ells", "els", "en", "ens", "entre", 
    "era", "erem", "eren", "eres", "es", "és", "éssent", "està", "estan", "estat", 
    "estava", "estem", "esteu", "estic", "ets", "fa", "faig", "fan", "fas", "fem", 
    "fer", "feu", "fi", "haver", "i", "inclòs", "jo", "la", "les", "llarg", "llavors", 
    "mentre", "meu", "mode", "molt", "molts", "nosaltres", "o", "on", "per", "però", 
    "perquè", "podem", "poden", "poder", "podeu", "potser", "primer", "puc", "quan", 
    "quant", "que", "què", "qui", "sabem", "saben", "saber", "sabeu", "sap", "saps", 
    "sense", "ser", "seu", "seus", "si", "soc", "solament", "sols", "som", "sota", 
    "també", "te", "tene", "tenim", "tenir", "teniu", "teu", "tinc", "tot", "últim", 
    "un", "una", "unes", "uns", "ús", "va", "vaig", "van", "vosaltres"
}

In [5]:
# Define preprocessing
def preprocess(sentence: str, stop:bool = True) -> List[str]:
    """Tokenise a sentence with `simple_preprocess`, optionally removing stopwords.

    Args:
        sentence: Raw sentence text.
        stop: When truthy, tokens contained in `STOPWORDS_CAT` are dropped.

    Returns:
        The list of tokens, in their original order.
    """
    tokens = simple_preprocess(sentence)
    if not stop:
        return tokens
    return [tok for tok in tokens if tok not in STOPWORDS_CAT]

### <span style="font-family:Courier New; color:#336633">**Dataset format creation**</span>

#### <span style="font-family:Courier New; color:#994C00">**Count-Vectorizer/TF-IDF**</span>

In [6]:
# One list of (sentence1, sentence2, label) tuples per dataset split.
input_pairs, input_pairs_val, input_pairs_test = (
    [(row["sentence1"], row["sentence2"], row["label"]) for row in dataset[split].to_list()]
    for split in ("train", "validation", "test")
)

In [7]:

# Pool the three splits: sentence preprocessing and dictionary creation use all of them.
# NOTE(review): the vocabulary (and the TF-IDF statistics derived from it) therefore
# also sees validation and test sentences — confirm this is intended.
all_input_pairs = [*input_pairs, *input_pairs_val, *input_pairs_test]

# Tokenised sentences with stopwords kept
sentences_1 = [preprocess(pair[0], stop=False) for pair in all_input_pairs]
sentences_2 = [preprocess(pair[1], stop=False) for pair in all_input_pairs]

# Tokenised sentences with stopwords removed
sentences_1_preproc = [preprocess(pair[0]) for pair in all_input_pairs]
sentences_2_preproc = [preprocess(pair[1]) for pair in all_input_pairs]

# Sentence pairs, aligned by position
sentence_pairs_preproc = [
    (first, second) for first, second in zip(sentences_1_preproc, sentences_2_preproc)
]
sentence_pairs = [(first, second) for first, second in zip(sentences_1, sentences_2)]

# Flattened versions: every first sentence followed by every second sentence
sentences_pairs_flattened_preproc = [*sentences_1_preproc, *sentences_2_preproc]
sentences_pairs_flattened = [*sentences_1, *sentences_2]

# Two vocabularies: one without stopwords, one built from the unfiltered tokens
dict_preproc = Dictionary(sentences_pairs_flattened_preproc)
dict_preproc_complete = Dictionary(sentences_pairs_flattened)

# Cap the dictionary used by the plain BOW / TF-IDF variants at 300 entries
# (in-place operation on dict_preproc).
dict_preproc.filter_extremes(keep_n=300)

In [8]:
# TF-IDF weights computed from the preprocessed (stopword-free) sentences.
# Both corpora are built from the same token lists; they differ only in the
# dictionary that maps tokens to ids (capped vs. complete).
corpus = list(map(dict_preproc.doc2bow, sentences_pairs_flattened_preproc))
corpus_complete = list(map(dict_preproc_complete.doc2bow, sentences_pairs_flattened_preproc))
model_tfidf, model_tfidf_complete = TfidfModel(corpus), TfidfModel(corpus_complete)

In [9]:
#Get count vector from dictionary
def map_to_bow(sentence: List[str], dictionary: Dictionary) -> np.ndarray:
    """Turn a tokenised sentence into a dense count vector.

    Args:
        sentence: List of tokens.
        dictionary: Object providing `doc2bow` and `len()`; the vector has
            `len(dictionary)` entries and is indexed by the ids `doc2bow` returns.

    Returns:
        A 1-D float array holding the count of every token id (0 where absent).
    """
    counts = np.zeros(len(dictionary))
    token_counts = list(dictionary.doc2bow(sentence))
    if token_counts:
        ids, freqs = zip(*token_counts)
        counts[list(ids)] = freqs
    return counts

In [10]:
def map_to_tf_idf(sentence: List[str], dictionary: Dictionary, tfidf: TfidfModel) -> np.ndarray:
    """Turn a tokenised sentence into a dense vector of TF-IDF weights.

    Args:
        sentence: List of tokens.
        dictionary: Object providing `doc2bow` and `len()`; fixes the vector length.
        tfidf: Model indexed with a bag-of-words (`tfidf[bow]`) that yields
            (token_id, weight) pairs.

    Returns:
        A 1-D float array with the weight of every token id (0 where absent).
    """
    weights = np.zeros(len(dictionary))
    weighted_ids = list(tfidf[dictionary.doc2bow(sentence)])
    if weighted_ids:
        ids, values = zip(*weighted_ids)
        weights[list(ids)] = values
    return weights

In [11]:
#Process all pairs
def bow_pairs(sentence_pairs: List[Tuple[str, str, float]], dictionary: Dictionary = None,tf:bool = False,model_tfidf:TfidfModel = None) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    """Vectorise every sentence pair with counts or TF-IDF weights.

    Args:
        sentence_pairs: (sentence1, sentence2, similarity) tuples of raw text.
        dictionary: Vocabulary handed to `map_to_bow` / `map_to_tf_idf`.
        tf: False selects count vectors; any other value selects TF-IDF vectors.
        model_tfidf: TF-IDF model; only used on the TF-IDF path.

    Returns:
        A list of ((vector1, vector2), similarity) entries, in input order.
    """
    # The equality test (rather than `not tf`) is kept so that non-bool flag
    # values select the same branch as before.
    if tf == False:
        def vectorize(tokens):
            return map_to_bow(tokens, dictionary)
    else:
        def vectorize(tokens):
            return map_to_tf_idf(tokens, dictionary, model_tfidf)

    return [
        ((vectorize(preprocess(first)), vectorize(preprocess(second))), score)
        for first, second, score in sentence_pairs
    ]

In [12]:
# Count-vector (BOW) representation of each split
df_bow_train, df_bow_val, df_bow_test = (
    bow_pairs(split_pairs, dict_preproc)
    for split_pairs in (input_pairs, input_pairs_val, input_pairs_test)
)

In [13]:
# TF-IDF representation of each split (same capped dictionary as BOW)
df_tfidf_train, df_tfidf_val, df_tfidf_test = (
    bow_pairs(split_pairs, dict_preproc, tf=True, model_tfidf=model_tfidf)
    for split_pairs in (input_pairs, input_pairs_val, input_pairs_test)
)

We removed stopwords for the TF-IDF and BOW models, so we expect the differences between TF-IDF and BOW to be small.

#### <span style="font-family:Courier New; color:#994C00">**Word2Vec/GloVe**</span>

##### <span style="font-family:Courier New; color:#994C00">**Load Vectors**</span>

We will use a pretrained Catalan Word2Vec Continuous Skip-gram model.

In [14]:
# Path to the pretrained Word2Vec binary. Set the WORD_EMBEDDING_FILE environment
# variable to point at your own copy; the original author's local path is kept as
# the fallback so existing setups keep working.
WORD_EMBEDDING_FILE = os.environ.get(
    "WORD_EMBEDDING_FILE",
    'C:/Users/Jordi/Desktop/Universitat/PLH/Models/cat_w2vec/model.bin',
)

In [15]:
# Load the pretrained word vectors (binary word2vec format) as a gensim KeyedVectors
# object; it is used below for `word in wv_model` checks and `wv_model[word]` lookups.
wv_model = KeyedVectors.load_word2vec_format(WORD_EMBEDDING_FILE, binary=True)

In [16]:
def map_tf_idf(sentence_preproc: List[str], dictionary: Dictionary, tf_idf_model: TfidfModel) -> Tuple[List[np.ndarray], List[float]]:
    """Collect the word vectors of a sentence together with their TF-IDF weights.

    Args:
        sentence_preproc: Tokenised sentence.
        dictionary: Vocabulary used to build the bag-of-words and to map ids back to words.
        tf_idf_model: Model indexed with the bag-of-words to obtain (id, weight) pairs.

    Returns:
        Two parallel lists (vectors, weights) covering only the words that are
        present in the global `wv_model`.
    """
    weighted_ids = tf_idf_model[dictionary.doc2bow(sentence_preproc)]
    weighted_words = [(dictionary.get(token_id), weight) for token_id, weight in weighted_ids]
    # Words without a pretrained vector are skipped, together with their weight.
    known = [(word, weight) for word, weight in weighted_words if word in wv_model]
    vectors = [wv_model[word] for word, _ in known]
    weights = [weight for _, weight in known]
    return vectors, weights

def map_pairs(
        sentence_pairs: List[Tuple[str, str, float]],
        dictionary: Dictionary = None,
        tf_idf_model: TfidfModel = None,
) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    """Map raw sentence pairs to pairs of pooled word-embedding vectors.

    Each sentence is preprocessed (stopwords removed) and its word vectors from
    the global `wv_model` are pooled into one vector: a TF-IDF-weighted average
    when `tf_idf_model` is given, a plain mean otherwise.

    Args:
        sentence_pairs: (sentence1, sentence2, similarity) tuples of raw text.
        dictionary: Vocabulary for the TF-IDF path; unused for the plain mean.
        tf_idf_model: TF-IDF model; None selects the plain mean.

    Returns:
        A list of ((vector1, vector2), similarity) entries, in input order.

    Raises:
        ValueError: If `tf_idf_model` is given without a `dictionary`.
    """
    if tf_idf_model is not None and dictionary is None:
        raise ValueError("a dictionary is required when tf_idf_model is provided")

    def _pool(vectors, weights=None):
        # A sentence with no in-vocabulary token used to make np.average raise
        # (or np.mean return a scalar NaN that cannot be stacked with the other
        # rows); use an all-zero vector of the embedding size instead.
        if len(vectors) == 0:
            return np.zeros(wv_model.vector_size, dtype=np.float32)
        # np.average raises ZeroDivisionError when the weights sum to zero.
        if weights is None or not np.any(weights):
            return np.mean(vectors, axis=0)
        return np.average(vectors, weights=weights, axis=0)

    def _embed(tokens):
        if tf_idf_model is not None:
            vectors, weights = map_tf_idf(tokens, dictionary=dictionary, tf_idf_model=tf_idf_model)
            return _pool(vectors, weights)
        return _pool([wv_model[word] for word in tokens if word in wv_model])

    pair_vectors = []
    for sentence_1, sentence_2, similarity in sentence_pairs:
        vector1 = _embed(preprocess(sentence_1))
        vector2 = _embed(preprocess(sentence_2))
        pair_vectors.append(((vector1, vector2), similarity))
    return pair_vectors

In [17]:
# Word2Vec vectors pooled with TF-IDF weights, one list per split
df_w2vec_tf_train, df_w2vec_tf_val, df_w2vec_tf_test = (
    map_pairs(split_pairs, dictionary=dict_preproc_complete, tf_idf_model=model_tfidf_complete)
    for split_pairs in (input_pairs, input_pairs_val, input_pairs_test)
)

In [18]:
# Word2Vec vectors pooled with a plain mean, one list per split
df_w2vec_train, df_w2vec_val, df_w2vec_test = (
    map_pairs(sentence_pairs=split_pairs, dictionary=dict_preproc_complete)
    for split_pairs in (input_pairs, input_pairs_val, input_pairs_test)
)

#### <span style="font-family:Courier New; color:#994C00">**spaCy**</span>

In [19]:
# Catalan spaCy pipeline 'ca_core_news_md'; `map_to_spacy` reads the `.vector`
# of each processed sentence from it.
nlp = spacy.load('ca_core_news_md') # Load catalan model

In [20]:
def map_to_spacy(sentence_pairs: List[Tuple[str, str, float]], nlp: spacy.language.Language) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    """Map raw sentence pairs to pairs of spaCy document vectors.

    Args:
        sentence_pairs: (sentence1, sentence2, similarity) tuples of raw text.
        nlp: spaCy pipeline; the `.vector` of each processed sentence is used.

    Returns:
        A list of ((vector1, vector2), similarity) entries, in input order.
        (The previous `np.ndarray` return annotation did not match this.)
    """
    return [
        ((nlp(first).vector, nlp(second).vector), similarity)
        for first, second, similarity in sentence_pairs
    ]
    

In [21]:
# spaCy document-vector representation of each split
# (despite the `df_` prefix these are plain lists of ((vec1, vec2), score) entries)
df_spacy_train, df_spacy_val, df_spacy_test = (
    map_to_spacy(split_pairs, nlp)
    for split_pairs in (input_pairs, input_pairs_val, input_pairs_test)
)

#### <span style="font-family:Courier New; color:#994C00">**RoBERTa CLS/Mitjana**</span>

In [22]:
# Catalan transformer pipeline 'ca_core_news_trf'; `map_transformer` reads
# `doc._.trf_data.last_hidden_layer_state` from the documents it produces.
nlp_r = spacy.load('ca_core_news_trf') # Catalan transformer model

In [23]:
def map_transformer(sentence_pairs: List[Tuple[str, str, float]], nlp: spacy.language.Language,cls: bool = True) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    """Map raw sentence pairs to pairs of transformer-based sentence vectors.

    Args:
        sentence_pairs: (sentence1, sentence2, similarity) tuples of raw text.
        nlp: spaCy transformer pipeline exposing
            `doc._.trf_data.last_hidden_layer_state.data`.
        cls: When truthy, use the first row of the last hidden layer as the
            sentence vector; otherwise average all rows after the first one.

    Returns:
        A list of ((vector1, vector2), similarity) entries, in input order.
    """
    def _embed(sentence):
        hidden = nlp(sentence)._.trf_data.last_hidden_layer_state.data
        if cls:
            # NOTE(review): assumes row 0 is the CLS / <s> piece — confirm that the
            # pipeline does not strip special pieces from this output.
            return hidden[0]
        # NOTE(review): rows 1.. may include a trailing end-of-sequence piece — confirm.
        return np.mean(hidden[1:], axis=0)

    return [
        ((_embed(first), _embed(second)), similarity)
        for first, second, similarity in sentence_pairs
    ]

In [24]:
# Transformer representation using the first-row (CLS) vector, one list per split
df_BERT_CLS, df_BERT_CLS_val, df_BERT_CLS_test = (
    map_transformer(split_pairs, nlp_r)
    for split_pairs in (input_pairs, input_pairs_val, input_pairs_test)
)

In [25]:
# Transformer representation using the mean of the remaining rows, one list per split
df_BERT_MEAN, df_BERT_MEAN_val, df_BERT_MEAN_test = (
    map_transformer(split_pairs, nlp_r, cls=False)
    for split_pairs in (input_pairs, input_pairs_val, input_pairs_test)
)

**Because the fine-tuned cased RoBERTa model returns a probability, we will show its results last, right after comparing the embedding representations of the other models.**

## <span style="font-family:Courier New; color:#336666">**Model creation**</span>

We will be using the example provided to us.


In [26]:
def build_and_compile_model_better(embedding_size: int = 300, learning_rate: float = 1e-3) -> tf.keras.Model:
    """Build and compile the two-input similarity regression model.

    Both inputs go through the same Dense projection, the L2-normalised
    projections are multiplied element-wise, and two Dense layers turn that
    product into a single score: a sigmoid output multiplied by 5.

    Args:
        embedding_size: Length of each input vector (also the projection width).
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The model, compiled with a mean-squared-error loss.
    """
    # Input layers for the two vectors of a pair
    input_1 = tf.keras.Input(shape=(embedding_size,))
    input_2 = tf.keras.Input(shape=(embedding_size,))

    # Shared projection layer: the same weights are applied to both inputs, and the
    # identity kernel / zero bias make it start out as a pass-through.
    first_projection = tf.keras.layers.Dense(
        embedding_size,
        kernel_initializer=tf.keras.initializers.Identity(),
        bias_initializer=tf.keras.initializers.Zeros(),
    )
    projected_1 =  first_projection(input_1)
    projected_2 = first_projection(input_2)
    
    # Element-wise product of the L2-normalised projections, wrapped in a Lambda layer.
    # The product is not summed here, so this yields one value per dimension rather
    # than a single cosine value.
    def normalized_product(x):
        x1, x2 = x
        x1_normalized = tf.keras.backend.l2_normalize(x1, axis=1)
        x2_normalized = tf.keras.backend.l2_normalize(x2, axis=1)
        return x1_normalized * x2_normalized

    output = tf.keras.layers.Lambda(normalized_product)([projected_1, projected_2])
    output = tf.keras.layers.Dropout(0.1)(output)
    output = tf.keras.layers.Dense(
        16,
        activation="relu",
    )(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(
        1,
        activation="sigmoid",
    )(output)
    
    # Rescale the sigmoid output by a factor of 5
    output = tf.keras.layers.Lambda(lambda x: x * 5)(output)

    # Assemble the model from the two inputs and the rescaled output
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)


    # Compile with MSE loss and Adam
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(learning_rate))
    return model

In [27]:
# Define training constants
batch_size: int = 64  # examples per batch in the tf.data pipelines below
num_epochs: int = 64  # upper bound passed to fit(); early stopping may end training sooner

In [28]:
# Define callbacks
# Stop when val_loss has not improved for 5 epochs and restore the best weights.
# NOTE(review): this single instance is passed to every fit() call below — confirm
# that sharing one callback object across models is intended.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  
    patience=5,          
    restore_best_weights=True  
    )

In [29]:
# Convert the pair lists into valid model input
def pair_list_to_x_y(pair_list: List[Tuple[Tuple[np.ndarray, np.ndarray], int]]) -> Tuple[Tuple[np.ndarray, np.ndarray], np.ndarray]:
    """Split ((vector1, vector2), score) entries into stacked arrays.

    Args:
        pair_list: Entries of the form ((vector1, vector2), score).

    Returns:
        ((first_vectors, second_vectors), scores): the two stacked vector arrays
        and a float32 array of scores.
    """
    vector_pairs, scores = zip(*pair_list)
    first_vectors, second_vectors = map(np.array, zip(*vector_pairs))
    targets = np.array(scores, dtype=np.float32)
    return (first_vectors, second_vectors), targets

## <span style="font-family:Courier New; color:#336666">**Model evaluation**</span>

In [30]:
# Summary table: one row per representation, one column per correlation metric.
# Starts empty and is filled row by row in the evaluation cells.
results_df = pd.DataFrame(
    index=[
        "BOW",
        "TF-IDF",
        "W2VEC+TF-IDF",
        "W2VEC+MEAN",
        "SPACY_MD",
        "RoBERTa + CLS",
        "RoBERTa + MEAN",
    ],
    columns=["Pearson_train", "Spearman_train", "Pearson_val", "Spearman_val"],
)

In [31]:
def compute_pearson_spearman(x_, y_,model):
    """Correlate a model's predictions with the gold scores.

    Args:
        x_: Inputs passed unchanged to `model.predict`.
        y_: Array of gold scores.
        model: Object exposing `predict`.

    Returns:
        (pearson, spearman) correlation coefficients between the flattened
        predictions and the flattened gold scores.
    """
    predictions = model.predict(x_)
    # Prints the largest predicted score.
    print(np.max(predictions))
    flat_predictions = predictions.flatten()
    flat_targets = y_.flatten()
    pearson_coef, _pearson_p = pearsonr(flat_predictions, flat_targets)
    spearman_coef, _spearman_p = spearmanr(flat_predictions, flat_targets)
    return pearson_coef, spearman_coef

#### <span style="font-family:Courier New; color:#994C00">**BOW**</span>

In [32]:
# Split into model inputs (x) and gold scores (y)
x_train_bow, y_train_bow = pair_list_to_x_y(df_bow_train)
x_val_bow, y_val_bow = pair_list_to_x_y(df_bow_val)

# Build the tf.data pipelines
train_bow = tf.data.Dataset.from_tensor_slices((x_train_bow, y_train_bow))
# x_train_bow is a 2-tuple of arrays, so len(x_train_bow) is always 2 and would give a
# shuffle buffer of 2; size the buffer by the number of examples instead.
train_bow = train_bow.shuffle(buffer_size=len(y_train_bow)).batch(batch_size)

val__bow = tf.data.Dataset.from_tensor_slices((x_val_bow, y_val_bow))
val__bow = val__bow.batch(batch_size)

In [33]:
# Build and compile the model with the default embedding_size=300; the BOW vectors
# have len(dict_preproc) entries and dict_preproc was capped with keep_n=300.
model_bow = build_and_compile_model_better()




In [34]:
# Train the model; `early_stopping` monitors val_loss and restores the best weights.
# fit() returns a History object, which is what this cell displays.
model_bow.fit(train_bow, epochs=num_epochs, validation_data=val__bow, callbacks=[early_stopping])

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.7350 - val_loss: 0.7245
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7256 - val_loss: 0.7210
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7138 - val_loss: 0.7174
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7041 - val_loss: 0.7140
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6900 - val_loss: 0.7087
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6648 - val_loss: 0.7039
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6293 - val_loss: 0.7000
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.5795 - val_loss: 0.6996
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x1f996d68b90>

In [35]:
# Evaluation: correlations on the training and validation splits, stored in the summary table
(pearson_train_bow, spearman_train_bow), (pearson_val_bow, spearman_val_bow) = (
    compute_pearson_spearman(inputs, targets, model_bow)
    for inputs, targets in ((x_train_bow, y_train_bow), (x_val_bow, y_val_bow))
)

results_df.loc["BOW"] = [
    pearson_train_bow,
    spearman_train_bow,
    pearson_val_bow,
    spearman_val_bow,
]

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
3.1999726
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
3.2412047


In [36]:
results_df

Unnamed: 0,Pearson_train,Spearman_train,Pearson_val,Spearman_val
BOW,0.617894,0.655386,0.204644,0.212015
TF-IDF,,,,
W2VEC+TF-IDF,,,,
W2VEC+MEAN,,,,
SPACY_MD,,,,
RoBERTa + CLS,,,,
RoBERTa + MEAN,,,,


#### <span style="font-family:Courier New; color:#994C00">**TF-IDF**</span>

In [37]:
# Split into model inputs (x) and gold scores (y)
x_train_tfidf, y_train_tfidf = pair_list_to_x_y(df_tfidf_train)
x_val_tfidf, y_val_tfidf = pair_list_to_x_y(df_tfidf_val)

# Build the tf.data pipelines
train_tfidf = tf.data.Dataset.from_tensor_slices((x_train_tfidf, y_train_tfidf))
# x_train_tfidf is a 2-tuple of arrays, so len(x_train_tfidf) is always 2 and would give
# a shuffle buffer of 2; size the buffer by the number of examples instead.
train_tfidf = train_tfidf.shuffle(buffer_size=len(y_train_tfidf)).batch(batch_size)

val_tfidf = tf.data.Dataset.from_tensor_slices((x_val_tfidf, y_val_tfidf))
val_tfidf = val_tfidf.batch(batch_size)

In [38]:
# Build and compile the model (default embedding_size=300, matching the capped dictionary).
# NOTE(review): this rebinds `model_tfidf`, which until here held the gensim TfidfModel
# used to build df_tfidf_*. Re-running the TF-IDF vectorisation cell after this one
# would pass the Keras model to bow_pairs; consider a distinct name for the Keras
# model here and in the cells that use it.
model_tfidf = build_and_compile_model_better()

In [39]:
# Train the model; `early_stopping` monitors val_loss and restores the best weights.
model_tfidf.fit(train_tfidf, epochs=num_epochs, validation_data=val_tfidf, callbacks=[early_stopping])

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.7382 - val_loss: 0.7207
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7235 - val_loss: 0.7181
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7176 - val_loss: 0.7162
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7081 - val_loss: 0.7144
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6908 - val_loss: 0.7106


<keras.src.callbacks.history.History at 0x1f99ffbf850>

In [40]:
# Evaluation: correlations on the training and validation splits, stored in the summary table
(pearson_train_tfidf, spearman_train_tfidf), (pearson_val_tfidf, spearman_val_tfidf) = (
    compute_pearson_spearman(inputs, targets, model_tfidf)
    for inputs, targets in ((x_train_tfidf, y_train_tfidf), (x_val_tfidf, y_val_tfidf))
)

results_df.loc["TF-IDF"] = [
    pearson_train_tfidf,
    spearman_train_tfidf,
    pearson_val_tfidf,
    spearman_val_tfidf,
]

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
2.6807377
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
2.686615


In [41]:
results_df

Unnamed: 0,Pearson_train,Spearman_train,Pearson_val,Spearman_val
BOW,0.617894,0.655386,0.204644,0.212015
TF-IDF,0.138521,0.170491,0.157639,0.137848
W2VEC+TF-IDF,,,,
W2VEC+MEAN,,,,
SPACY_MD,,,,
RoBERTa + CLS,,,,
RoBERTa + MEAN,,,,


#### <span style="font-family:Courier New; color:#994C00">**W2VEC + TF-IDF**</span>

In [42]:
# Split into model inputs (x) and gold scores (y)
x_train_w2vec_tf, y_train_w2vec_tf = pair_list_to_x_y(df_w2vec_tf_train)
x_val_w2vec_tf, y_val_w2vec_tf = pair_list_to_x_y(df_w2vec_tf_val)

# Build the tf.data pipelines
train_w2vec_tf = tf.data.Dataset.from_tensor_slices((x_train_w2vec_tf, y_train_w2vec_tf))
# x_train_w2vec_tf is a 2-tuple of arrays, so its len() is always 2 and would give a
# shuffle buffer of 2; size the buffer by the number of examples instead.
train_w2vec_tf = train_w2vec_tf.shuffle(buffer_size=len(y_train_w2vec_tf)).batch(batch_size)

val_w2vec_tf = tf.data.Dataset.from_tensor_slices((x_val_w2vec_tf, y_val_w2vec_tf))
val_w2vec_tf = val_w2vec_tf.batch(batch_size)

In [43]:
# Build and compile the model. embedding_size must equal the length of the pooled
# word vectors (presumably wv_model.vector_size == 100 — confirm against the loaded file).
model_w2vec_tf = build_and_compile_model_better(embedding_size=100)

In [44]:
# Train the model; `early_stopping` monitors val_loss and restores the best weights.
model_w2vec_tf.fit(train_w2vec_tf, epochs=num_epochs, validation_data=val_w2vec_tf, callbacks=[early_stopping])

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.7373 - val_loss: 0.7247
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7269 - val_loss: 0.7162
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7113 - val_loss: 0.7099
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6932 - val_loss: 0.7028
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6804 - val_loss: 0.6958
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6626 - val_loss: 0.6875
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6413 - val_loss: 0.6776
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6230 - val_loss: 0.6699
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x1f99ffde490>

In [45]:
# Evaluation: correlations on the training and validation splits, stored in the summary table
(pearson_train_w2vec_tf, spearman_train_w2vec_tf), (pearson_val_w2vec_tf, spearman_val_w2vec_tf) = (
    compute_pearson_spearman(inputs, targets, model_w2vec_tf)
    for inputs, targets in ((x_train_w2vec_tf, y_train_w2vec_tf), (x_val_w2vec_tf, y_val_w2vec_tf))
)

results_df.loc["W2VEC+TF-IDF"] = [
    pearson_train_w2vec_tf,
    spearman_train_w2vec_tf,
    pearson_val_w2vec_tf,
    spearman_val_w2vec_tf,
]

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
3.5313864
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 997us/step
3.3289063


In [46]:
results_df


Unnamed: 0,Pearson_train,Spearman_train,Pearson_val,Spearman_val
BOW,0.617894,0.655386,0.204644,0.212015
TF-IDF,0.138521,0.170491,0.157639,0.137848
W2VEC+TF-IDF,0.676864,0.662533,0.3299,0.34085
W2VEC+MEAN,,,,
SPACY_MD,,,,
RoBERTa + CLS,,,,
RoBERTa + MEAN,,,,


#### <span style="font-family:Courier New; color:#994C00">**W2VEC + MEAN**</span>

In [47]:
# Split into model inputs (x) and gold scores (y)
x_train_w2vec, y_train_w2vec = pair_list_to_x_y(df_w2vec_train)
x_val_w2vec, y_val_w2vec = pair_list_to_x_y(df_w2vec_val)

# Build the tf.data pipelines
train_w2vec = tf.data.Dataset.from_tensor_slices((x_train_w2vec, y_train_w2vec))
# x_train_w2vec is a 2-tuple of arrays, so len(x_train_w2vec) is always 2 and would give
# a shuffle buffer of 2; size the buffer by the number of examples instead.
train_w2vec = train_w2vec.shuffle(buffer_size=len(y_train_w2vec)).batch(batch_size)

val_w2vec = tf.data.Dataset.from_tensor_slices((x_val_w2vec, y_val_w2vec))
val_w2vec = val_w2vec.batch(batch_size)

In [48]:
# Build and compile the model. embedding_size must equal the length of the pooled
# word vectors (presumably wv_model.vector_size == 100 — confirm against the loaded file).
model_w2vec = build_and_compile_model_better(embedding_size=100)

In [49]:
# Train the model; `early_stopping` monitors val_loss and restores the best weights.
model_w2vec.fit(train_w2vec, epochs=num_epochs, validation_data=val_w2vec, callbacks=[early_stopping])

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.7322 - val_loss: 0.7200
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7187 - val_loss: 0.7115
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6993 - val_loss: 0.7028
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6810 - val_loss: 0.6948
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6666 - val_loss: 0.6871


<keras.src.callbacks.history.History at 0x1f99d665710>

In [50]:
# Evaluation: correlations on the training and validation splits, stored in the summary table
(pearson_train_w2vec, spearman_train_w2vec), (pearson_val_w2vec, spearman_val_w2vec) = (
    compute_pearson_spearman(inputs, targets, model_w2vec)
    for inputs, targets in ((x_train_w2vec, y_train_w2vec), (x_val_w2vec, y_val_w2vec))
)

results_df.loc["W2VEC+MEAN"] = [
    pearson_train_w2vec,
    spearman_train_w2vec,
    pearson_val_w2vec,
    spearman_val_w2vec,
]

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
2.6370664
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
2.6323435


In [51]:
results_df

Unnamed: 0,Pearson_train,Spearman_train,Pearson_val,Spearman_val
BOW,0.617894,0.655386,0.204644,0.212015
TF-IDF,0.138521,0.170491,0.157639,0.137848
W2VEC+TF-IDF,0.676864,0.662533,0.3299,0.34085
W2VEC+MEAN,0.264724,0.261133,0.161652,0.192895
SPACY_MD,,,,
RoBERTa + CLS,,,,
RoBERTa + MEAN,,,,


#### <span style="font-family:Courier New; color:#994C00">**SPACY_MD**</span>

In [52]:
# Split into model inputs (x) and gold scores (y)
x_train_spacy, y_train_spacy = pair_list_to_x_y(df_spacy_train)
x_val_spacy, y_val_spacy = pair_list_to_x_y(df_spacy_val)

# Build the tf.data pipelines
train_spacy = tf.data.Dataset.from_tensor_slices((x_train_spacy, y_train_spacy))
# x_train_spacy is a 2-tuple of arrays, so len(x_train_spacy) is always 2 and would give
# a shuffle buffer of 2; size the buffer by the number of examples instead.
train_spacy = train_spacy.shuffle(buffer_size=len(y_train_spacy)).batch(batch_size)

val_spacy = tf.data.Dataset.from_tensor_slices((x_val_spacy, y_val_spacy))
val_spacy = val_spacy.batch(batch_size)


In [53]:
# Build and compile the model with the default embedding_size=300
# (assumes the ca_core_news_md document vectors are 300-dimensional — TODO confirm).
model_spacy = build_and_compile_model_better()

In [54]:
# Train the model; `early_stopping` monitors val_loss and restores the best weights.
model_spacy.fit(train_spacy, epochs=num_epochs, validation_data=val_spacy, callbacks=[early_stopping])

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.7326 - val_loss: 0.7149
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7145 - val_loss: 0.7016
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7004 - val_loss: 0.6870
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6730 - val_loss: 0.6776
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6495 - val_loss: 0.6694


<keras.src.callbacks.history.History at 0x1f99c3958d0>

In [55]:
# Evaluation: correlations on the training and validation splits, stored in the summary table
(pearson_train_spacy, spearman_train_spacy), (pearson_val_spacy, spearman_val_spacy) = (
    compute_pearson_spearman(inputs, targets, model_spacy)
    for inputs, targets in ((x_train_spacy, y_train_spacy), (x_val_spacy, y_val_spacy))
)

results_df.loc["SPACY_MD"] = [
    pearson_train_spacy,
    spearman_train_spacy,
    pearson_val_spacy,
    spearman_val_spacy,
]

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
2.6531262
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
2.646244


In [56]:
results_df

Unnamed: 0,Pearson_train,Spearman_train,Pearson_val,Spearman_val
BOW,0.617894,0.655386,0.204644,0.212015
TF-IDF,0.138521,0.170491,0.157639,0.137848
W2VEC+TF-IDF,0.676864,0.662533,0.3299,0.34085
W2VEC+MEAN,0.264724,0.261133,0.161652,0.192895
SPACY_MD,0.251059,0.239841,0.205978,0.163019
RoBERTa + CLS,,,,
RoBERTa + MEAN,,,,


#### <span style="font-family:Courier New; color:#994C00">**RoBERTa + CLS**</span>

In [57]:
# Split into model inputs (x) and gold scores (y)
x_train_bert_cls, y_train_bert_cls = pair_list_to_x_y(df_BERT_CLS)
x_val_bert_cls, y_val_bert_cls = pair_list_to_x_y(df_BERT_CLS_val)

# Build the tf.data pipelines
train_bert_cls = tf.data.Dataset.from_tensor_slices((x_train_bert_cls, y_train_bert_cls))
# x_train_bert_cls is a 2-tuple of arrays, so its len() is always 2 and would give a
# shuffle buffer of 2; size the buffer by the number of examples instead.
train_bert_cls = train_bert_cls.shuffle(buffer_size=len(y_train_bert_cls)).batch(batch_size)

val_bert_cls = tf.data.Dataset.from_tensor_slices((x_val_bert_cls, y_val_bert_cls))
val_bert_cls = val_bert_cls.batch(batch_size)

In [58]:
# Build and compile the model. embedding_size must equal the width of the transformer
# hidden states (assumed to be 768 for ca_core_news_trf — TODO confirm).
model_bert_cls = build_and_compile_model_better(embedding_size=768)

In [59]:
# Train the model; `early_stopping` monitors val_loss and restores the best weights.
model_bert_cls.fit(train_bert_cls, epochs=num_epochs, validation_data=val_bert_cls, callbacks=[early_stopping])

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.7335 - val_loss: 0.7203
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.7231 - val_loss: 0.7133
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.7075 - val_loss: 0.7054
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.6898 - val_loss: 0.7007
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.6705 - val_loss: 0.6976


<keras.src.callbacks.history.History at 0x1f996fb2710>

In [60]:
# Evaluation: correlations on the training and validation splits, stored in the summary table
(pearson_train_bert_cls, spearman_train_bert_cls), (pearson_val_bert_cls, spearman_val_bert_cls) = (
    compute_pearson_spearman(inputs, targets, model_bert_cls)
    for inputs, targets in ((x_train_bert_cls, y_train_bert_cls), (x_val_bert_cls, y_val_bert_cls))
)

results_df.loc["RoBERTa + CLS"] = [
    pearson_train_bert_cls,
    spearman_train_bert_cls,
    pearson_val_bert_cls,
    spearman_val_bert_cls,
]

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
2.643705
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
2.6436658


#### <span style="font-family:Courier New; color:#994C00">**RoBERTa + MEAN**</span>

In [61]:
# Split into model inputs (x) and gold scores (y)
x_train_bert_mean, y_train_bert_mean = pair_list_to_x_y(df_BERT_MEAN)
x_val_bert_mean, y_val_bert_mean = pair_list_to_x_y(df_BERT_MEAN_val)

# Build the tf.data pipelines
train_bert_mean = tf.data.Dataset.from_tensor_slices((x_train_bert_mean, y_train_bert_mean))
# x_train_bert_mean is a 2-tuple of arrays, so its len() is always 2 and would give a
# shuffle buffer of 2; size the buffer by the number of examples instead.
train_bert_mean = train_bert_mean.shuffle(buffer_size=len(y_train_bert_mean)).batch(batch_size)

val_bert_mean = tf.data.Dataset.from_tensor_slices((x_val_bert_mean, y_val_bert_mean))
val_bert_mean = val_bert_mean.batch(batch_size)

In [62]:
# Build and compile the model. embedding_size must equal the width of the transformer
# hidden states (assumed to be 768 for ca_core_news_trf — TODO confirm).
model_bert_mean = build_and_compile_model_better(embedding_size=768)

In [63]:
# Train the model; `early_stopping` monitors val_loss and restores the best weights.
model_bert_mean.fit(train_bert_mean, epochs=num_epochs, validation_data=val_bert_mean, callbacks=[early_stopping])

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.7349 - val_loss: 0.7254
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.7280 - val_loss: 0.7193
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.7160 - val_loss: 0.7116
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.6931 - val_loss: 0.7029
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.6725 - val_loss: 0.6943


<keras.src.callbacks.history.History at 0x1f9a0031dd0>

In [64]:
# Evaluation: correlations on the training and validation splits, stored in the summary table
(pearson_train_bert_mean, spearman_train_bert_mean), (pearson_val_bert_mean, spearman_val_bert_mean) = (
    compute_pearson_spearman(inputs, targets, model_bert_mean)
    for inputs, targets in ((x_train_bert_mean, y_train_bert_mean), (x_val_bert_mean, y_val_bert_mean))
)

results_df.loc["RoBERTa + MEAN"] = [
    pearson_train_bert_mean,
    spearman_train_bert_mean,
    pearson_val_bert_mean,
    spearman_val_bert_mean,
]


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
2.5859168
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
2.58448


In [65]:
results_df

Unnamed: 0,Pearson_train,Spearman_train,Pearson_val,Spearman_val
BOW,0.617894,0.655386,0.204644,0.212015
TF-IDF,0.138521,0.170491,0.157639,0.137848
W2VEC+TF-IDF,0.676864,0.662533,0.3299,0.34085
W2VEC+MEAN,0.264724,0.261133,0.161652,0.192895
SPACY_MD,0.251059,0.239841,0.205978,0.163019
RoBERTa + CLS,0.221038,0.229657,0.150123,0.157386
RoBERTa + MEAN,0.262524,0.242089,0.155967,0.141411


#### <span style="font-family:Courier New; color:#994C00">**RoBERTa finetuned**</span>

In [66]:
# Get x and y from pairs
def get_x_y(pairs):
    """Split (sentence1, sentence2, score) entries into inputs and targets.

    Args:
        pairs: Indexable entries whose items 0 and 1 are the sentences and
            whose item 2 is the score.

    Returns:
        (x, y): a list of (sentence1, sentence2) tuples and a list of scores.
    """
    sentence_tuples = []
    for entry in pairs:
        sentence_tuples.append((entry[0], entry[1]))
    scores = []
    for entry in pairs:
        scores.append(entry[2])
    return sentence_tuples, scores

In [67]:
# Load the fine-tuned STS model, its tokenizer, and a text-classification pipeline.
# `model` holds the model identifier string, not a model object.
model = 'projecte-aina/roberta-base-ca-v2-cased-sts'
tokenizer = AutoTokenizer.from_pretrained(model)
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer)

def prepare(sentence_pairs):
    """Render every (s1, s2) pair as one string with explicit special tokens.

    Each pair becomes ``<cls> s1<sep><sep> s2<sep>``, where the CLS and SEP
    tokens are read from the module-level ``tokenizer``.

    Parameters
    ----------
    sentence_pairs : iterable of (str, str)
        The sentence pairs to format.

    Returns
    -------
    list[str]
        One formatted string per input pair, in input order.
    """
    return [
        f"{tokenizer.cls_token} {s1}{tokenizer.sep_token}{tokenizer.sep_token} {s2}{tokenizer.sep_token}"
        for s1, s2 in sentence_pairs
    ]

  _torch_pytree._register_pytree_node(


In [68]:
# Split the train / validation records into sentence-pair inputs (x) and
# target scores (y) for the fine-tuned model.
x_train_finetuned,y_train_finetuned = get_x_y(input_pairs)
x_val_finetuned,y_val_finetuned = get_x_y(input_pairs_val)

In [69]:
# Making predictions
# `prepare` already writes the CLS/SEP tokens into each string, so the
# tokenizer is told not to add them a second time.
predictions_train = pipe(prepare(x_train_finetuned), add_special_tokens=False)
predictions_val = pipe(prepare(x_val_finetuned), add_special_tokens=False)

# Convert the scores back to the original 0-5 interval by applying the logit
# to every prediction of both splits, in place.
for split_predictions in (predictions_train, predictions_val):
    for prediction in split_predictions:
        prediction['score'] = scipy.special.logit(prediction['score'])

In [70]:
def pearson_spearman(y_true, y_pred):
    """Return the Pearson and Spearman correlations of predictions vs. labels.

    Parameters
    ----------
    y_true : array-like
        Reference scores. Any shape; flattened to 1-D before use.
    y_pred : array-like
        Predicted scores with the same number of elements as ``y_true``.

    Returns
    -------
    tuple
        ``(pearson, spearman)`` correlation coefficients.
    """
    # np.asarray(...).ravel() instead of .flatten(): plain lists and column
    # vectors are accepted, not only objects that expose a .flatten() method.
    true_flat = np.asarray(y_true).ravel()
    pred_flat = np.asarray(y_pred).ravel()
    pearson, _ = pearsonr(pred_flat, true_flat)
    spearman, _ = spearmanr(pred_flat, true_flat)
    return pearson, spearman

In [71]:
# Compute Pearson and Spearman correlation

# Pull the numeric scores out of the prediction dicts, one array per split.
scores_train_finetuned = np.array([p['score'] for p in predictions_train])
scores_val_finetuned = np.array([p['score'] for p in predictions_val])

pearson_train_finetuned, spearman_train_finetuned = pearson_spearman(
    np.array(y_train_finetuned), scores_train_finetuned
)
pearson_val_finetuned, spearman_val_finetuned = pearson_spearman(
    np.array(y_val_finetuned), scores_val_finetuned
)

# Row order follows the table columns: train Pearson/Spearman, then validation.
results_df.loc["RoBERTa + Finetuned"] = [
    pearson_train_finetuned,
    spearman_train_finetuned,
    pearson_val_finetuned,
    spearman_val_finetuned,
]


In [72]:
# Display the results table, now including the fine-tuned model's row.
results_df

Unnamed: 0,Pearson_train,Spearman_train,Pearson_val,Spearman_val
BOW,0.617894,0.655386,0.204644,0.212015
TF-IDF,0.138521,0.170491,0.157639,0.137848
W2VEC+TF-IDF,0.676864,0.662533,0.3299,0.34085
W2VEC+MEAN,0.264724,0.261133,0.161652,0.192895
SPACY_MD,0.251059,0.239841,0.205978,0.163019
RoBERTa + CLS,0.221038,0.229657,0.150123,0.157386
RoBERTa + MEAN,0.262524,0.242089,0.155967,0.141411
RoBERTa + Finetuned,0.947429,0.961433,0.75226,0.731941


In [73]:
# Make another table, but the predictions are made with cosine distance
from scipy.spatial.distance import cosine

# Empty table for the cosine-similarity scores: one row per representation,
# one column per (metric, split) combination.
results_cosine = pd.DataFrame(
    index=[
        "BOW",
        "TF-IDF",
        "W2VEC+TF-IDF",
        "W2VEC+MEAN",
        "SPACY_MD",
        "RoBERTa + CLS",
        "RoBERTa + MEAN",
        "RoBERTa + Finetuned",
    ],
    columns=["Pearson_train", "Spearman_train", "Pearson_val", "Spearman_val"],
)

# Baseline
def compute_pearson_baseline(x_, y_):
    """Correlate cosine similarities of paired vectors with the target scores.

    Parameters
    ----------
    x_ : sequence of two equally long collections of 1-D vectors
        ``x_[0][i]`` and ``x_[1][i]`` are the vectors of the two sentences of
        pair ``i``.
    y_ : array-like
        Target score of each pair. Any shape; flattened before use.

    Returns
    -------
    tuple
        ``(pearson, spearman)`` correlation between the cosine similarities
        and the target scores.
    """
    y_pred_baseline = []
    for v1, v2 in zip(*x_):
        if np.any(v1) and np.any(v2):
            similarity = 1.0 - scipy.spatial.distance.cosine(v1, v2)
        else:
            # An all-zero vector has no direction: the cosine would divide by
            # a zero norm (RuntimeWarning, undefined result). Score such a
            # pair as "no similarity" so one empty sentence cannot corrupt
            # the correlations.
            similarity = 0.0
        y_pred_baseline.append(similarity)

    # The factor 5 is kept from the original code; both correlations are
    # unaffected by a positive rescaling of the targets.
    targets = np.asarray(y_).ravel() * 5
    pearson, _ = pearsonr(y_pred_baseline, targets)
    spearman, _ = spearmanr(y_pred_baseline, targets)
    return pearson, spearman

# Cosine-similarity scores for every representation.
# Each entry maps a row label of `results_cosine` to its
# ((x_train, y_train), (x_val, y_val)) inputs. The scores are written straight
# into the table rather than into the per-method `pearson_*` / `spearman_*`
# variables, so the values earlier cells computed for `results_df` are not
# overwritten by this cell.
cosine_inputs = {
    "BOW": ((x_train_bow, y_train_bow), (x_val_bow, y_val_bow)),
    "TF-IDF": ((x_train_tfidf, y_train_tfidf), (x_val_tfidf, y_val_tfidf)),
    "W2VEC+TF-IDF": ((x_train_w2vec_tf, y_train_w2vec_tf), (x_val_w2vec_tf, y_val_w2vec_tf)),
    "W2VEC+MEAN": ((x_train_w2vec, y_train_w2vec), (x_val_w2vec, y_val_w2vec)),
    "SPACY_MD": ((x_train_spacy, y_train_spacy), (x_val_spacy, y_val_spacy)),
    "RoBERTa + CLS": ((x_train_bert_cls, y_train_bert_cls), (x_val_bert_cls, y_val_bert_cls)),
    "RoBERTa + MEAN": ((x_train_bert_mean, y_train_bert_mean), (x_val_bert_mean, y_val_bert_mean)),
}

for _method, (_train_xy, _val_xy) in cosine_inputs.items():
    # Row order follows the table columns: train Pearson/Spearman, then validation.
    results_cosine.loc[_method] = [
        *compute_pearson_baseline(*_train_xy),
        *compute_pearson_baseline(*_val_xy),
    ]

  dist = 1.0 - uv / math.sqrt(uu * vv)


In [74]:
# The fine-tuned model's row reuses the correlations computed from its own
# predictions (the same values stored in results_df); it is not cosine-based.
results_cosine.loc["RoBERTa + Finetuned"] = [pearson_train_finetuned,spearman_train_finetuned,pearson_val_finetuned,spearman_val_finetuned]

In [75]:
# Cosine-similarity table ranked by validation Pearson correlation, best first.
results_cosine.sort_values(by = "Pearson_val",ascending = False)

Unnamed: 0,Pearson_train,Spearman_train,Pearson_val,Spearman_val
RoBERTa + Finetuned,0.947429,0.961433,0.75226,0.731941
W2VEC+TF-IDF,0.427605,0.469647,0.45547,0.460746
W2VEC+MEAN,0.42764,0.47288,0.426903,0.438519
RoBERTa + MEAN,0.366229,0.38737,0.304729,0.305552
TF-IDF,0.220896,0.236627,0.215571,0.231584
BOW,0.22873,0.243772,0.210217,0.22536
SPACY_MD,0.236751,0.350948,0.194279,0.283975
RoBERTa + CLS,0.144417,0.260255,0.094534,0.200044


In [76]:
# results_df ranked the same way (validation Pearson, best first), for
# side-by-side comparison with the cosine table.
results_df.sort_values(by = "Pearson_val",ascending = False)

Unnamed: 0,Pearson_train,Spearman_train,Pearson_val,Spearman_val
RoBERTa + Finetuned,0.947429,0.961433,0.75226,0.731941
W2VEC+TF-IDF,0.676864,0.662533,0.3299,0.34085
SPACY_MD,0.251059,0.239841,0.205978,0.163019
BOW,0.617894,0.655386,0.204644,0.212015
W2VEC+MEAN,0.264724,0.261133,0.161652,0.192895
TF-IDF,0.138521,0.170491,0.157639,0.137848
RoBERTa + MEAN,0.262524,0.242089,0.155967,0.141411
RoBERTa + CLS,0.221038,0.229657,0.150123,0.157386
