# <span style="font-family:Courier New; color:#CCCCCC">**Text Similarity with Trainable Embeddings**</span>

## <span style="font-family:Courier New; color:#336666">**Load Data and Imports**</span>

In [113]:
from gensim.utils import simple_preprocess
from typing import Tuple, List, Optional
from gensim.corpora import Dictionary
from datasets import load_dataset # type: ignore
from scipy.stats import pearsonr
import tensorflow as tf
import fasttext # type: ignore
import pandas as pd
import numpy as np
import string
import re
import os

# Load the "projecte-aina/sts-ca" dataset; its "train", "validation" and "test" splits are read below.
# NOTE(review): trust_remote_code=True allows the dataset's own loading script to run locally — confirm the source is trusted.
dataset = load_dataset("projecte-aina/sts-ca", trust_remote_code=True)

### <span style="font-family:Courier New; color:#336633">**Data Preparation**</span>

In [114]:
# Flatten each split into (sentence1, sentence2, label) triples
input_pairs = [
    (row["sentence1"], row["sentence2"], row["label"])
    for row in dataset["train"].to_list()
]
input_pairs_val = [
    (row["sentence1"], row["sentence2"], row["label"])
    for row in dataset["validation"].to_list()
]
input_pairs_test = [
    (row["sentence1"], row["sentence2"], row["label"])
    for row in dataset["test"].to_list()
]

all_input_pairs = [*input_pairs, *input_pairs_val, *input_pairs_test]

# Tokenize both sides of every pair; the dictionary below is built from these tokens
sentences_1_preproc = [simple_preprocess(triple[0]) for triple in all_input_pairs]
sentences_2_preproc = [simple_preprocess(triple[1]) for triple in all_input_pairs]
sentence_pairs = list(zip(sentences_1_preproc, sentences_2_preproc))

# Word -> id mapping over all tokenized sentences, needed to feed the model with ids
sentences_pairs_flattened = [*sentences_1_preproc, *sentences_2_preproc]
diccionario = Dictionary(sentences_pairs_flattened)

In [75]:
def map_word_embeddings(
        sentence: str,
        fixed_dictionary: Dictionary,
        sequence_len: int = 96
) -> np.ndarray:
    """
    Encode a sentence as a fixed-length vector of embedding indices.

    The sentence is tokenized and cut to ``sequence_len`` tokens; tokens that
    are not in the dictionary are then dropped, and the remaining ids are
    packed at the front of the vector. Unused trailing positions stay at 0.

    :param sentence: raw sentence text
    :param fixed_dictionary: dictionary whose ``token2id`` maps tokens to ids
    :param sequence_len: length of the returned vector
    :return: int32 array of shape ``(sequence_len,)``
    """
    token_to_id = fixed_dictionary.token2id
    tokens = simple_preprocess(sentence)[:sequence_len]
    # Ids are shifted by one because 0 is reserved for padding
    known_ids = [token_to_id[tok] + 1 for tok in tokens if tok in token_to_id]
    padded = np.zeros(sequence_len, dtype=np.int32)
    padded[:len(known_ids)] = known_ids
    return padded

def map_pairs(
    sentence_pairs: List[Tuple[str, str, float]],
    fixed_dictionary: Dictionary,
    sequence_len: int = 96
) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    """
    Encode every (sentence, sentence, score) triple as ((ids, ids), score).

    :param sentence_pairs: triples of two raw sentences and their similarity
    :param fixed_dictionary: dictionary passed on to ``map_word_embeddings``
    :param sequence_len: length of each encoded sentence vector
    :return: one ``((vector_1, vector_2), score)`` item per input triple
    """
    return [
        (
            (
                map_word_embeddings(first, fixed_dictionary, sequence_len),
                map_word_embeddings(second, fixed_dictionary, sequence_len),
            ),
            score,
        )
        for first, second, score in sentence_pairs
    ]

# Encode every split with the shared dictionary
mapped_train, mapped_val, mapped_test = (
    map_pairs(split_pairs, fixed_dictionary=diccionario)
    for split_pairs in (input_pairs, input_pairs_val, input_pairs_test)
)

In [76]:
# Build the model inputs (x) and targets (y) from a list of mapped pairs
def pair_list_to_x_y(
    pair_list: List[Tuple[Tuple[np.ndarray, np.ndarray], float]],
) -> Tuple[Tuple[np.ndarray, np.ndarray], np.ndarray]:
    """
    Split ``((vector_1, vector_2), label)`` items into stacked arrays.

    :param pair_list: non-empty list of ``((vector_1, vector_2), label)`` items
    :return: ``((x_1, x_2), y)`` where ``x_1`` and ``x_2`` stack the first and
        second vectors row-wise and ``y`` holds the labels
    :raises ValueError: if ``pair_list`` is empty
    """
    if not pair_list:
        raise ValueError("pair_list must contain at least one ((x1, x2), y) item")
    _x, _y = zip(*pair_list)
    _x_1, _x_2 = zip(*_x)
    # np.vstack replaces np.row_stack, which is a deprecated alias since NumPy 2.0
    return (np.vstack(_x_1), np.vstack(_x_2)), np.array(_y)

# Build the train and validation arrays
x_train, y_train = pair_list_to_x_y(mapped_train)
x_val, y_val = pair_list_to_x_y(mapped_val)

# Prepare the training and validation datasets
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
# x_train is a (left, right) tuple, so len(x_train) is always 2; size the
# shuffle buffer by the number of examples so the whole set is shuffled.
train_dataset = train_dataset.shuffle(buffer_size=len(y_train)).batch(64)

val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(64)

### <span style="font-family:Courier New; color:#336633">**Train Models**</span>

In [107]:
# Regression similarity model template
def model(
    input_length: int = 96,
    dictionary_size: int = 1000,
    embedding_size: int = 16,
    pretrained_weights: Optional[np.ndarray] = None,
    learning_rate: float = 1e-3,
    trainable: bool = False,
) -> tf.keras.Model:
    """
    Build and compile a two-input regression model that scores a sentence pair.

    Both inputs go through one shared embedding layer. Each embedded sequence
    is averaged over its non-padding positions, L2-normalised, and multiplied
    element-wise with the other; a small dense head maps that vector to a
    single linear output.

    :param input_length: number of token ids in each input sequence
    :param dictionary_size: number of rows of the embedding table; replaced by
        ``pretrained_weights.shape[0]`` when pretrained weights are given
    :param embedding_size: embedding width; replaced by
        ``pretrained_weights.shape[1]`` when pretrained weights are given
    :param pretrained_weights: optional 2-D matrix used to initialise the
        embedding table
    :param learning_rate: learning rate of the Adam optimizer
    :param trainable: whether a pretrained embedding table is updated during
        training; not used when ``pretrained_weights`` is None
    :return: the model, compiled with a mean-squared-error loss
    """
    # Input layers
    input_1 = tf.keras.Input(shape=(input_length,), dtype=tf.int32)
    input_2 = tf.keras.Input(shape=(input_length,), dtype=tf.int32)

    # Shared embedding layer. `input_length` is not passed to Embedding: the
    # argument is deprecated in Keras 3 and the sequence length is already
    # fixed by the Input shapes above.
    if pretrained_weights is None:
        embedding = tf.keras.layers.Embedding(
            dictionary_size, embedding_size, mask_zero=True
        )
    else:
        dictionary_size, embedding_size = pretrained_weights.shape
        initializer = tf.keras.initializers.Constant(pretrained_weights)
        embedding = tf.keras.layers.Embedding(
            dictionary_size,
            embedding_size,
            mask_zero=True,
            embeddings_initializer=initializer,
            trainable=trainable,
        )

    # Apply the same embedding to both input sequences
    embedded_1 = embedding(input_1)
    embedded_2 = embedding(input_2)

    # Masked global average pooling: positions holding id 0 (padding) are excluded
    _input_mask_1 = tf.keras.layers.Lambda(lambda x: tf.not_equal(x, 0))(input_1)
    _input_mask_2 = tf.keras.layers.Lambda(lambda x: tf.not_equal(x, 0))(input_2)
    pooled_1 = tf.keras.layers.GlobalAveragePooling1D()(embedded_1, mask=_input_mask_1)
    pooled_2 = tf.keras.layers.GlobalAveragePooling1D()(embedded_2, mask=_input_mask_2)

    # Element-wise product of the L2-normalised sentence vectors. The product
    # is NOT summed here (the sum would be the cosine similarity); the dense
    # head below receives the per-dimension terms.
    def normalized_product(x):
        x1, x2 = x
        x1_normalized = tf.keras.backend.l2_normalize(x1, axis=1)
        x2_normalized = tf.keras.backend.l2_normalize(x2, axis=1)
        return x1_normalized * x2_normalized

    output = tf.keras.layers.Lambda(normalized_product)([pooled_1, pooled_2])
    output = tf.keras.layers.Dense(64, activation="relu")(output)
    output = tf.keras.layers.Dropout(0.5)(output)
    output = tf.keras.layers.Dense(128, activation="relu")(output)
    output = tf.keras.layers.Dropout(0.25)(output)
    output = tf.keras.layers.Dense(1)(output)

    # Define the model (named so it does not shadow this function inside its own body)
    similarity_model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)

    # Compile the model
    similarity_model.compile(
        loss='mean_squared_error',
        optimizer=tf.keras.optimizers.Adam(learning_rate),
    )

    return similarity_model

#### <span style="font-family:Courier New; color:#994C00">**Randomly Initialized**</span>

In [100]:
# Baseline with a randomly initialised 16-dimensional embedding.
# The vocabulary size gets +1 because index 0 is reserved for padding.
model_random = model(dictionary_size = len(diccionario) + 1, embedding_size = 16)
model_random.summary()

In [108]:
# Train the baseline for 128 epochs, using val_dataset as validation data.
# NOTE(review): in the recorded output val_loss stays around 0.69 while the training loss is ~0.11 — looks like overfitting; consider early stopping.
model_random.fit(train_dataset, epochs=128, validation_data=val_dataset)

Epoch 1/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1215 - val_loss: 0.6794
Epoch 2/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1256 - val_loss: 0.6869
Epoch 3/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1098 - val_loss: 0.6881
Epoch 4/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1242 - val_loss: 0.6941
Epoch 5/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.1165 - val_loss: 0.6890
Epoch 6/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1074 - val_loss: 0.6898
Epoch 7/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1227 - val_loss: 0.6929
Epoch 8/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1148 - val_loss: 0.6869
Epoch 9/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x2816e57ce50>

#### <span style="font-family:Courier New; color:#994C00">**Word2Vec & FastText**</span>

In [80]:
cbow = fasttext.load_model('Word2Vec/dist_models/cbow_whole.bin')
fasttext = fasttext.load_model('Word2Vec/dist_models/fasttext_whole.bin')

In [81]:
#Map trained embedddings with input data ids
embedding_size = 100

#cbow
_pretrained_weights_cbow = np.zeros(
            (len(diccionario.token2id) + 1, embedding_size),  dtype=np.float32)
for token, _id in diccionario.token2id.items():
    if token in cbow:
        _pretrained_weights_cbow[_id + 1] = cbow[token]

#fasttext
_pretrained_weights_fasttext = np.zeros(
            (len(diccionario.token2id) + 1, embedding_size),  dtype=np.float32)
for token, _id in diccionario.token2id.items():
    if token in fasttext:
        _pretrained_weights_fasttext[_id + 1] = fasttext[token]

In [110]:
# Build one model per pretrained matrix; the embedding table size is taken from the matrix shape,
# and trainable=True lets the pretrained embeddings be updated during training.
model_cbow = model(pretrained_weights=_pretrained_weights_cbow, trainable=True)
model_fasttext = model(pretrained_weights=_pretrained_weights_fasttext, trainable=True)
model_fasttext.summary()

In [111]:
# Train both pretrained-embedding models with the same datasets and epoch count as the random baseline.
model_cbow.fit(train_dataset, epochs=128, validation_data=val_dataset)
model_fasttext.fit(train_dataset, epochs=128, validation_data=val_dataset)

Epoch 1/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 6.2766 - val_loss: 1.8101
Epoch 2/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 1.1401 - val_loss: 0.7293
Epoch 3/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.8345 - val_loss: 0.7066
Epoch 4/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.8250 - val_loss: 0.7033
Epoch 5/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7742 - val_loss: 0.6976
Epoch 6/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.7992 - val_loss: 0.6904
Epoch 7/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7355 - val_loss: 0.6848
Epoch 8/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7302 - val_loss: 0.6788
Epoch 9/128
[1m33/33[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x281cb6cc580>

### <span style="font-family:Courier New; color:#336633">**Evaluation**</span>

In [84]:
# Results table: one row per model, one column per data split.
index = ['random', 'cbow', 'fasttext']
results_df = pd.DataFrame(index=index, columns=['cf Pearson (train)', 'cf Pearson (val)', 'cf Pearson (test)'])

# Test arrays, built the same way as the train and validation arrays
x_test, y_test = pair_list_to_x_y(mapped_test)
def compute_pearson(x_, y_, model):
    """
    Return the Pearson correlation between a model's predictions and the targets.

    :param x_: inputs forwarded unchanged to ``model.predict``
    :param y_: array of target values
    :param model: object exposing a ``predict`` method
    :return: the correlation coefficient (the p-value is discarded)
    """
    predicted = model.predict(x_).flatten()
    expected = y_.flatten()
    # pearsonr returns (statistic, p-value); only the statistic is kept
    return pearsonr(predicted, expected)[0]

In [112]:
# Fill the results table with the Pearson correlation of every model on every split.
# The loop variables are deliberately not called `model`: that would rebind the
# module-level model() builder to a string and break any later call to it.
models = [model_random, model_cbow, model_fasttext]
for model_name, trained_model in zip(index, models):
    results_df.loc[model_name, 'cf Pearson (train)'] = compute_pearson(x_train, y_train, trained_model)
    results_df.loc[model_name, 'cf Pearson (val)'] = compute_pearson(x_val, y_val, trained_model)
    results_df.loc[model_name, 'cf Pearson (test)'] = compute_pearson(x_test, y_test, trained_model)
results_df

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


Unnamed: 0,cf Pearson (train),cf Pearson (val),cf Pearson (test)
random,0.982905,0.302388,0.391531
cbow,0.989596,0.306419,0.339157
fasttext,0.985586,0.281861,0.315383
