# <span style="font-family:Courier New; color:#CCCCCC">**Catalan Word2Vec**</span>

## <span style="font-family:Courier New; color:#336666">**Load Data and Imports**</span>

In [38]:
from gensim.utils import simple_preprocess
from typing import Tuple, List, Optional
from gensim.corpora import Dictionary
from datasets import load_dataset # type: ignore
from scipy.stats import pearsonr
import tensorflow as tf
import fasttext # type: ignore
import pandas as pd
import numpy as np
import string
import re
import os

# Load the dataset
dataset = load_dataset("projecte-aina/catalan_general_crawling", split = 'train', trust_remote_code=True)
# Extract text data from the dataset
docs = dataset['text']

## <span style="font-family:Courier New; color:#336666">**Basic Preprocessing**</span>

In [2]:
# Preprocess the sentences
def preprocess(s):

    # Convert to lowercase
    s = s.lower()
    # Replace apostrophes with a space to split words connected by apostrophes
    s = re.sub(r"'", " ", s)
    # Replace hyphens with underscores
    s = re.sub(r"[-]", "_", s) 
    # Remove punctuation
    s =  re.sub(r'[{}]+'.format(re.escape(string.punctuation.replace('_', ''))), '', s)
    #Normalize spaces to 1
    s = re.sub(" +", " ", s.strip())
    # Replace pure number tokens with <num>
    s =  re.sub(r'\b\d+\b', '<num>', s)
    
    return s

In [3]:
# Save the text data to a text file
with open('data/input_1.txt', 'w', encoding='utf-8') as f:
    for doc in docs:
        f.write("%s\n" % preprocess(doc))

In [5]:
#Fetch different size data files
def read_and_write_partial_file(in_file, out_file, size):
    with open(in_file, 'rb') as inf:
        with open(out_file, 'wb') as outf:
            partial_data = inf.read(size)
            outf.write(partial_data)

whole_data_size = os.path.getsize('data/input_1.txt')         
MB = 1024*1024 #bytes
sizes = [100*MB, 500*MB, 1024*MB]
for size in sizes:
    out_file = f'data/input_{round(size/whole_data_size, 2)}.txt'
    read_and_write_partial_file('data/input_1.txt', out_file, size)

### <span style="font-family:Courier New; color:#336633">**Train Models**</span>

#### <span style="font-family:Courier New; color:#994C00">**Skip-gram**</span>

In [6]:
#First model -> 100MB
model_100 = fasttext.train_unsupervised(
    'data/input_0.04.txt', model='skipgram', 
    dim = 100, 
    ws = 5, 
    minCount = 5, 
    minn = 0, maxn = 0, #skip-gram
    neg = 4, 
    t = 1e-5, #prevent stopwords from impacting training time
)
model_100.save_model("dist_models/skip-gram_100MB.bin")

In [22]:
#Second model -> 500MB
model_500 = fasttext.train_unsupervised(
    'data/input_0.2.txt', model='skipgram', 
    dim = 100, 
    ws = 5, 
    minCount = 5, 
    minn = 0, maxn = 0, #skip-gram
    neg = 4, 
    t = 1e-5, #prevent stopwords from impacting training time
)
model_500.save_model("dist_models/skip-gram_500MB.bin")

In [26]:
#Third model -> 1GB
model_1GB = fasttext.train_unsupervised(
    'data/input_0.4.txt', model='skipgram', 
    dim = 100, 
    ws = 5, 
    minCount = 5, 
    minn = 0, maxn = 0, #skip-gram
    neg = 4, 
    t = 1e-6, #prevent stopwords from impacting training time
)
model_1GB.save_model("dist_models/skip-gram_1GB.bin")

In [29]:
#Fourth model -> whole dataset
model_whole = fasttext.train_unsupervised(
    'data/input_1.txt', model='skipgram', 
    dim = 100, 
    ws = 5, 
    minCount = 5, 
    minn = 0, maxn = 0, #skip-gram
    neg = 4, 
    t = 1e-7, #prevent stopwords from impacting training time
)
model_whole.save_model("dist_models/skip-gram_whole.bin")

#### <span style="font-family:Courier New; color:#994C00">**CBOW**</span>

In [45]:
model_cbow = fasttext.train_unsupervised(
    'data/input_1.txt', model='cbow', 
    dim = 100, 
    ws = 5, 
    minCount = 5, 
    minn = 0, maxn = 0, #cbow
    neg = 4, 
    t = 1e-7, #prevent stopwords from impacting training time
)
model_cbow.save_model("dist_models/cbow_whole.bin")

#### <span style="font-family:Courier New; color:#994C00">**FastText**</span>

In [3]:
model_fasttext = fasttext.train_unsupervised(
    'data/input_1.txt', model='skipgram', 
    dim = 100, 
    ws = 5, 
    minCount = 5, 
    minn = 3, maxn = 6, #fasttext
    neg = 5, 
    t = 1e-7, #prevent stopwords from impacting training time
)   
model_fasttext.save_model("dist_models/fasttext_whole.bin")

## <span style="font-family:Courier New; color:#336666">**Intrinsic Evaluation**</span>

### <span style="font-family:Courier New; color:#336633">**Skip-gram**</span>

In [39]:
model_100 = fasttext.load_model('dist_models/skip-gram_100MB.bin')
model_500 = fasttext.load_model('dist_models/skip-gram_500MB.bin')
model_1GB = fasttext.load_model('dist_models/skip-gram_1GB.bin')
model_whole = fasttext.load_model('dist_models/skip-gram_whole.bin')
models = [model_100, model_500, model_1GB, model_whole]

In [26]:
def cosine_similarity(vec1, vec2):
    dot_product = sum(a * b for a, b in zip(vec1, vec2))
    magnitude1 = sum(a ** 2 for a in vec1) ** 0.5
    magnitude2 = sum(b ** 2 for b in vec2) ** 0.5
    return dot_product / (magnitude1 * magnitude2)

index = ['100MB', '500MB', '1GB', 'whole']
columns = [('home', 'dona'), ('blau', 'vermell'), ('dormir', 'llit'), ('feliç', 'felicitat')]

sim_df = pd.DataFrame()
for i, model in enumerate(models):
    for w1, w2 in columns:
        vec1 = model[w1]
        vec2 = model[w2]
        sim_df.loc[index[i], f'({w1}, {w2})'] = cosine_similarity(vec1, vec2)
sim_df

Unnamed: 0,"(home, dona)","(blau, vermell)","(dormir, llit)","(feliç, felicitat)"
100MB,0.452221,0.672529,0.597621,0.495984
500MB,0.556385,0.714803,0.706539,0.532513
1GB,0.649668,0.811982,0.743771,0.614395
whole,0.641329,0.835277,0.843755,0.729914


<span style="font-family:Courier New">As we can see, the more training data is used to train the models, the better they capture similarity between words. </span>

### <span style="font-family:Courier New; color:#336633">**Skip-gram, CBOW or FastText?**</span>

In [44]:
skipgram = fasttext.load_model('dist_models/skip-gram_whole.bin')
cbow = fasttext.load_model('dist_models/cbow_whole.bin')
fasttext = fasttext.load_model('dist_models/fasttext_whole.bin')
models = [skipgram, cbow, fasttext]

#### <span style="font-family:Courier New; color:#994C00">**Distància**</span>

In [49]:
index = ['skip-gram', 'cbow', 'fasttext']
columns = [('opípar', 'majestuós'), ('blau', 'vermell'), ('dormir', 'llit'), ('feliç', 'felicitat'), ('treball', 'plàtan')]

dist_df = pd.DataFrame()
for i, model in enumerate(models):
    for w1, w2 in columns:
        vec1 = model[w1]
        vec2 = model[w2]
        dist_df.loc[index[i], f'({w1}, {w2})'] = cosine_similarity(vec1, vec2)
dist_df

Unnamed: 0,"(opípar, majestuós)","(blau, vermell)","(dormir, llit)","(feliç, felicitat)","(treball, plàtan)"
skip-gram,0.764406,0.835277,0.843755,0.729914,0.177631
cbow,0.721126,0.941667,0.92506,0.838973,0.055132
fasttext,0.515112,0.703621,0.609546,0.665305,0.22475


<span style="font-family:Courier New">At this point, CBOW seems to be the best embeddding model, since its coisine similarity corresponds well with actual words' semantics, in both senses: see that `treball` and `platan` coisine value is very low, but `blau` and `vermell` very high. </span>

#### <span style="font-family:Courier New; color:#994C00">**Similitud**</span>

In [10]:
index = ['skip-gram', 'cbow', 'fasttext']

sim_df = pd.DataFrame(columns=list(range(1,6)))
for i, model in enumerate(models):
    nn = model.get_nearest_neighbors('croqueta', k = 5)
    sim_df.loc[index[i]] = [w for _,w in nn]
sim_df

Unnamed: 0,1,2,3,4,5
skip-gram,salmorejo,gaspatxo,caneló,verduretes,escalivat
cbow,bitxo,carbonara,flam,melós,xarrup
fasttext,croquet,broqueta,croquetes,coqueta,bajoqueta


#### <span style="font-family:Courier New; color:#994C00">**Analogia**</span>

In [55]:
index = ['skip-gram', 'cbow', 'fasttext']

sim_df = pd.DataFrame(columns=list(range(1,6)))
for i, model in enumerate(models):
    nn = model.get_analogies('fill', 'filla', 'pare', k = 5)
    sim_df.loc[index[i]] = [w for _,w in nn]
sim_df

Unnamed: 0,1,2,3,4,5
skip-gram,amic,germà,cosí,sacerdot,tiet
cbow,amic,amiga,germana,mare,germà
fasttext,filll,fill_pare,filló,pare_mare,espòs


## <span style="font-family:Courier New; color:#336666">**Extrinsic Evaluation**</span>

<span style="font-family:Courier New">In this section, we are going to test the embeddings in a specific task: similarity regression model.</span>

### <span style="font-family:Courier New; color:#336633">**Data Preparation**</span>

In [22]:
dataset = load_dataset("projecte-aina/sts-ca", trust_remote_code=True)

input_pairs = [(e["sentence1"], e["sentence2"], e["label"], ) for e in dataset["train"].to_list()]
input_pairs_val = [(e["sentence1"], e["sentence2"], e["label"], ) for e in dataset["validation"].to_list()]
input_pairs_test = [(e["sentence1"], e["sentence2"], e["label"], ) for e in dataset["test"].to_list()]

all_input_pairs = input_pairs + input_pairs_val + input_pairs_test

# Preprocesamiento de las oraciones y creación del diccionario
sentences_1_preproc = [simple_preprocess(sentence_1) for sentence_1, _, _ in all_input_pairs]
sentences_2_preproc = [simple_preprocess(sentence_2) for _, sentence_2, _ in all_input_pairs]
sentence_pairs = list(zip(sentences_1_preproc, sentences_2_preproc))

# Versión mapeada de palabras a ids para poder entrenar el modelo
sentences_pairs_flattened = sentences_1_preproc + sentences_2_preproc
diccionario = Dictionary(sentences_pairs_flattened)

In [24]:
def map_word_embeddings(
        sentence: str,
        fixed_dictionary: Dictionary,
        sequence_len: int = 96
) -> np.ndarray:
    """
    Map to word-embedding indices
    :param sentence:
    :param sequence_len:
    :param fixed_dictionary:
    :return:
    """
    sentence_preproc = simple_preprocess(sentence)[:sequence_len]
    _vectors = np.zeros(sequence_len, dtype=np.int32)
    index = 0
    for word in sentence_preproc:
        if word in fixed_dictionary.token2id:
            # Sumo 1 porque el valor 0 está reservado a padding
            _vectors[index] = fixed_dictionary.token2id[word] + 1
            index += 1    
    return _vectors

def map_pairs(
    sentence_pairs: List[Tuple[str, str, float]],
    fixed_dictionary: Dictionary,
    sequence_len: int = 96
) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    
    # Mapeo de los pares de oraciones a pares de vectores
    pares_vectores = []
    for (sentence_1, sentence_2, similitud) in sentence_pairs:
        vector1 = map_word_embeddings(sentence_1, fixed_dictionary, sequence_len)
        vector2 = map_word_embeddings(sentence_2, fixed_dictionary, sequence_len)
        # Añadir a la lista
        pares_vectores.append(((vector1, vector2), similitud))
    return pares_vectores

mapped_train = map_pairs(input_pairs, fixed_dictionary = diccionario)
mapped_val = map_pairs(input_pairs_val, fixed_dictionary = diccionario)
mapped_test = map_pairs(input_pairs_test, fixed_dictionary = diccionario)

In [26]:
# Obtener x_train e y_train
def pair_list_to_x_y(pair_list: List[Tuple[Tuple[np.ndarray, np.ndarray], int]]) -> Tuple[Tuple[np.ndarray, np.ndarray], np.ndarray]:
    _x, _y = zip(*pair_list)
    _x_1, _x_2 = zip(*_x)
    return (np.row_stack(_x_1), np.row_stack(_x_2)), np.array(_y)

# Obtener las listas de train y test
x_train, y_train = pair_list_to_x_y(mapped_train)
x_val, y_val = pair_list_to_x_y(mapped_val)

# Preparar los conjuntos de datos de entrenamiento y validación
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=len(x_train)).batch(64)

val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(64)

### <span style="font-family:Courier New; color:#336633">**Build Models and Train**</span>

In [31]:
#Regression similarity model template
def model(
    input_length: int = 96,
    dictionary_size: int = 1000,
    embedding_size: int = 16,
    pretrained_weights: Optional[np.ndarray] = None,
    learning_rate: float = 1e-3,
) -> tf.keras.Model:
    # Input layers
    input_1 = tf.keras.Input(shape=(input_length,), dtype=tf.int32)
    input_2 = tf.keras.Input(shape=(input_length,), dtype=tf.int32)

    # Embedding layer
    if pretrained_weights is None:
        embedding = tf.keras.layers.Embedding(
            dictionary_size, embedding_size, input_length=input_length, mask_zero=True
        )
    else:
        dictionary_size = pretrained_weights.shape[0]
        embedding_size = pretrained_weights.shape[1]
        initializer = tf.keras.initializers.Constant(pretrained_weights)
        embedding = tf.keras.layers.Embedding(
            dictionary_size,
            embedding_size,
            input_length=input_length,
            mask_zero=True,
            embeddings_initializer=initializer,
            trainable=False,
        )

    # Apply embedding to input sequences
    embedded_1 = embedding(input_1)
    embedded_2 = embedding(input_2)
    # Global average pooling
    _input_mask_1 = tf.keras.layers.Lambda(lambda x: tf.not_equal(x, 0))(input_1)
    _input_mask_2 = tf.keras.layers.Lambda(lambda x: tf.not_equal(x, 0))(input_2)
    pooled_1 = tf.keras.layers.GlobalAveragePooling1D()(embedded_1, mask=_input_mask_1)
    pooled_2 = tf.keras.layers.GlobalAveragePooling1D()(embedded_2, mask=_input_mask_2)

    # Compute the cosine distance using a Lambda layer
    def normalized_product(x):
        x1, x2 = x
        x1_normalized = tf.keras.backend.l2_normalize(x1, axis=1)
        x2_normalized = tf.keras.backend.l2_normalize(x2, axis=1)
        return x1_normalized * x2_normalized
    
    output = tf.keras.layers.Lambda(normalized_product)([pooled_1, pooled_2])
    output = tf.keras.layers.Dense(64, activation="relu")(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(128, activation="relu")(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(1)(output)

    # Define the model
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)

    # Compile the model
    model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(learning_rate))

    return model

In [29]:
#Map trained embedddings with input data ids

embedding_size = 100
#skipgram
_pretrained_weights_skipgram = np.zeros(
            (len(diccionario.token2id) + 1, embedding_size),  dtype=np.float32)
for token, _id in diccionario.token2id.items():
    if token in skipgram:
        _pretrained_weights_skipgram[_id + 1] = skipgram[token]

#cbow
_pretrained_weights_cbow = np.zeros(
            (len(diccionario.token2id) + 1, embedding_size),  dtype=np.float32)
for token, _id in diccionario.token2id.items():
    if token in cbow:
        _pretrained_weights_cbow[_id + 1] = cbow[token]

#fasttext
_pretrained_weights_fasttext = np.zeros(
            (len(diccionario.token2id) + 1, embedding_size),  dtype=np.float32)
for token, _id in diccionario.token2id.items():
    if token in fasttext:
        _pretrained_weights_fasttext[_id + 1] = fasttext[token]

In [32]:
# Build and compile the models
model_skipgram = model(pretrained_weights=_pretrained_weights_skipgram)
model_cbow = model(pretrained_weights=_pretrained_weights_cbow)
model_fasttext = model(pretrained_weights=_pretrained_weights_fasttext)
model_skipgram.summary()

In [112]:
# Train the models
model_skipgram.fit(train_dataset, epochs=128, steps_per_epoch=100, validation_data=val_dataset)
model_cbow.fit(train_dataset, epochs=128, steps_per_epoch=100, validation_data=val_dataset)
model_fasttext.fit(train_dataset, epochs=128, steps_per_epoch=100, validation_data=val_dataset)
models = [model_skipgram, model_cbow, model_fasttext]

Epoch 1/128
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5.7681 - val_loss: 2.4657
Epoch 2/128
[1m 17/100[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 3ms/step - loss: 1.7355

  self.gen.throw(typ, value, traceback)


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1801 - val_loss: 0.7164
Epoch 3/128
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.7710 - val_loss: 0.7110
Epoch 4/128
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.7460 - val_loss: 0.7075
Epoch 5/128
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.7480 - val_loss: 0.7047
Epoch 6/128
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7418 - val_loss: 0.7021
Epoch 7/128
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.7501 - val_loss: 0.6991
Epoch 8/128
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.7332 - val_loss: 0.6964
Epoch 9/128
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7250 - val_loss: 0.6941
Epoch 10/128
[1m100/100[0m [32m━━━━━━━━━━

### <span style="font-family:Courier New; color:#336633">**Evaluation**</span>

In [36]:
index = ['skipgram', 'cbow', 'fasttext']
results_df = pd.DataFrame(index=index, columns=['cf Pearson (train)', 'cf Pearson (val)', 'cf Pearson (test)'])

x_test, y_test = pair_list_to_x_y(mapped_test)
def compute_pearson(x_, y_, model):
    
    # Get predictions for the model
    y_pred = model.predict(x_)
    # Compute pearson correlation
    correlation, _ = pearsonr(y_pred.flatten(), y_.flatten())
    return correlation

In [37]:
for i, model in enumerate(index):
    results_df.loc[model, 'cf Pearson (train)'] = compute_pearson(x_train, y_train, models[i])
    results_df.loc[model, 'cf Pearson (val)'] = compute_pearson(x_val, y_val, models[i])
    results_df.loc[model, 'cf Pearson (test)'] = compute_pearson(x_test, y_test, models[i])
results_df

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


Unnamed: 0,cf Pearson (train),cf Pearson (val),cf Pearson (test)
skipgram,0.730865,0.423181,0.441365
cbow,0.637713,0.342591,0.461602
fasttext,0.740428,0.475816,0.504037
