In [None]:
import numpy as np
import gensim.downloader as api #word2vec
from scipy.spatial.distance import cosine,euclidean,cityblock #distances

# Embeddings

Here we can see two different pre-trained models to compute embeddings.
The first one loads the pre-trained Google News Word2Vec model with 300-dimensional vectors while the second model loads the pre-trained Facebook model with, again, 300-dimensional vectors.

In both functions the input will be the text (the question) for which we want to create the embedding, and as output we will obtain the mean of the embeddings of all the words in the text.

In [None]:
import gensim.downloader as api

# Load the pre-trained Google News Word2Vec vectors (300 dimensions) through
# gensim's downloader API; the resulting object is what word2vec_embeddings
# expects as its `model` argument.
w2vec_model = api.load("word2vec-google-news-300" )
def word2vec_embeddings(text,model):
    """Return the mean Word2Vec embedding of the words in ``text``.

    Args:
        text: The sentence (question) to embed.
        model: A gensim ``KeyedVectors``-like object exposing
            ``key_to_index`` and ``get_vector``.

    Returns:
        The element-wise mean of the vectors of every in-vocabulary word,
        or ``None`` when no word of ``text`` is in the vocabulary.
    """
    # Lowercase and split on whitespace to get the individual words
    words = text.lower().split()

    # ``key_to_index`` is a dict, so each membership test is a hash lookup.
    # Testing against the ``index_to_key`` list instead scans the whole
    # vocabulary for every single word, which dominates the running time.
    vocabulary = model.key_to_index
    embeddings = [model.get_vector(word) for word in words if word in vocabulary]

    # No known word: there is nothing to average
    if not embeddings:
        return None

    # Average the word vectors into a single embedding for the entire text
    return np.mean(embeddings, axis=0)

In [None]:
!pip install fasttext
import fasttext.util
# Fetch the English fastText model; if_exists='ignore' is meant to skip the
# download when the file is already present. Then load the binary
# 'cc.en.300.bin' (relative path, so it is looked up in the working directory).
fasttext.util.download_model('en', if_exists='ignore')
ft_model = fasttext.load_model('cc.en.300.bin')

def fasttext_embeddings(text,model):
    """Return the mean fastText embedding of the words in ``text``.

    Args:
        text: The sentence (question) to embed.
        model: A loaded fastText model exposing ``get_word_id`` and
            ``get_word_vector``.

    Returns:
        The element-wise mean of the vectors of every in-vocabulary word,
        or ``None`` when no word of ``text`` is in the vocabulary.
    """
    # Lowercase and split on whitespace to get the individual words
    words = text.lower().split()

    # ``get_word_id`` returns -1 for words outside the model's dictionary.
    # It replaces ``word in model``, which searches the model's full word
    # list for every word and is therefore very slow on large vocabularies.
    embeddings = [
        model.get_word_vector(word)
        for word in words
        if model.get_word_id(word) != -1
    ]

    # No known word: there is nothing to average
    if not embeddings:
        return None

    # Average the word vectors into a single embedding for the entire text
    return np.mean(embeddings, axis=0)

We realized that the second model was way faster than the first one when computing the embeddings

# Distances

The structure of the following functions will be similar. We want to measure the distance between two embeddings. Each function takes the two (previously calculated) embeddings as input and returns the distance between them (subtracting from one if necessary). If either of the embeddings is None, it will return None.

In [None]:
#Cosine Distance

def cos_distance(embedding1, embedding2):
    """Cosine distance between two embeddings, or ``None`` if either is missing."""
    # Guard clause: a missing embedding cannot be compared
    if embedding1 is None:
        return None
    if embedding2 is None:
        return None

    # scipy's ``cosine`` is already a distance: using 1 - cosine(...) instead
    # would yield 1 for two identical questions.
    result = cosine(embedding1, embedding2)
    return result

#Euclidean Distance

def euclidean_distance(embedding1, embedding2):
    """Euclidean distance between two embeddings, or ``None`` if either is missing."""
    # The distance is only evaluated when both embeddings are available
    has_missing = embedding1 is None or embedding2 is None
    return None if has_missing else euclidean(embedding1, embedding2)

def manhattan_distance(embedding1, embedding2):
    """Manhattan (city-block) distance between two embeddings, or ``None`` if either is missing."""
    both_present = embedding1 is not None and embedding2 is not None
    if both_present:
        # Sum of the absolute coordinate differences
        return cityblock(embedding1, embedding2)
    return None

This function computes the distance between two embeddings with each of the previously defined distances (cosine, Euclidean and Manhattan).

In [None]:
def distance_embeddings(embedding1, embedding2, index, expected_dim=100):
    """Compute the cosine, Euclidean and Manhattan distances between two embeddings.

    Args:
        embedding1: First embedding (array-like) or ``None``.
        embedding2: Second embedding (array-like) or ``None``.
        index: Identifier of the row being processed; only used in the
            diagnostic output printed for unexpected shapes.
        expected_dim: Expected length of each embedding. Defaults to 100,
            the value that was previously hard-coded.

    Returns:
        A tuple ``(cosine, euclidean, manhattan)``. When either embedding is
        ``None`` the tuple is ``(None, None, None)``, in line with the
        single-distance helpers, instead of failing inside scipy.
    """
    # A missing embedding cannot be compared with anything
    if embedding1 is None or embedding2 is None:
        return None, None, None

    embedding1 = np.array(embedding1)
    embedding2 = np.array(embedding2)

    # Report (but do not reject) embeddings whose shape is not the expected one
    expected_shape = (expected_dim,)
    if embedding1.shape != expected_shape or embedding2.shape != expected_shape:
        print("Row:", index)
        print("embedding1 shape:", embedding1.shape)
        print("embedding2 shape:", embedding2.shape)

    # Compute the three distances between the two embeddings
    distances_cos = cosine(embedding1, embedding2)
    distances_euc = euclidean(embedding1, embedding2)
    distances_manh = cityblock(embedding1, embedding2)

    return distances_cos, distances_euc, distances_manh

# Final Approach

We realized that Fast Text took a considerable time to calculate the embeddings. Therefore, we chose not to use a pre-trained model and train the fasttext ourselves based on the questions we had available. This final approach will have two variants: the supervised and the unsupervised way.

## Supervised

In order to train the fasttext model we need to prepare a certain format for the questions, that is:

`__label__positive questions...`

`__label__negative questions...`


In our case, our labels are 0 (different questions) or 1 (similar questions).

In [None]:
def training_format_sup(x,y,char):
    """Write labelled question pairs to a file in fastText's supervised format.

    Every output line has the form
    ``__label__<label> <question 1> <question 2>``.

    Args:
        x: Iterable of question pairs; each pair is indexed with 0 and 1.
        y: Iterable of labels, in the same order as ``x``.
        char: Path of the file to write (overwritten if it already exists).
    """
    # Write UTF-8 explicitly so the output does not depend on the platform's
    # default locale encoding.
    with open(char, 'w', encoding='utf-8') as f:
        # zip pairs questions and labels by position; y[i] would be a
        # label-based lookup if y were a pandas Series with a shuffled index.
        for question_pair, label_value in zip(x, y):
            # One example per line: an embedded newline would split the
            # example and leave its second half without a label.
            first = str(question_pair[0]).replace('\n', '')
            second = str(question_pair[1]).replace('\n', '')
            f.write("__label__{} {} {}\n".format(label_value, first, second))

Again, to use the function `.predict` (to get our results) we need the questions in the same format as before, but without the labels.

In [None]:
def questions_fasttext_format(questions):
    """Join each question pair into a single newline-free string.

    Args:
        questions: Iterable of pairs of strings (indexed with 0 and 1).

    Returns:
        A list with one string per pair: both questions separated by a
        space, with every newline character removed.
    """
    return [
        (pair[0] + ' ' + pair[1]).replace('\n', '')
        for pair in questions
    ]

This function computes the predictions and the probabilities (needed to compute the ROC metric) for each question in our test dataset, using the trained model.

In [None]:
def predict_fasttext(questions,loaded_model):
    """Predict the label and the probability of class 1 for every question.

    Args:
        questions: Iterable of already formatted question strings.
        loaded_model: Trained model whose ``predict(text)`` returns a
            ``(labels, probabilities)`` pair, best prediction first.

    Returns:
        A tuple ``(predicted_label, predicted_prob)`` of two lists. The
        probabilities always refer to class 1: when the predicted label is 0
        the reported probability is converted with ``1 - p``.
    """
    predicted_label = []
    predicted_prob = []

    for question in questions:
        aux_label, aux_prob = loaded_model.predict(question)

        # Labels look like "__label__1": keep everything after the last
        # "__" so the whole suffix is parsed, not just its final character.
        label = int(aux_label[0].rpartition("__")[2])

        # Clamp to [0, 1] so that a probability marginally above 1.0
        # (float rounding) cannot turn ``1 - p`` into a negative value.
        prob = min(max(float(aux_prob[0]), 0.0), 1.0)

        predicted_label.append(label)
        predicted_prob.append(1 - prob if label == 0 else prob)

    return (predicted_label, predicted_prob)

This function presents our results: it plots the confusion matrix, prints the classification report and returns the accuracy, precision, recall and F1-score.

In [None]:
def get_results(preds,probs, label):
    """Display the evaluation of ``preds`` against ``label`` and return summary metrics.

    Args:
        preds: Predicted labels.
        probs: Predicted probabilities; only forwarded to ``find_threshold``.
        label: Ground-truth labels.

    Returns:
        A dict with the keys 'Accuracy', 'Precision', 'Recall' and
        'F1-score'; the last three use weighted averaging.
    """
    # find_threshold is defined elsewhere (a function from David Íñiguez);
    # its return value is not used here.
    find_threshold(probs, label)

    # Plot the confusion matrix for the two classes
    matrix_plot = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(label, preds),
        display_labels=[0, 1],
    )
    matrix_plot.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.show()

    print(classification_report(label, preds))

    # Summary metrics, evaluated in the same order as before
    weighted = {'average': 'weighted'}
    acc = accuracy_score(label, preds)
    f1_weighted = f1_score(label, preds, **weighted)
    precision_weighted = precision_score(label, preds, **weighted)
    recall_weighted = recall_score(label, preds, **weighted)

    return {
        'Accuracy': acc,
        'Precision': precision_weighted,
        'Recall': recall_weighted,
        'F1-score': f1_weighted,
    }

## Unsupervised

In this last section we will combine the feature extractions (Alejandro Vara) obtained from the questions and the embeddings obtained from Fast Text.

This is a function analogous to *training_format_sup*, but now we don't need the labels anymore.

In [None]:
#def training_format_unsup(x, char):
def format_data(questions, output_file):
    """Write every question on its own line of ``output_file``.

    Args:
        questions: Iterable of questions; each one is rendered with
            ``str.format``.
        output_file: Path of the file to write (overwritten if it exists).
    """
    # Write UTF-8 explicitly so the output does not depend on the platform's
    # default locale encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines("{}\n".format(question) for question in questions)