In [3]:
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import logging
import numpy as np
import pandas as pd
import torch
from scipy.stats import pearsonr
from scipy import spatial

In [4]:
def pearson_corr(y_true, y_pred):
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [8]:
# Load pre-trained model

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

train_data = pd.read_csv('/content/drive/MyDrive/train-en-es.csv')
val_data = pd.read_csv('/content/drive/MyDrive/validation-en-es.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test-en-es.csv')

train_data['sentence1'] = train_data['sentence1'].apply(eval)
train_data['sentence2'] = train_data['sentence2'].apply(eval)
val_data['sentence1'] = val_data['sentence1'].apply(eval)
val_data['sentence2'] = val_data['sentence2'].apply(eval)
test_data['sentence1'] = test_data['sentence1'].apply(eval)
test_data['sentence2'] = test_data['sentence2'].apply(eval)

In [10]:
# train_dataset.examples
train_sentences1 = []
train_sentences2 = []
train_scores = []
val_sentences1 = []
val_sentences2 = []
val_scores = []
test_sentences1 = []
test_sentences2 = []
test_scores = []

for i in range(len(train_data['sentence1'])):
    sentence1 = train_data['sentence1'][i]
    sentence2 = train_data['sentence2'][i]
    score = train_data['similarity_score'][i]
    train_sentences1.append(sentence1)
    train_sentences2.append(sentence2)
    train_scores.append(score)

for i in range(len(val_data['sentence1'])):
    sentence1 = val_data['sentence1'][i]
    sentence2 = val_data['sentence2'][i]
    score = val_data['similarity_score'][i]
    val_sentences1.append(sentence1)
    val_sentences2.append(sentence2)
    val_scores.append(score)

for i in range(len(test_data['sentence1'])):
    sentence1 = test_data['sentence1'][i]
    sentence2 = test_data['sentence2'][i]
    score = test_data['similarity_score'][i]
    test_sentences1.append(sentence1)
    test_sentences2.append(sentence2)
    test_scores.append(score)

In [11]:
sentences1_train = [' '.join(sentence) for sentence in train_sentences1]
sentences2_train = [' '.join(sentence) for sentence in train_sentences2]
sentences1_val = [' '.join(sentence) for sentence in val_sentences1]
sentences2_val = [' '.join(sentence) for sentence in val_sentences2]
sentences1_test = [' '.join(sentence) for sentence in test_sentences1]
sentences2_test = [' '.join(sentence) for sentence in test_sentences2]

In [12]:
train_embeddings1 = model.encode(sentences1_train)
val_embeddings1 = model.encode(sentences1_val)
test_embeddings1 = model.encode(sentences1_test)

train_embeddings2 = model.encode(sentences2_train)
val_embeddings2 = model.encode(sentences2_val)
test_embeddings2 = model.encode(sentences2_test)

train_embeddings1, train_embeddings2

(array([[ 0.72736835,  0.58095926, -0.10241377, ...,  0.15410013,
         -0.158421  , -0.08950108],
        [-0.08956472, -0.483125  ,  0.48392937, ...,  0.5616503 ,
          0.26511204,  0.20353746],
        [-0.53974545,  0.65282625, -0.34588847, ..., -0.18636337,
          0.675012  ,  0.02623916],
        ...,
        [ 0.5140099 ,  0.32263708, -0.10364849, ..., -0.1770591 ,
         -0.10197934,  0.25872755],
        [-0.14184807,  0.25322706, -0.03215466, ...,  0.04245295,
          0.01343592, -0.28098705],
        [-0.37139413, -0.0188434 , -0.5944204 , ..., -0.20026802,
          0.26640326, -0.3896915 ]], dtype=float32),
 array([[ 0.35549068,  0.36532965, -0.17509872, ..., -0.05713743,
          0.03342357,  0.66291934],
        [-0.26169643,  0.66992396, -0.17582755, ..., -0.07969701,
         -0.14187792, -0.09293452],
        [-0.61321384,  0.7031321 , -0.3078763 , ...,  0.06372205,
          0.25859806,  0.19075784],
        ...,
        [-0.511744  ,  0.48447034, -0.6

In [13]:
def get_sts_scores(emb1, emb2):
    y_pred = []
    for i in range(len(emb1)):
        sim_score = 1 - spatial.distance.cosine(emb1[i], emb2[i])
        y_pred.append((sim_score+1) * 2.5)
    return y_pred

In [14]:
train_scores_pred = get_sts_scores(train_embeddings1, train_embeddings2)
corr = pearson_corr(train_scores, train_scores_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.07


In [15]:
val_scores_pred = get_sts_scores(val_embeddings1, val_embeddings2)
corr = pearson_corr(val_scores, val_scores_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.03


In [16]:
test_scores_pred = get_sts_scores(test_embeddings1, test_embeddings2)
corr = pearson_corr(test_scores, test_scores_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.10


### Fine Tuning

In [20]:
train_examples = []
for i in range(len(sentences1_train)):
    train_examples.append(InputExample(texts=[train_embeddings1[i], train_embeddings2[i]], label=train_scores[i]))

val_examples = []
for i in range(len(sentences1_val)):
    val_examples.append(InputExample(texts=[val_embeddings1[i], val_embeddings2[i]], label=val_scores[i]))

test_examples = []
for i in range(len(sentences1_test)):
    test_examples.append(InputExample(texts=[test_embeddings1[i], test_embeddings2[i]], label=test_scores[i]))


train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
# train_loss = losses.CosineSimilarityLoss(model)
train_loss = losses.BatchAllTripletLoss(model=model)

In [21]:
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name='sts-validation')
# Fine-tune the model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

In [22]:
train_embeddings1_new = model.encode(sentences1_train)
val_embeddings1_new = model.encode(sentences1_val)
test_embeddings1_new = model.encode(sentences1_test)

train_embeddings2_new = model.encode(sentences2_train)
val_embeddings2_new = model.encode(sentences2_val)
test_embeddings2_new = model.encode(sentences2_test)

train_embeddings1_new, train_embeddings2_new

(array([[ 0.7728581 ,  0.5034828 , -0.15835942, ...,  0.19553468,
         -0.18499167, -0.0984347 ],
        [-0.08323029, -0.5573397 ,  0.43658128, ...,  0.59005773,
          0.2782744 ,  0.25168827],
        [-0.52690345,  0.58797306, -0.42314515, ..., -0.13699782,
          0.71439135, -0.03186857],
        ...,
        [ 0.5922658 ,  0.27670616, -0.19626884, ..., -0.1670536 ,
         -0.15663414,  0.2515642 ],
        [-0.07743616,  0.27035046, -0.03615162, ...,  0.0795643 ,
          0.03971394, -0.31790376],
        [-0.39025328, -0.11267895, -0.68718314, ..., -0.15799937,
          0.20186956, -0.47602615]], dtype=float32),
 array([[ 0.34487268,  0.27802965, -0.29282162, ..., -0.04818962,
          0.02765996,  0.61724675],
        [-0.2821691 ,  0.58281523, -0.2992091 , ...,  0.01890866,
         -0.09783009, -0.09391684],
        [-0.61634773,  0.5499934 , -0.47230074, ...,  0.14843462,
          0.3163442 ,  0.13269414],
        ...,
        [-0.5102062 ,  0.44142073, -0.6

In [2]:
train_scores_pred_new = get_sts_scores(train_embeddings1_new, train_embeddings2_new)
corr = pearson_corr(train_scores, train_scores_pred_new)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.65


In [3]:
val_scores_pred_new = get_sts_scores(val_embeddings1_new, val_embeddings2_new)
corr = pearson_corr(val_scores, val_scores_pred_new)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.19


In [4]:
test_scores_pred_new = get_sts_scores(test_embeddings1_new, test_embeddings2_new)
corr = pearson_corr(test_scores, test_scores_pred_new)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.14


In [26]:
model.save('Sentence_Transformer_cross_lingual_fine-tuned')