In [1]:
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import logging
import numpy as np
import pandas as pd
import torch
from scipy.stats import pearsonr
from scipy import spatial

In [2]:
def pearson_corr(y_true, y_pred):
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [3]:
# Load pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

train_data = pd.read_csv('./data/train.csv')
val_data = pd.read_csv('./data/validation.csv')
test_data = pd.read_csv('./data/test.csv')

train_data['sentence1'] = train_data['sentence1'].apply(eval)
train_data['sentence2'] = train_data['sentence2'].apply(eval)
val_data['sentence1'] = val_data['sentence1'].apply(eval)
val_data['sentence2'] = val_data['sentence2'].apply(eval)
test_data['sentence1'] = test_data['sentence1'].apply(eval)
test_data['sentence2'] = test_data['sentence2'].apply(eval)

In [4]:
# train_dataset.examples
train_sentences1 = []
train_sentences2 = []
train_scores = []
val_sentences1 = []
val_sentences2 = []
val_scores = []
test_sentences1 = []
test_sentences2 = []
test_scores = []

for i in range(len(train_data['sentence1'])):
    sentence1 = train_data['sentence1'][i]
    sentence2 = train_data['sentence2'][i]
    score = train_data['score'][i]
    train_sentences1.append(sentence1)
    train_sentences2.append(sentence2)
    train_scores.append(score)

for i in range(len(val_data['sentence1'])):
    sentence1 = val_data['sentence1'][i]
    sentence2 = val_data['sentence2'][i]
    score = val_data['score'][i]
    val_sentences1.append(sentence1)
    val_sentences2.append(sentence2)
    val_scores.append(score)

for i in range(len(test_data['sentence1'])):
    sentence1 = test_data['sentence1'][i]
    sentence2 = test_data['sentence2'][i]
    score = test_data['score'][i]
    test_sentences1.append(sentence1)
    test_sentences2.append(sentence2)
    test_scores.append(score)

In [5]:
sentences1_train = [' '.join(sentence) for sentence in train_sentences1]
sentences2_train = [' '.join(sentence) for sentence in train_sentences2]
sentences1_val = [' '.join(sentence) for sentence in val_sentences1]
sentences2_val = [' '.join(sentence) for sentence in val_sentences2]
sentences1_test = [' '.join(sentence) for sentence in test_sentences1]
sentences2_test = [' '.join(sentence) for sentence in test_sentences2]

In [6]:
train_embeddings1 = model.encode(sentences1_train)
val_embeddings1 = model.encode(sentences1_val)
test_embeddings1 = model.encode(sentences1_test)

train_embeddings2 = model.encode(sentences2_train)
val_embeddings2 = model.encode(sentences2_val)
test_embeddings2 = model.encode(sentences2_test)

train_embeddings1, train_embeddings2

(array([[ 0.7273679 ,  0.5809589 , -0.10241333, ...,  0.15410018,
         -0.1584214 , -0.08950065],
        [-0.08956437, -0.48312497,  0.4839292 , ...,  0.5616502 ,
          0.26511225,  0.20353739],
        [-0.53974545,  0.65282625, -0.34588847, ..., -0.18636337,
          0.675012  ,  0.02623916],
        ...,
        [ 0.51400983,  0.32263702, -0.10364814, ..., -0.17705898,
         -0.10197908,  0.25872764],
        [-0.14184807,  0.25322706, -0.03215466, ...,  0.04245295,
          0.01343592, -0.28098705],
        [-0.37139413, -0.0188434 , -0.5944204 , ..., -0.20026802,
          0.26640326, -0.3896915 ]], dtype=float32),
 array([[ 0.5792457 ,  0.4986233 , -0.09275138, ...,  0.02616616,
         -0.42155725,  0.16717212],
        [-0.1793691 , -0.4859043 ,  0.0205935 , ...,  0.2684416 ,
         -0.02468947,  0.3557675 ],
        [-0.63112575,  0.66310066, -0.53988034, ..., -0.21829115,
          0.7638065 ,  0.14374717],
        ...,
        [-0.3908653 ,  0.46852383, -0.0

In [7]:
def get_sts_scores(emb1, emb2):
    y_pred = []
    for i in range(len(emb1)):
        sim_score = 1 - spatial.distance.cosine(emb1[i], emb2[i])
        y_pred.append((sim_score+1) * 2.5)
    return y_pred

In [8]:
train_scores_pred = get_sts_scores(train_embeddings1, train_embeddings2)
corr = pearson_corr(train_scores, train_scores_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.78


In [9]:
val_scores_pred = get_sts_scores(val_embeddings1, val_embeddings2)
corr = pearson_corr(val_scores, val_scores_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.82


In [10]:
test_scores_pred = get_sts_scores(test_embeddings1, test_embeddings2)
corr = pearson_corr(test_scores, test_scores_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.76


### Fine Tuning

In [11]:
train_examples = []
for i in range(len(sentences1_train)):
    train_examples.append(InputExample(texts=[train_embeddings1[i], train_embeddings2[i]], label=train_scores[i]))

val_examples = []
for i in range(len(sentences1_val)):
    val_examples.append(InputExample(texts=[val_embeddings1[i], val_embeddings2[i]], label=val_scores[i]))

test_examples = []
for i in range(len(sentences1_test)):
    test_examples.append(InputExample(texts=[test_embeddings1[i], test_embeddings2[i]], label=test_scores[i]))


train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
# train_loss = losses.CosineSimilarityLoss(model)
train_loss = losses.BatchAllTripletLoss(model=model)

In [12]:
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name='sts-validation')
# Fine-tune the model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

In [13]:
train_embeddings1_new = model.encode(sentences1_train)
val_embeddings1_new = model.encode(sentences1_val)
test_embeddings1_new = model.encode(sentences1_test)

train_embeddings2_new = model.encode(sentences2_train)
val_embeddings2_new = model.encode(sentences2_val)
test_embeddings2_new = model.encode(sentences2_test)

train_embeddings1_new, train_embeddings2_new

(array([[ 0.78093404,  0.53268516, -0.07050468, ...,  0.11919867,
         -0.20796065, -0.15117025],
        [-0.03922986, -0.5683854 ,  0.5405783 , ...,  0.6088767 ,
          0.3047607 ,  0.17288359],
        [-0.46371073,  0.6312734 , -0.30159017, ..., -0.24200507,
          0.68020475,  0.0210998 ],
        ...,
        [ 0.4366062 ,  0.27964896, -0.0187587 , ..., -0.16782968,
         -0.05177963,  0.17690502],
        [-0.11956879,  0.27475452,  0.06068354, ...,  0.03580553,
         -0.10370468, -0.2319527 ],
        [-0.30826476, -0.11164042, -0.53258145, ..., -0.22514361,
          0.2261619 , -0.40871456]], dtype=float32),
 array([[ 0.62934506,  0.48844576, -0.08590074, ..., -0.03293747,
         -0.46402526,  0.12816106],
        [-0.1644081 , -0.56664217,  0.02702678, ...,  0.26746607,
         -0.0028876 ,  0.34831822],
        [-0.55743027,  0.62024754, -0.4802386 , ..., -0.28644818,
          0.7474572 ,  0.13868213],
        ...,
        [-0.41019505,  0.35224232,  0.0

In [14]:
train_scores_pred_new = get_sts_scores(train_embeddings1_new, train_embeddings2_new)
corr = pearson_corr(train_scores, train_scores_pred_new)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.78


In [15]:
val_scores_pred_new = get_sts_scores(val_embeddings1_new, val_embeddings2_new)
corr = pearson_corr(val_scores, val_scores_pred_new)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.82


In [16]:
test_scores_pred_new = get_sts_scores(test_embeddings1_new, test_embeddings2_new)
corr = pearson_corr(test_scores, test_scores_pred_new)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.76


In [17]:
model.save('Sentence_Transformer_fine-tuned')