<h1> Doc2Vec embeddings </h1>
<p>A Doc2Vec model is trained to generate representative sentence embeddings. The following approaches are then applied to these embeddings to produce STS (Semantic Textual Similarity) scores:</p>
<ul>
<li>Normalized cosine similarity score</li>
<li>BiLSTM regression neural network model</li>
<li>BiGRU regression neural network model</li>
</ul>

In [1]:
import ast
import logging
import sys
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as data
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy import spatial
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression

# Silence log records at WARNING level and below for the whole notebook.
logging.disable(logging.WARNING)
# Suppress every Python warning (equivalent to filterwarnings('ignore') with no extra filters).
warnings.simplefilter('ignore')
# Never summarise NumPy arrays with "..." when printing them.
np.set_printoptions(threshold=sys.maxsize)

In [2]:
def pearson_corr(y_true, y_pred):
    """Return the Pearson correlation coefficient between two sequences.

    Args:
        y_true: reference values.
        y_pred: predicted values, same length as ``y_true``.

    Returns:
        The correlation coefficient; the p-value that ``pearsonr`` also
        computes is discarded.
    """
    # pearsonr yields (coefficient, p-value); only the coefficient is needed.
    return pearsonr(y_true, y_pred)[0]

## Mono-Lingual Semantic Similarity

In [3]:
train_data = pd.read_csv('./data/train.csv')
val_data = pd.read_csv('./data/validation.csv')
test_data = pd.read_csv('./data/test.csv')

# The sentence columns hold token lists serialised as strings
# (e.g. "['plane', 'take']"). Parse them with ast.literal_eval, which only
# accepts Python literals, instead of eval, which would execute any
# expression embedded in the CSV files.
for _split in (train_data, val_data, test_data):
    for _column in ('sentence1', 'sentence2'):
        _split[_column] = _split[_column].apply(ast.literal_eval)

In [4]:
# Pool both sides of every training pair into a single list of token lists.
total_sents_unk = [*train_data['sentence1'], *train_data['sentence2']]
total_sents_unk

[['plane', 'take'],
 ['man', 'play', 'larg', 'flute'],
 ['man', 'spread', 'shrede', 'chees', 'pizza'],
 ['three', 'men', 'play', 'chess'],
 ['man', 'play', 'cello'],
 ['men', 'fight'],
 ['man', 'smoke'],
 ['man', 'play', 'piano'],
 ['man', 'play', 'guitar', 'sing'],
 ['person', 'throw', 'cat', 'ceil'],
 ['man', 'hit', 'man', 'stick'],
 ['woman', 'pick', 'hold', 'babi', 'kangaroo'],
 ['man', 'play', 'flute'],
 ['person', 'fold', 'piec', 'paper'],
 ['man', 'run', 'road'],
 ['dog', 'tri', 'get', 'bacon', 'back'],
 ['polar', 'bear', 'slide', 'snow'],
 ['woman', 'write'],
 ['cat', 'rub', 'babi', 'face'],
 ['man', 'ride', 'hors'],
 ['man', 'pour', 'oil', 'pot'],
 ['man', 'play', 'guitar'],
 ['panda', 'slide', 'slide'],
 ['woman', 'eat', 'someth'],
 ['woman', 'peel', 'potato'],
 ['boy', 'fell', 'bike'],
 ['woman', 'play', 'flute'],
 ['rabbit', 'run', 'eagl'],
 ['woman', 'fri', 'bread', 'pork', 'chop'],
 ['girl', 'fli', 'kite'],
 ['man', 'ride', 'mechan', 'bull'],
 ['man', 'play', 'guitar'],
 

In [5]:
# Wrap each token list in a TaggedDocument whose single tag is its position in the corpus.
documents = [
    TaggedDocument(tokens, [tag])
    for tag, tokens in enumerate(total_sents_unk)
]

In [7]:
# Train Doc2Vec on the tagged training sentences.
model = Doc2Vec(
    documents,
    vector_size=25,   # embedding width
    window=6,
    min_count=1,      # keep every token, however rare
    workers=1,
    epochs=30,
    alpha=0.1,        # initial learning rate ...
    min_alpha=0.001,  # ... decayed down to this value
    hs=1,             # hierarchical softmax
)

In [8]:
# Fix the model's RNG state before inferring vectors.
model.random.seed(42)
# Replace each token list in the training frame with its inferred Doc2Vec vector.
train_data['sentence1'] = train_data['sentence1'].apply(model.infer_vector)
train_data['sentence2'] = train_data['sentence2'].apply(model.infer_vector)

In [9]:
y_train = train_data['score']
# Cosine similarity lies in [-1, 1]; shifting by 1 and scaling by 2.5 maps it onto [0, 5].
y_pred = train_data.apply(
    lambda row: ((1 - spatial.distance.cosine(row['sentence1'], row['sentence2'])) + 1) * 2.5,
    axis=1,
)
corr = pearson_corr(y_train, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.50


In [11]:
# Replace each token list in the validation frame with its inferred Doc2Vec vector.
val_data['sentence1'] = val_data['sentence1'].apply(model.infer_vector)
val_data['sentence2'] = val_data['sentence2'].apply(model.infer_vector)

In [12]:
# Validation-set baseline. The reference scores are stored in y_val
# (not y_train) so the name matches the data it holds.
y_val = val_data['score']
# Cosine similarity lies in [-1, 1]; shifting by 1 and scaling by 2.5 maps it onto [0, 5].
y_pred = val_data.apply(
    lambda row: ((1 - spatial.distance.cosine(row['sentence1'], row['sentence2'])) + 1) * 2.5,
    axis=1,
)
corr = pearson_corr(y_val, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.61


In [13]:
# Replace each token list in the test frame with its inferred Doc2Vec vector.
test_data['sentence1'] = test_data['sentence1'].apply(model.infer_vector)
test_data['sentence2'] = test_data['sentence2'].apply(model.infer_vector)

In [14]:
# Test-set baseline. The reference scores are stored in y_test
# (not y_train) so the name matches the data it holds.
y_test = test_data['score']
# Cosine similarity lies in [-1, 1]; shifting by 1 and scaling by 2.5 maps it onto [0, 5].
y_pred = test_data.apply(
    lambda row: ((1 - spatial.distance.cosine(row['sentence1'], row['sentence2'])) + 1) * 2.5,
    axis=1,
)
corr = pearson_corr(y_test, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.54


### BiLSTM Model

In [15]:
# Each sentence column holds one NumPy vector per row. Stack the rows into a
# single 2-D array before handing them to PyTorch: building a tensor from a
# Python list of ndarrays converts element by element and is much slower.
train_embeddings1 = torch.tensor(np.stack(train_data['sentence1'].to_list()), dtype=torch.float32)
train_embeddings2 = torch.tensor(np.stack(train_data['sentence2'].to_list()), dtype=torch.float32)
train_score = torch.tensor(train_data['score'].to_numpy(), dtype=torch.float32)

val_embeddings1 = torch.tensor(np.stack(val_data['sentence1'].to_list()), dtype=torch.float32)
val_embeddings2 = torch.tensor(np.stack(val_data['sentence2'].to_list()), dtype=torch.float32)
val_score = torch.tensor(val_data['score'].to_numpy(), dtype=torch.float32)

test_embeddings1 = torch.tensor(np.stack(test_data['sentence1'].to_list()), dtype=torch.float32)
test_embeddings2 = torch.tensor(np.stack(test_data['sentence2'].to_list()), dtype=torch.float32)
test_score = torch.tensor(test_data['score'].to_numpy(), dtype=torch.float32)

In [19]:
# Network sizes: width of one sentence embedding and of the recurrent hidden state.
input_dim, hidden_dim = 25, 25
# Optimiser learning rate.
lr = 1e-3
# Training schedule: passes over the data and samples per mini-batch.
num_epochs, batch_size = 10, 10

In [17]:
class BiLSTMRegression(nn.Module):
    """Bidirectional LSTM followed by a linear layer producing one value per sample.

    Args:
        input_dim: feature width fed to the LSTM, i.e. the width of the two
            inputs of ``forward`` after they are concatenated.
        hidden_dim: hidden-state width of each LSTM direction.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bilstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward hidden states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x1, x2):
        """Concatenate ``x1`` and ``x2`` along dim 1 and regress one value per row.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        batch = x1.shape[0]
        # Treat each concatenated pair as a sequence of length one.
        seq = torch.cat((x1, x2), dim=1).view(batch, 1, -1)
        # Explicit zero initial states: one per layer and direction.
        state_shape = (self.num_layers * 2, batch, self.hidden_dim)
        h0 = torch.zeros(state_shape, device=seq.device)
        c0 = torch.zeros(state_shape, device=seq.device)
        lstm_out, _ = self.bilstm(seq, (h0, c0))
        # Feed the last (and only) time step to the regression head.
        return self.fc(lstm_out[:, -1, :])

In [20]:
# Seed PyTorch's global RNG so that weight initialisation (and the DataLoader
# shuffling that draws from the same RNG) is repeatable across runs.
torch.manual_seed(42)

# NOTE: this rebinds `model`, which until here referred to the Doc2Vec model.
# The network receives both sentence embeddings concatenated, hence input_dim*2.
model = BiLSTMRegression(input_dim*2, hidden_dim, num_layers=2)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.MSELoss()

class SentenceSimilarityDataset(data.Dataset):
    """Dataset of (embedding1, embedding2, score) triples for sentence pairs.

    Args:
        embeddings1: indexable collection with one entry per first sentence.
        embeddings2: indexable collection with one entry per second sentence.
        scores: indexable collection with one target score per pair.

    Raises:
        ValueError: if the three inputs do not have the same length.
    """

    def __init__(self, embeddings1, embeddings2, scores):
        # __len__ is derived from embeddings1 alone, so a length mismatch would
        # otherwise surface only later as an index error or as silently dropped rows.
        if not (len(embeddings1) == len(embeddings2) == len(scores)):
            raise ValueError(
                "embeddings1, embeddings2 and scores must have the same length, got "
                f"{len(embeddings1)}, {len(embeddings2)} and {len(scores)}"
            )
        self.embeddings1 = embeddings1
        self.embeddings2 = embeddings2
        self.scores = scores

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.embeddings1)

    def __getitem__(self, index):
        """Return the ``(embedding1, embedding2, score)`` triple at ``index``."""
        return self.embeddings1[index], self.embeddings2[index], self.scores[index]
    
# Wrap the embedding/score tensors so they can be batched.
train_dataset = SentenceSimilarityDataset(
    train_embeddings1, train_embeddings2, train_score
)
val_dataset = SentenceSimilarityDataset(
    val_embeddings1, val_embeddings2, val_score
)

# Training batches are reshuffled every epoch; validation order is kept fixed.
train_dataloader = data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_dataloader = data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
)


for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for embeddings1_batch, embeddings2_batch, scores_batch in train_dataloader:
        optimizer.zero_grad()
        output = model(embeddings1_batch, embeddings2_batch)
        # squeeze(-1) removes only the trailing size-1 output dimension, so a
        # final batch holding a single sample keeps its batch dimension and
        # still matches the shape of scores_batch.
        loss = criterion(output.squeeze(-1), scores_batch)
        loss.backward()
        optimizer.step()
        # The criterion averages over the batch; weight by batch size so the
        # epoch figure is a per-sample mean even when the last batch is short.
        train_loss += loss.item() * len(embeddings1_batch)
    train_loss /= len(train_embeddings1)

    # ---- validation pass ----
    # Switch to inference mode and disable autograd while scoring validation data.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for embeddings1_batch, embeddings2_batch, scores_batch in val_dataloader:
            val_output = model(embeddings1_batch, embeddings2_batch)
            val_loss += criterion(val_output.squeeze(-1), scores_batch).item() * len(embeddings1_batch)
    val_loss /= len(val_embeddings1)

    print(f"Epoch = {epoch}\tTraining Loss = {train_loss}\tValidation Loss = {val_loss}")

Epoch = 0	Training Loss = 2.7887633158489193	Validation Loss = 2.1403739698727926
Epoch = 1	Training Loss = 1.8022376451104138	Validation Loss = 2.027794901529948
Epoch = 2	Training Loss = 1.5637056298060177	Validation Loss = 1.9546881838639578
Epoch = 3	Training Loss = 1.3493448697311772	Validation Loss = 1.962295016447703
Epoch = 4	Training Loss = 1.176512693001677	Validation Loss = 2.11602485537529
Epoch = 5	Training Loss = 1.0422908059538583	Validation Loss = 2.134304936329524
Epoch = 6	Training Loss = 0.9178078691271787	Validation Loss = 2.202509037653605
Epoch = 7	Training Loss = 0.8183255956875467	Validation Loss = 2.349531631072362
Epoch = 8	Training Loss = 0.729214679673353	Validation Loss = 2.4347394144535066
Epoch = 9	Training Loss = 0.654054634225101	Validation Loss = 2.5370150164763134


In [24]:
model.eval()
# Inference only: skip autograd bookkeeping for the full training set.
with torch.no_grad():
    output = model(train_embeddings1, train_embeddings2)
# Drop the trailing size-1 dimension to get one prediction per sentence pair.
y_pred = output.squeeze(-1).tolist()
y_train = train_score
corr = pearson_corr(y_train, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.87


In [25]:
model.eval()
# Inference only: skip autograd bookkeeping for the full validation set.
with torch.no_grad():
    output = model(val_embeddings1, val_embeddings2)
# Drop the trailing size-1 dimension to get one prediction per sentence pair.
y_pred = output.squeeze(-1).tolist()
y_val = val_score
corr = pearson_corr(y_val, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.38


In [26]:
model.eval()
# Inference only: skip autograd bookkeeping for the full test set.
with torch.no_grad():
    output = model(test_embeddings1, test_embeddings2)
# Drop the trailing size-1 dimension to get one prediction per sentence pair.
y_pred = output.squeeze(-1).tolist()
y_test = test_score
corr = pearson_corr(y_test, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.34
