In [1]:
import sys
import torch
import logging
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

from scipy.stats import pearsonr
from gensim.models import KeyedVectors

logging.disable(logging.WARNING)
warnings.filterwarnings('ignore')
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Load all the data
trainpath = 'data/train-en-es.csv'
testpath = 'data/test-en-es.csv'
valpath = 'data/validation-en-es.csv'

traindata = pd.read_csv(trainpath)
testdata = pd.read_csv(testpath)
valdata = pd.read_csv(valpath)

Load the pretrained Word2Vec model <br />

In [4]:
es_model = KeyedVectors.load_word2vec_format("./data/SBW-vectors-300-min5.bin.gz", binary=True)
en_model = KeyedVectors.load_word2vec_format("/home/ishitbansal/Semester-6/INLP/Project/Advanced_Semantic_similarity-main/GoogleNews-vectors-negative300.bin", binary=True)

In [5]:
def get_sentence_embedding_es(sentence):
    words = sentence
    unk_token = "unk"
    words = [word if word in es_model.key_to_index else unk_token for word in words]
    if len(words) == 0:
        words = ["unk"]
    embeddings = [es_model[word] for word in words]
    embedding = np.mean(embeddings, axis=0)
    return embedding


def get_sentence_embedding_en(sentence):
    words = sentence
    unk_token = "unk"
    words = [word if word in en_model.key_to_index else unk_token for word in words]
    if len(words) == 0:
        words = ["unk"]
    embeddings = [en_model[word] for word in words]
    embedding = np.mean(embeddings, axis=0)
    return embedding

In [6]:
def pearson_corr(y_true, y_pred):
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [7]:
x_train1 = np.array([get_sentence_embedding_en(sentence) for sentence in traindata['sentence1']])
x_train2 = np.array([get_sentence_embedding_en(sentence) for sentence in traindata['sentence2']])
y_train = list(traindata['similarity_score'])

x_val1 = np.array([get_sentence_embedding_en(sentence) for sentence in valdata['sentence1']])
x_val2 = np.array([get_sentence_embedding_es(sentence) for sentence in valdata['sentence2']])
y_val = list(valdata['similarity_score'])

x_test1 = np.array([get_sentence_embedding_en(sentence) for sentence in testdata['sentence1']])
x_test2 = np.array([get_sentence_embedding_es(sentence) for sentence in testdata['sentence2']])
y_test = list(testdata['similarity_score'])

In [8]:
class SentenceSimilarityDataset(data.Dataset):
    def __init__(self, embeddings1, embeddings2, scores):
        self.embeddings1 = embeddings1
        self.embeddings2 = embeddings2
        self.scores = scores

    def __len__(self):
        return max(len(self.embeddings1), len(self.embeddings2))

    def __getitem__(self, index):
        return torch.tensor(self.embeddings1[index]), torch.tensor(self.embeddings2[index]), torch.tensor(self.scores[index], dtype=torch.float)


In [9]:
input_dim = 300 
hidden_dim = 150
lr = 0.001
num_epochs = 10
batch_size = 10

trainset = SentenceSimilarityDataset(x_train1, x_train2, y_train)
valset = SentenceSimilarityDataset(x_val1, x_val2, y_val)
testset = SentenceSimilarityDataset(x_test1, x_test2, y_test)

trainloader = data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = data.DataLoader(valset, batch_size=batch_size, shuffle=True)
testloader = data.DataLoader(testset, batch_size=10, shuffle=True)


In [10]:
class BiLSTMRegression(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, dropout_prob):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bilstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        x = x.view(len(x), 1, -1)
        h0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        out, _ = self.bilstm(x, (h0, c0))
        out = self.dropout(out) 
        out = self.fc(out[:, -1, :])
        return out

    

class GRURegression(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        x = x.view(len(x), 1, -1)
        h0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])
        return out


class BiLSTMAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, attention_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.attention_dim = attention_dim
        self.bilstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.attention = SelfAttention(hidden_dim*2, attention_dim, 1)
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        x = x.view(len(x), 1, -1)
        h0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        out, _ = self.bilstm(x, (h0, c0))
        att_weights = self.attention(out)
        out = torch.sum(out * att_weights, dim=1)
        out = self.fc(out)
        return out


class SelfAttention(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SelfAttention, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size, bias=False)
        self.layer2 = nn.Linear(hidden_size, output_size, bias=False)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, attention_input):
        output = self.layer1(attention_input)
        output = torch.tanh(output)
        output = self.layer2(output)
        output = self.softmax(output)
        return output

In [11]:
def train(model, optimizer, num_epochs, train_dataloader,val_dataloader):
    train_losses = []
    val_losses = []
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for embeddings1_batch, embeddings2_batch, scores_batch in train_dataloader:
            optimizer.zero_grad()
            output = model(embeddings1_batch, embeddings2_batch)
            loss_fn = nn.MSELoss()
            loss = loss_fn(output.squeeze(), scores_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * len(embeddings1_batch)
        train_loss /= len(train_dataloader)
        train_losses.append(train_loss)

    # Evaluate the model on the validation set
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for val_embeddings1_batch, val_embeddings2_batch, val_scores_batch in val_dataloader:
                val_output = model(val_embeddings1_batch, val_embeddings2_batch)
                val_loss += loss_fn(val_output.squeeze(), val_scores_batch).item() * len(val_embeddings1_batch)
            val_loss /= len(val_dataloader)
            val_losses.append(val_loss)

        print('Epoch {} - Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch+1, train_loss, val_loss))
    return train_losses, val_losses

def evaluate(model, data_loader):
    y_pred_test = []
    model.eval()
    with torch.no_grad():
        for emb1, emb2, scores in data_loader:
            test_output = model(emb1, emb2)
            y_pred_test.extend(test_output.squeeze().tolist())
    return y_pred_test

def print_values(model, loader, y_true):
    y_pred = evaluate(model, loader)
    corr = pearson_corr(y_true, y_pred)
    print("Pearson correlation coefficient: {:.2f}".format(corr))

In [12]:
# Define the hyperparameters
input_dim = 300
hidden_dim = 150
lr = 0.001
num_epochs = 10
batch_size = 10

### Word2Vec using BiLSTM Regression

In [13]:
model = BiLSTMRegression(input_dim*2, hidden_dim, num_layers=2, dropout_prob = 0.3)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
train_losses, val_losses = train(model, optimizer, num_epochs, trainloader,valloader)

Epoch 1 - Training Loss: 23.1795, Validation Loss: 25.8372


Epoch 2 - Training Loss: 21.5389, Validation Loss: 24.1443
Epoch 3 - Training Loss: 21.3365, Validation Loss: 23.5341
Epoch 4 - Training Loss: 21.1835, Validation Loss: 23.1461
Epoch 5 - Training Loss: 21.0966, Validation Loss: 22.6942
Epoch 6 - Training Loss: 21.1763, Validation Loss: 22.5892
Epoch 7 - Training Loss: 21.1276, Validation Loss: 24.2814
Epoch 8 - Training Loss: 20.9908, Validation Loss: 24.8942
Epoch 9 - Training Loss: 20.9868, Validation Loss: 23.5566
Epoch 10 - Training Loss: 20.9327, Validation Loss: 22.9056


In [14]:
print_values(model, trainloader, y_train)
print_values(model, valloader, y_val)
print_values(model, testloader,  y_test)

Pearson correlation coefficient: -0.01
Pearson correlation coefficient: 0.02
Pearson correlation coefficient: -0.01


### Word2Vec using GRU Regression

In [15]:
model = GRURegression(input_dim*2, hidden_dim, num_layers=2)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
train_losses, val_losses = train(model, optimizer, num_epochs, trainloader,valloader)

Epoch 1 - Training Loss: 22.3771, Validation Loss: 24.1071
Epoch 2 - Training Loss: 21.3319, Validation Loss: 22.5483
Epoch 3 - Training Loss: 21.1420, Validation Loss: 23.0312
Epoch 4 - Training Loss: 21.0430, Validation Loss: 22.6348
Epoch 5 - Training Loss: 20.9786, Validation Loss: 23.8589
Epoch 6 - Training Loss: 20.8826, Validation Loss: 22.6966
Epoch 7 - Training Loss: 20.9325, Validation Loss: 22.6483
Epoch 8 - Training Loss: 20.8470, Validation Loss: 24.4998
Epoch 9 - Training Loss: 20.7392, Validation Loss: 23.0386
Epoch 10 - Training Loss: 20.7176, Validation Loss: 23.4805


In [16]:
print_values(model, trainloader, y_train)
print_values(model, valloader, y_val)
print_values(model, testloader,  y_test)

Pearson correlation coefficient: 0.01
Pearson correlation coefficient: 0.01
Pearson correlation coefficient: 0.06


### Word2Vec using BiLSTM Attention

In [17]:
model = BiLSTMAttention(input_dim*2, hidden_dim, num_layers=2, attention_dim=600)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
train_losses, val_losses = train(model, optimizer, num_epochs, trainloader,valloader)

Epoch 1 - Training Loss: 22.7185, Validation Loss: 24.5618
Epoch 2 - Training Loss: 21.4504, Validation Loss: 25.5685
Epoch 3 - Training Loss: 21.0165, Validation Loss: 22.6655
Epoch 4 - Training Loss: 21.0623, Validation Loss: 22.6399
Epoch 5 - Training Loss: 21.0307, Validation Loss: 22.5291
Epoch 6 - Training Loss: 20.7555, Validation Loss: 22.5205
Epoch 7 - Training Loss: 20.7062, Validation Loss: 23.3519
Epoch 8 - Training Loss: 20.8818, Validation Loss: 23.2409
Epoch 9 - Training Loss: 20.7222, Validation Loss: 23.3023
Epoch 10 - Training Loss: 21.0281, Validation Loss: 24.5475


In [18]:
print_values(model, trainloader, y_train)
print_values(model, valloader, y_val)
print_values(model, testloader,  y_test)

Pearson correlation coefficient: 0.00
Pearson correlation coefficient: -0.03
Pearson correlation coefficient: 0.03
