In [18]:
import sys
import torch
import logging
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

from scipy.stats import pearsonr
from gensim.models import KeyedVectors
from sklearn.linear_model import LinearRegression

logging.disable(logging.WARNING)
warnings.filterwarnings('ignore')
np.set_printoptions(threshold=sys.maxsize)

In [19]:
# Load all the data
trainpath = 'data/train.csv'
testpath = 'data/test.csv'
valpath = 'data/validation.csv'

traindata = pd.read_csv(trainpath)
testdata = pd.read_csv(testpath)
valdata = pd.read_csv(valpath)

Load the pretrained Word2Vec model <br />
Downloaded from [link](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g)

In [20]:
modelpath = "data/GoogleNews-vectors-negative300.bin"
model = KeyedVectors.load_word2vec_format(modelpath, binary=True)

In [21]:
def get_sentences_embedding(sentences):
    sentence_embedding=[]
    for sentence in sentences:
        words = sentence
        unk_token = "unk"
        words = [word if word in model.key_to_index else unk_token for word in words]
        if len(words) == 0:
            words = ["unk"]
        embeddings = [model[word] for word in words]
        embedding = np.mean(embeddings, axis=0)
        sentence_embedding.append(embedding)
    return np.array(sentence_embedding)

In [24]:
def pearson_corr(y_true, y_pred):
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [25]:
x_train1 = get_sentences_embedding(traindata['sentence1'].apply(eval))
x_train2 = get_sentences_embedding(traindata['sentence2'].apply(eval))
x_train = np.concatenate([x_train1, x_train2], axis=1)
y_train = list(traindata['score'])

x_val1 = get_sentences_embedding(valdata['sentence1'].apply(eval))
x_val2 = get_sentences_embedding(valdata['sentence2'].apply(eval))
x_val = np.concatenate([x_val1, x_val2], axis=1)
y_val = list(valdata['score'])

test_x1 = get_sentences_embedding(testdata['sentence1'].apply(eval))
test_x2 = get_sentences_embedding(testdata['sentence2'].apply(eval))
test_X = np.concatenate([test_x1, test_x2], axis=1)
y_test = list(testdata['score'])

### Word2Vec using Linear Regression

In [26]:
reg = LinearRegression().fit(x_train, y_train)

y_pred = reg.predict(x_train)
corr = pearson_corr(y_train, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

y_pred = reg.predict(x_val)
corr = pearson_corr(y_val, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

y_pred = reg.predict(test_X)
corr = pearson_corr(y_test, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.47
Pearson correlation coefficient: 0.16
Pearson correlation coefficient: 0.19


In [None]:
x_train1 = torch.tensor(x_train1, dtype=torch.float)
x_train2 = torch.tensor(x_train2, dtype=torch.float)
y_train = torch.tensor(y_train, dtype=torch.float)

x_val1 = torch.tensor(x_val1, dtype=torch.float)
x_val2 = torch.tensor(x_val2, dtype=torch.float)
y_val = torch.tensor(y_val, dtype=torch.float)

x_test1 = torch.tensor(test_x1, dtype=torch.float)
x_test2 = torch.tensor(test_x2, dtype=torch.float)
y_test = torch.tensor(y_test, dtype=torch.float)

In [27]:
class BiLSTMRegression(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, dropout_prob):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout_prob = dropout_prob
        self.bilstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        x = x.view(len(x), 1, -1)
        h0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        out, _ = self.bilstm(x, (h0, c0))
        out = self.dropout(out)
        out = self.fc(out[:, -1, :])
        return out



class GRURegression(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        x = x.view(len(x), 1, -1)
        h0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])
        return out



class BiLSTMAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, attention_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.attention_dim = attention_dim
        self.bilstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.attention = SelfAttention(hidden_dim*2, attention_dim, 1)
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        x = x.view(len(x), 1, -1)
        h0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        out, _ = self.bilstm(x, (h0, c0))
        att_weights = self.attention(out)
        out = torch.sum(out * att_weights, dim=1)
        out = self.fc(out)
        return out



class SelfAttention(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SelfAttention, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size, bias=False)
        self.layer2 = nn.Linear(hidden_size, output_size, bias=False)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, attention_input):
        output = self.layer1(attention_input)
        output = torch.tanh(output)
        output = self.layer2(output)
        output = self.softmax(output)
        return output

In [29]:
class MyDataset(data.Dataset):
    def __init__(self, embeds1, embeds2, scores):
        self.embeds1 = embeds1
        self.embeds2 = embeds2
        self.scores = scores

    def __len__(self):
        return len(self.embeds1)

    def __getitem__(self, index):
        return self.embeds1[index], self.embeds2[index], self.scores[index]

In [None]:
# Hyperparameters
input_dim = 300 
hidden_dim = 150
lr = 0.001
num_epochs = 10
batch_size = 10

trainset = MyDataset(x_train1, x_train2, y_train)
valset = MyDataset(x_val1, x_val2, y_val)
trainloader = data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = data.DataLoader(valset, batch_size=batch_size, shuffle=True)

In [30]:
def train(model, optimizer, num_epochs, trainloader, valloader):
    loss_fn = nn.MSELoss()
    train_losses = []
    val_losses = []
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for batch in trainloader:
            optimizer.zero_grad()
            embeds1, embeds2, scores = batch
            output = model(embeds1, embeds2)
            loss = loss_fn(output.squeeze(), scores)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * len(embeds1)
        train_loss /= len(trainloader)
        train_losses.append(train_loss)

    # Evaluate the model on the validation set
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in valloader:
                embeds1, embeds2, scores = batch
                val_output = model(embeds1, embeds2)
                val_loss += loss_fn(val_output.squeeze(), scores).item() * len(embeds1)
            val_loss /= len(valloader)
            val_losses.append(val_loss)

        print('Epoch {} - Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch+1, train_loss, val_loss))
    return train_losses, val_losses

def evaluate(model, embeds1, embeds2):
    val_output = model(embeds1, embeds2)
    val_output = val_output.data.numpy().flatten().tolist()
    return val_output

def print_values(model, x_1, x_2, y_true):
    y_pred = evaluate(model, x_1, x_2)
    corr = pearson_corr(y_true, y_pred)
    print("Pearson correlation coefficient: {:.2f}".format(corr))

### Word2Vec using GRU Regression

In [31]:
model = GRURegression(input_dim*2, hidden_dim, num_layers=2)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
train_losses, val_losses = train(model, optimizer, num_epochs, trainloader, valloader)

Epoch 1 - Training Loss: 21.4006, Validation Loss: 23.4083
Epoch 2 - Training Loss: 19.2976, Validation Loss: 22.5980
Epoch 3 - Training Loss: 18.6838, Validation Loss: 23.0561
Epoch 4 - Training Loss: 17.7664, Validation Loss: 22.5400
Epoch 5 - Training Loss: 16.5726, Validation Loss: 23.2552
Epoch 6 - Training Loss: 15.4372, Validation Loss: 21.5421
Epoch 7 - Training Loss: 14.0234, Validation Loss: 22.1058
Epoch 8 - Training Loss: 12.9557, Validation Loss: 20.8532


Epoch 9 - Training Loss: 11.8800, Validation Loss: 20.1100
Epoch 10 - Training Loss: 10.8930, Validation Loss: 19.3320


In [32]:
print_values(model, x_train1, x_train2, y_train)
print_values(model, x_val1, x_val2, y_val)
print_values(model, x_test1, x_test2, y_test)

Pearson correlation coefficient: 0.75
Pearson correlation coefficient: 0.40
Pearson correlation coefficient: 0.37


### Word2Vec using BiLSTM Regression

In [33]:
model = BiLSTMRegression(input_dim*2, hidden_dim, num_layers=2, dropout_prob = 0.3)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
train_losses, val_losses = train(model, optimizer, num_epochs, trainloader, valloader)

Epoch 1 - Training Loss: 22.6845, Validation Loss: 24.1329
Epoch 2 - Training Loss: 19.5565, Validation Loss: 22.0196
Epoch 3 - Training Loss: 18.7331, Validation Loss: 22.5787
Epoch 4 - Training Loss: 17.7128, Validation Loss: 21.8706
Epoch 5 - Training Loss: 16.3361, Validation Loss: 20.6676
Epoch 6 - Training Loss: 15.0035, Validation Loss: 22.2397
Epoch 7 - Training Loss: 13.8024, Validation Loss: 21.1904
Epoch 8 - Training Loss: 12.6910, Validation Loss: 21.2946
Epoch 9 - Training Loss: 11.4926, Validation Loss: 21.5128
Epoch 10 - Training Loss: 10.3919, Validation Loss: 21.1640


In [34]:
print_values(model, x_train1, x_train2, y_train)
print_values(model, x_val1, x_val2, y_val)
print_values(model, x_test1, x_test2, y_test)

Pearson correlation coefficient: 0.77
Pearson correlation coefficient: 0.36
Pearson correlation coefficient: 0.34


### Word2Vec using BiLSTM Attention

In [35]:
model = BiLSTMAttention(input_dim*2, hidden_dim, num_layers=2, attention_dim=600)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
train_losses, val_losses = train(model, optimizer, num_epochs, trainloader, valloader)

Epoch 1 - Training Loss: 21.8825, Validation Loss: 22.6637
Epoch 2 - Training Loss: 19.3532, Validation Loss: 22.6007
Epoch 3 - Training Loss: 18.4088, Validation Loss: 23.6965
Epoch 4 - Training Loss: 17.4539, Validation Loss: 21.3850
Epoch 5 - Training Loss: 16.2254, Validation Loss: 21.8735
Epoch 6 - Training Loss: 14.8872, Validation Loss: 21.9184
Epoch 7 - Training Loss: 13.6521, Validation Loss: 20.4485
Epoch 8 - Training Loss: 12.4625, Validation Loss: 20.7423
Epoch 9 - Training Loss: 11.4247, Validation Loss: 20.5441
Epoch 10 - Training Loss: 10.3129, Validation Loss: 21.0346


In [36]:
print_values(model, x_train1, x_train2, y_train)
print_values(model, x_val1, x_val2, y_val)
print_values(model, x_test1, x_test2, y_test)

Pearson correlation coefficient: 0.77
Pearson correlation coefficient: 0.37
Pearson correlation coefficient: 0.37
