In [1]:
# pip install torch
# pip install gensim
# pip install pandas


# preprocessing

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import argparse
import random
random.seed(577)

import numpy as np
np.random.seed(577)

import torch
torch.set_default_tensor_type(torch.FloatTensor)
torch.use_deterministic_algorithms(True)
torch.manual_seed(577)
torch_device = torch.device("cpu")

'''
NOTE: Do not change any of the statements above regarding random/numpy/pytorch.
You can import other built-in libraries (e.g. collections) or pre-specified external libraries
such as pandas, nltk and gensim below. 
Also, if you'd like to declare some helper functions, please do so in utils.py and
change the last import statement below.
'''
from torch import nn
from torch import optim
from torch import tensor

import gensim.downloader as api

from torch.utils.data import Subset

# from neural_archs import DAN, RNN, LSTM
# from utils import WiCDataset, sen2vec, sen2glove



In [2]:
# from utils.py
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from torch import tensor


import nltk
# import tempfile
#
# nltk.download('punkt', download_dir=tempfile.gettempdir())
# nltk.download('averaged_perceptron_tagger', download_dir=tempfile.gettempdir())
# nltk.download('tagsets', download_dir=tempfile.gettempdir())
# nltk.data.path.append(tempfile.gettempdir())

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

from nltk.data import load
paras = load('taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')

from nltk import pos_tag

ss = 'NN NNS NNP NNPS VB VBD VBG VBN VBP VBZ JJ JJR JJS RBR BR RBS WDT WP WP$ PRP PRP$ DT CD UH SYM FW  LS'

selected_tags = ss.split()
pos_dict = {}
i = 0
for pos in selected_tags:
    pos_dict[pos] = i
    i += 1
n = len(pos_dict)

# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
class WiCDataset(Dataset):

    def __init__(self, mode='train', root_dir='./WiC_dataset/'):

        self.root_dir = root_dir
        self.mode = mode

        file_name = root_dir + self.mode + '/'+ self.mode+'.data.txt'
        with open(file_name,'r', encoding = 'cp850') as file:    
            self.data = file.readlines()
           
        file_name = root_dir + self.mode + '/'+ self.mode +'.gold.txt'
        with open(file_name,'r', encoding = 'cp850') as file:
            self.labels = file.readlines()

    def __len__(self):
        return len(self.labels)
        

    def __getitem__(self, idx):

        line = self.data[idx]
        sample = {}
        parts = line.replace("\n","\t").strip().split("\t")
        sample['word'] = parts[0]

        if parts[1] == "F":
            sample['label'] = False
        else:
            sample['label'] = True

        sample['sentence1'] = parts[3]
        sample['sentence2'] = parts[4]

        idxs = parts[2].split('-')
        sample['idx1'] = idxs[0]
        sample['idx2'] = idxs[1]


        line = self.labels[idx]
        if line.split()[0] == 'F':
            sample['label'] = False
        else:
            sample['label'] = True

        return sample

    def get_vocab(self):  
        vocab = {"<UNK>":0}
        for line in self.data:
            parts = line.replace("\n","\t").strip().split("\t")

            #create vocabulary from all unique words in all sentences
            sentence = parts[3] + " " + parts[4]
            words = sentence.replace("'s","").lower().split()
            #add if not already in vocab
            for word in words:
                if word not in vocab:
                    #add word to vocab dict
                    vocab[word] = len(vocab)
        return vocab,len(vocab)
    


def sen2vec(s,vocab):
    v = []
    words = s.replace("'s","").lower().split()
    for word in words:
        try:
            v.append(vocab[word])
        except:
            v.append(vocab["<UNK>"])
    return tensor(v).unsqueeze(0)


def sen2glove(s,glove_embs):
    v = []
    words = s.replace("'s","").lower().split()
    for word in words:
        try:
            v.append(glove_embs.get_index(word, default=None))
        except:
            v.append(40000)
    return tensor(v).unsqueeze(0)

def sen2pos(s,pos_dict):
    v = []
    words = s.replace("'s","").lower().split()
    pos_tags = pos_tag(words)

    for word, pos in pos_tags:
        if pos in pos_dict.keys():
            embed = np.zeros((n,), dtype=np.float32)
            embed[pos_dict[pos]]=1
        else:
            # print(pos)
            embed = np.zeros((n,), dtype=np.float32)

        v.append(embed)
    return tensor(np.array(v)).unsqueeze(0)

[nltk_data] Downloading package punkt to
[nltk_data]     \\nas01.itap.purdue.edu\puhome\ecn.data\bad\nltk_data.
[nltk_data]     ..
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     \\nas01.itap.purdue.edu\puhome\ecn.data\bad\nltk_data.
[nltk_data]     ..
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     \\nas01.itap.purdue.edu\puhome\ecn.data\bad\nltk_data.
[nltk_data]     ..
[nltk_data]   Package tagsets is already up-to-date!


In [3]:
init_word_embs = 'glove'
neural_arch = 'lstm'
rnn_bidirect = True

In [4]:
if init_word_embs == "glove":
    # TODO: Feed the GloVe embeddings to NN modules appropriately
    # for initializing the embeddings
    glove_embs = api.load("glove-wiki-gigaword-50")
    all_weights = glove_embs.get_normed_vectors()
    avg_wegihts = np.mean(all_weights,axis=0)
    update_weights = np.vstack((all_weights,avg_wegihts))
    weights = torch.FloatTensor(update_weights)
else:
    # vocab size is 7459 based on experiment
    weights = torch.FloatTensor(np.random.rand(7459, 50))

In [5]:
# TODO: Read off the WiC dataset files from the `WiC_dataset' directory
# (will be located in /homes/cs577/WiC_dataset/(train, dev, test))
# and initialize PyTorch dataloader appropriately
# Take a look at this page
# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
# and implement a PyTorch Dataset class for the WiC dataset in
# utils.py
root = 'D:/OneDrive - purdue.edu/Courses/CS577_NLP/hw/hw2/WiC_dataset/'
train_data = WiCDataset(root_dir=root)
vocab, _ = train_data.get_vocab()

test_data = WiCDataset('test',root_dir=root)
dev_data = WiCDataset('dev',root_dir=root)

# LSTM baseline

In [6]:
class LSTM(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)

        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0)



In [8]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw = model(s1,s2)

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw = model(s1,s2)

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw = model(s1,s2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc



In [15]:
lr0 = 0.001
epochs = 20
model = LSTM(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc = training(model,train_data, dev_data,lr0,epochs)
train_acc[-1]

---------epoch: 0 ---------
 train loss  0.6838354168345846
 val loss  0.6998994641916879
 train accuracy  0.5315033161385408
 val accuracy  0.5156739811912225
------Early stopping after epoch: 6 ---------
 train loss  0.6512916694253178
 val loss  0.7083322172254605
 train accuracy  0.6000368459837878
 val accuracy  0.5376175548589341


0.6000368459837878

In [None]:
def LSTM_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM(*inputs).to(torch_device)
        
        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc = training(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [None]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6884259130173556
 val loss  0.7244971490675404
 train accuracy  0.5261340087497122
 val accuracy  0.4009216589861751
---------epoch: 10 ---------
 train loss  0.6305356905149091
 val loss  0.7261966828377017
 train accuracy  0.6419525673497583
 val accuracy  0.5216589861751152
------Early stopping after epoch: 10 ---------
 train loss  0.6305356905149091
 val loss  0.7261966828377017
 train accuracy  0.6419525673497583
 val accuracy  0.5216589861751152
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6859285730989524
 val loss  0.7075609901533698
 train accuracy  0.5330416762606494
 val accuracy  0.4894009216589862
------Early stopping after epoch: 5 ---------
 train loss  0.6529455288143277
 val loss  0.7144673571608583
 train accuracy  0.6097167856320516
 val accuracy  0.5207373271889401
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6853642333028724
 val loss  0.6988290813112039
 train accur

In [79]:
# plt.plot(train_loss,label = "train_loss")
# plt.plot(val_loss,label = "val_loss")
# plt.xlabel('epoch')
# plt.ylabel('loss')
# plt.ylim(bottom = 0)
# plt.legend()

In [80]:
# plt.plot(train_acc,label = "train_acc")
# plt.plot(dev_acc,label = "val_acc")
# plt.xlabel('epoch')
# plt.ylabel('accuracy')
# plt.ylim(bottom = 0,top = 1)

# plt.legend()

# 1-layer hidden attention

In [41]:
class LSTM_attn(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        ws1 = self.ws(lstm_out1)
        # print('ws1',ws1.shape)

        similar1 = torch.bmm(ws1.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))
        # print('similar1',similar1.shape)
        
        attn1 = self.softmax(torch.squeeze(similar1))
        # print('attn1',attn1.shape)

        context1 = torch.squeeze(torch.bmm(attn1.unsqueeze(0), lstm_out1.unsqueeze(0)))
        # print('context1',context1.shape)

        # condense1 = self.condense(context1)
        # print('condense1',condense1.shape)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        ws2 = self.ws(lstm_out2)
        similar2 = torch.bmm(ws2.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))    
        attn2 = self.softmax(torch.squeeze(similar2))
        context2 = torch.squeeze(torch.bmm(attn2.unsqueeze(0), lstm_out2.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)

        cat_rep = torch.cat((context1[-1,:].unsqueeze(0), context2[-1,:].unsqueeze(0)),1)
        ratio1, ratio2 = attn1[-1,:], attn2[-1,:]

        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [42]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [30]:
lr0 = 0.001
epochs = 20
model = LSTM_attn(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6818771193637159
 val loss  0.6949111316645034
 train accuracy  0.5523212969786293
 val accuracy  0.5172413793103449
---------epoch: 10 ---------
 train loss  0.5850144613289886
 val loss  0.735172953351538
 train accuracy  0.6825718496683861
 val accuracy  0.5579937304075235
------Early stopping after epoch: 10 ---------
 train loss  0.5850144613289886
 val loss  0.735172953351538
 train accuracy  0.6825718496683861
 val accuracy  0.5579937304075235


0.5579937304075235

In [31]:
dev_data[0]['word']

'board'

In [32]:
ratios1[0]

tensor([0.1769, 0.2491, 0.2834, 0.2905], grad_fn=<SliceBackward0>)

In [33]:
dev_data[0]['sentence1']

'Room and board .'

In [34]:
ratios2[0]

tensor([0.1628, 0.6319, 0.1563, 0.0234, 0.0101, 0.0098, 0.0056],
       grad_fn=<SliceBackward0>)

In [35]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [45]:
def LSTM_attn_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [46]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6853163945573336
 val loss  0.7199211999567973
 train accuracy  0.5535344232097629
 val accuracy  0.46912442396313364
---------epoch: 10 ---------
 train loss  0.6048165465619963
 val loss  0.7207437963529666
 train accuracy  0.6797144830762146
 val accuracy  0.5465437788018433
------Early stopping after epoch: 10 ---------
 train loss  0.6048165465619963
 val loss  0.7207437963529666
 train accuracy  0.6797144830762146
 val accuracy  0.5465437788018433
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6816294382663194
 val loss  0.7032380134828629
 train accuracy  0.5447847110292424
 val accuracy  0.5105990783410138
------Early stopping after epoch: 9 ---------
 train loss  0.6014142617250461
 val loss  0.7172439786146313
 train accuracy  0.6509325351139765
 val accuracy  0.5548387096774193
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6804142441860465
 val loss  0.6957866615963422
 train accu

# 1-layer target attention

In [6]:
class LSTM_attn_target(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn_target, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        ws1 = self.ws(lstm_out1)
        # print('ws1',ws1.shape)

        similar1 = torch.bmm(ws1.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))
        # print('similar1',similar1.shape)
        
        attn1 = self.softmax(torch.squeeze(similar1))
        # print('attn1',attn1.shape)

        context1 = torch.squeeze(torch.bmm(attn1.unsqueeze(0), lstm_out1.unsqueeze(0)))
        # print('context1',context1.shape)

        # condense1 = self.condense(context1)
        # print('condense1',condense1.shape)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        ws2 = self.ws(lstm_out2)
        similar2 = torch.bmm(ws2.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))    
        attn2 = self.softmax(torch.squeeze(similar2))
        context2 = torch.squeeze(torch.bmm(attn2.unsqueeze(0), lstm_out2.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        try:
            cat_rep = torch.cat((context1[idx1,:].unsqueeze(0), context2[idx2,:].unsqueeze(0)),1)
            ratio1, ratio2 = attn1[idx1,:], attn2[idx2,:]
        except:
            cat_rep = torch.cat((context1[-1,:].unsqueeze(0), context2[-1,:].unsqueeze(0)),1)
            ratio1, ratio2 = attn1[-1,:], attn2[-1,:]
            # in total of 8 wrong indexing samples
            # print('wrong index')



        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [7]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [8]:
lr0 = 0.001
epochs = 20
model = LSTM_attn_target(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6828579097964259
 val loss  0.6974765215547855
 train accuracy  0.5418201915991157
 val accuracy  0.5156739811912225
------Early stopping after epoch: 6 ---------
 train loss  0.6168415741439066
 val loss  0.7245336072198276
 train accuracy  0.6573323507737656
 val accuracy  0.5470219435736677


0.5470219435736677

In [11]:
wrong_idx = []
wrong_ratio1 = []
wrong_ratio2 = []

for i in range(len(train_data)):
    sample = train_data[i]

    # a) calculate probs / get an output
    if init_word_embs == "glove":
        s1 = sen2glove(sample["sentence1"],glove_embs)
        s2 = sen2glove(sample["sentence2"],glove_embs)
    else:
        s1 = sen2vec(sample["sentence1"])
        s2 = sen2vec(sample["sentence2"])

    
    y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))



    result = True if y_raw >= 0.5 else False

    if bool(result) != sample["label"]:
        wrong_idx.append(i)
        wrong_ratio1.append(ratio1)
        wrong_ratio2.append(ratio2)
        

In [10]:
i = 0
print(i)
sample = train_data[wrong_idx[i]]
print(sample["label"])
print(sample["sentence2"])
print(wrong_ratio1[i])
print(sample['word'])
print(sample['sentence1'])
print(wrong_ratio2[i])


0
False
Do you think the sofa will go through the door ?
tensor([0.2125, 0.1916, 0.1303, 0.1023, 0.1120, 0.1073, 0.1439],
       grad_fn=<SliceBackward0>)
go
Messages must go through diplomatic channels .
tensor([0.0855, 0.2059, 0.1595, 0.0445, 0.3542, 0.0501, 0.0507, 0.0046, 0.0025,
        0.0010, 0.0415], grad_fn=<SliceBackward0>)


In [50]:
dev_data[0]['word']

'board'

In [51]:
ratios1[0]

tensor([0.2276, 0.2472, 0.2793, 0.2459], grad_fn=<SliceBackward0>)

In [52]:
dev_data[0]['sentence1']

'Room and board .'

In [53]:
ratios2[0]

tensor([0.0351, 0.3365, 0.1489, 0.1453, 0.1200, 0.1672, 0.0469],
       grad_fn=<SliceBackward0>)

In [54]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [55]:
def LSTM_attn_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn_target(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [56]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6856035394600507
 val loss  0.7217603551627304
 train accuracy  0.5493898227032006
 val accuracy  0.44608294930875575
------Early stopping after epoch: 9 ---------
 train loss  0.6168800053606378
 val loss  0.7266809700820852
 train accuracy  0.6437946120193415
 val accuracy  0.5124423963133641
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6807003210086634
 val loss  0.7029710901497696
 train accuracy  0.5415611328574718
 val accuracy  0.4967741935483871
---------epoch: 10 ---------
 train loss  0.577672752399695
 val loss  0.7263838948192685
 train accuracy  0.6792539719088188
 val accuracy  0.5714285714285714
------Early stopping after epoch: 10 ---------
 train loss  0.577672752399695
 val loss  0.7263838948192685
 train accuracy  0.6792539719088188
 val accuracy  0.5714285714285714
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6852901422727665
 val loss  0.6963813184043779
 train accura

# 1-layer attention + mean states

In [57]:
class LSTM_attn_mean(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn_mean, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        ws1 = self.ws(lstm_out1)
        # print('ws1',ws1.shape)

        similar1 = torch.bmm(ws1.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))
        # print('similar1',similar1.shape)
        
        attn1 = self.softmax(torch.squeeze(similar1))
        # print('attn1',attn1.shape)

        context1 = torch.squeeze(torch.bmm(attn1.unsqueeze(0), lstm_out1.unsqueeze(0)))
        # print('context1',context1.shape)

        # condense1 = self.condense(context1)
        # print('condense1',condense1.shape)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        ws2 = self.ws(lstm_out2)
        similar2 = torch.bmm(ws2.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))    
        attn2 = self.softmax(torch.squeeze(similar2))
        context2 = torch.squeeze(torch.bmm(attn2.unsqueeze(0), lstm_out2.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        cat_rep = torch.cat((context1.mean(dim = 0).unsqueeze(0), context2.mean(dim = 0).unsqueeze(0)),1)
        ratio1, ratio2 = None,None

        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [58]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [59]:
lr0 = 0.001
epochs = 20
model = LSTM_attn_mean(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6811482507513127
 val loss  0.6969489214188627
 train accuracy  0.5467943994104643
 val accuracy  0.5219435736677116
------Early stopping after epoch: 5 ---------
 train loss  0.6410744202514739
 val loss  0.6991034720011265
 train accuracy  0.6151436993367723
 val accuracy  0.5329153605015674


0.5329153605015674

In [60]:
dev_data[0]['word']

'board'

In [61]:
ratios1[0]

In [62]:
dev_data[0]['sentence1']

'Room and board .'

In [63]:
ratios2[0]

In [64]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [65]:
def LSTM_attn_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn_mean(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [66]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6853550702999079
 val loss  0.7247401083669355
 train accuracy  0.5562974902141377
 val accuracy  0.4525345622119816
------Early stopping after epoch: 9 ---------
 train loss  0.6035726829632454
 val loss  0.7373504216769873
 train accuracy  0.67142528206309
 val accuracy  0.5216589861751152
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6805212770550311
 val loss  0.7020435245355703
 train accuracy  0.5576790237163252
 val accuracy  0.5124423963133641
---------epoch: 10 ---------
 train loss  0.5909926106845786
 val loss  0.7289483276929724
 train accuracy  0.67142528206309
 val accuracy  0.5483870967741935
------Early stopping after epoch: 11 ---------
 train loss  0.5797322356917741
 val loss  0.7427091396349367
 train accuracy  0.6824775500805894
 val accuracy  0.5585253456221199
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6801883733522335
 val loss  0.6967186149913595
 train accuracy 

# 2-layer attention + target

In [67]:
class LSTM_attn2_target(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn2_target, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=1)
            self.ws1 = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)
            self.ws2 = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=1)
            self.ws1 = nn.Linear(lstm_dim, lstm_dim, bias=False)
            self.ws2 = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        ws11 = self.ws1(lstm_out1)
        # print('ws1',ws1.shape)

        similar11 = torch.bmm(ws11.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))
        # print('similar1',similar1.shape)
        
        attn11 = self.softmax(torch.squeeze(similar11))
        # print('attn1',attn1.shape)

        context11 = torch.squeeze(torch.bmm(attn11.unsqueeze(0), lstm_out1.unsqueeze(0)))

        ws12 = self.ws2(context11)
        similar12 = torch.bmm(ws12.unsqueeze(0), torch.transpose(context11,0,1).unsqueeze(0))        
        attn12 = self.softmax(torch.squeeze(similar12))
        context12 = torch.squeeze(torch.bmm(attn12.unsqueeze(0), context11.unsqueeze(0)))


        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        ws21 = self.ws1(lstm_out2)
        similar21 = torch.bmm(ws21.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))        
        attn21 = self.softmax(torch.squeeze(similar21))
        context21 = torch.squeeze(torch.bmm(attn21.unsqueeze(0), lstm_out2.unsqueeze(0)))

        ws22 = self.ws2(context21)
        similar22 = torch.bmm(ws22.unsqueeze(0), torch.transpose(context21,0,1).unsqueeze(0))        
        attn22 = self.softmax(torch.squeeze(similar22))
        context22 = torch.squeeze(torch.bmm(attn22.unsqueeze(0), context21.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        try:
            cat_rep = torch.cat((context12[idx1,:].unsqueeze(0), context22[idx2,:].unsqueeze(0)),1)
            ratio1, ratio2 = attn12[idx1,:], attn22[idx1,:]
        except:
            cat_rep = torch.cat((context12[-1,:].unsqueeze(0), context22[-1,:].unsqueeze(0)),1)
            ratio1, ratio2 = attn12[-1,:], attn22[-1,:]
            # in total of 8 wrong indexing samples
            # print('wrong index')

        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0), ratio1, ratio2



In [68]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [69]:
lr0 = 0.001
epochs = 20
model = LSTM_attn2_target(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6810116075643653
 val loss  0.6977732786937941
 train accuracy  0.5379513633014001
 val accuracy  0.512539184952978
------Early stopping after epoch: 7 ---------
 train loss  0.6231228428547577
 val loss  0.7146856612920014
 train accuracy  0.6374355195283714
 val accuracy  0.5235109717868338


0.5235109717868338

In [70]:
dev_data[0]['word']

'board'

In [71]:
ratios1[0]

tensor([0.2502, 0.2499, 0.2500, 0.2500], grad_fn=<SliceBackward0>)

In [72]:
dev_data[0]['sentence1']

'Room and board .'

In [73]:
ratios2[0]

tensor([0.1539, 0.1954, 0.1412, 0.1365, 0.1261, 0.1249, 0.1221],
       grad_fn=<SliceBackward0>)

In [74]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [75]:
def LSTM_attn2_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn2_target(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [76]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn2_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6837330501309579
 val loss  0.7207113942792339
 train accuracy  0.5595210683859083
 val accuracy  0.4626728110599078
------Early stopping after epoch: 9 ---------
 train loss  0.6135031295871229
 val loss  0.7146558067216302
 train accuracy  0.6631360810499655
 val accuracy  0.5447004608294931
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6825090303361732
 val loss  0.705116208687356
 train accuracy  0.5369560211835137
 val accuracy  0.49216589861751153
------Early stopping after epoch: 7 ---------
 train loss  0.6237642875388556
 val loss  0.7122252574164747
 train accuracy  0.628137232327884
 val accuracy  0.5410138248847927
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6793690434859256
 val loss  0.6962838309151785
 train accuracy  0.5404098549389823
 val accuracy  0.5142857142857142
------Early stopping after epoch: 9 ---------
 train loss  0.5977694664906171
 val loss  0.69366528207805

# 2-head attention + target (sum)

In [77]:
class LSTM_attn2_target(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn2_target, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=1)
            self.ws1 = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)
            self.ws2 = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=1)
            self.ws1 = nn.Linear(lstm_dim, lstm_dim, bias=False)
            self.ws2 = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        ws11 = self.ws1(lstm_out1)
        # print('ws1',ws1.shape)

        similar11 = torch.bmm(ws11.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))
        # print('similar1',similar1.shape)
        
        attn11 = self.softmax(torch.squeeze(similar11))
        # print('attn1',attn1.shape)

        context11 = torch.squeeze(torch.bmm(attn11.unsqueeze(0), lstm_out1.unsqueeze(0)))

        ws12 = self.ws2(lstm_out1)
        similar12 = torch.bmm(ws12.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))        
        attn12 = self.softmax(torch.squeeze(similar12))
        context12 = torch.squeeze(torch.bmm(attn12.unsqueeze(0), lstm_out1.unsqueeze(0)))


        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        ws21 = self.ws1(lstm_out2)
        similar21 = torch.bmm(ws21.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))        
        attn21 = self.softmax(torch.squeeze(similar21))
        context21 = torch.squeeze(torch.bmm(attn21.unsqueeze(0), lstm_out2.unsqueeze(0)))

        ws22 = self.ws2(lstm_out2)
        similar22 = torch.bmm(ws22.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))        
        attn22 = self.softmax(torch.squeeze(similar22))
        context22 = torch.squeeze(torch.bmm(attn22.unsqueeze(0), lstm_out2.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        try:
            cat_rep = torch.cat((context11[idx1,:].unsqueeze(0) + context12[idx1,:].unsqueeze(0), context21[idx2,:].unsqueeze(0) + context22[idx2,:].unsqueeze(0)),1)
            ratio1, ratio2 = attn12[idx1,:], attn22[idx1,:]
        except:
            cat_rep = torch.cat((context11[-1,:].unsqueeze(0) + context12[-1,:].unsqueeze(0), context21[-1,:].unsqueeze(0) + context22[-1,:].unsqueeze(0)),1)
            ratio1, ratio2 = attn12[-1,:], attn22[-1,:]
            # in total of 8 wrong indexing samples
            # print('wrong index')

        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0), ratio1, ratio2



In [78]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [79]:
lr0 = 0.001
epochs = 20
model = LSTM_attn2_target(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6819875853502672
 val loss  0.6942834166523805
 train accuracy  0.5408990420044215
 val accuracy  0.5188087774294671
------Early stopping after epoch: 7 ---------
 train loss  0.6058645564664702
 val loss  0.7168540356674912
 train accuracy  0.6479366249078851
 val accuracy  0.5297805642633229


0.5297805642633229

In [80]:
dev_data[0]['word']

'board'

In [81]:
ratios1[0]

tensor([0.2655, 0.2508, 0.2646, 0.2191], grad_fn=<SliceBackward0>)

In [82]:
dev_data[0]['sentence1']

'Room and board .'

In [83]:
ratios2[0]

tensor([0.1507, 0.4432, 0.1769, 0.1183, 0.0509, 0.0306, 0.0294],
       grad_fn=<SliceBackward0>)

In [84]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [85]:
def LSTM_attn2_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn2_target(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [86]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn2_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6845903249481925
 val loss  0.7170086135512673
 train accuracy  0.5597513239696063
 val accuracy  0.48202764976958523
------Early stopping after epoch: 6 ---------
 train loss  0.6148619523334964
 val loss  0.7278208367835541
 train accuracy  0.6624453142988718
 val accuracy  0.5299539170506913
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6794381314039835
 val loss  0.7039046186455933
 train accuracy  0.5562974902141377
 val accuracy  0.5142857142857142
------Early stopping after epoch: 7 ---------
 train loss  0.6105448849981292
 val loss  0.7117853542626729
 train accuracy  0.6516233018650702
 val accuracy  0.5631336405529954
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6800806096916014
 val loss  0.6925905007920506
 train accuracy  0.5486990559521069
 val accuracy  0.5428571428571428
------Early stopping after epoch: 7 ---------
 train loss  0.6041916634886599
 val loss  0.716495636970

# 2-head attention + target (concat)

In [87]:
class LSTM_attn2_target(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn2_target, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
                        
            self.softmax = nn.Softmax(dim=1)
            self.ws1 = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)
            self.ws2 = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

            self.hidden_layer = nn.Linear(2*4*lstm_dim,hidden_dim)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers

            self.softmax = nn.Softmax(dim=1)
            self.ws1 = nn.Linear(lstm_dim, lstm_dim, bias=False)
            self.ws2 = nn.Linear(lstm_dim, lstm_dim, bias=False)

            self.hidden_layer = nn.Linear(2*2*lstm_dim,hidden_dim)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        ws11 = self.ws1(lstm_out1)
        # print('ws1',ws1.shape)

        similar11 = torch.bmm(ws11.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))
        # print('similar1',similar1.shape)
        
        attn11 = self.softmax(torch.squeeze(similar11))
        # print('attn1',attn1.shape)

        context11 = torch.squeeze(torch.bmm(attn11.unsqueeze(0), lstm_out1.unsqueeze(0)))

        ws12 = self.ws2(lstm_out1)
        similar12 = torch.bmm(ws12.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))        
        attn12 = self.softmax(torch.squeeze(similar12))
        context12 = torch.squeeze(torch.bmm(attn12.unsqueeze(0), lstm_out1.unsqueeze(0)))


        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        ws21 = self.ws1(lstm_out2)
        similar21 = torch.bmm(ws21.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))        
        attn21 = self.softmax(torch.squeeze(similar21))
        context21 = torch.squeeze(torch.bmm(attn21.unsqueeze(0), lstm_out2.unsqueeze(0)))

        ws22 = self.ws2(lstm_out2)
        similar22 = torch.bmm(ws22.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))        
        attn22 = self.softmax(torch.squeeze(similar22))
        context22 = torch.squeeze(torch.bmm(attn22.unsqueeze(0), lstm_out2.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        try:
            cat_rep = torch.cat((context11[idx1,:].unsqueeze(0), context12[idx1,:].unsqueeze(0), context21[idx2,:].unsqueeze(0), context22[idx2,:].unsqueeze(0)),1)
            ratio1, ratio2 = attn12[idx1,:], attn22[idx1,:]
        except:
            cat_rep = torch.cat((context11[-1,:].unsqueeze(0), context12[-1,:].unsqueeze(0), context21[-1,:].unsqueeze(0), context22[-1,:].unsqueeze(0)),1)
            ratio1, ratio2 = attn12[-1,:], attn22[-1,:]
            # in total of 8 wrong indexing samples
            # print('wrong index')

        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0), ratio1, ratio2



In [88]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [89]:
lr0 = 0.001
epochs = 20
model = LSTM_attn2_target(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6816559175225682
 val loss  0.6878931410260335
 train accuracy  0.5620854826823877
 val accuracy  0.5203761755485894
------Early stopping after epoch: 7 ---------
 train loss  0.6037682214904201
 val loss  0.7159603202604575
 train accuracy  0.6685703758290347
 val accuracy  0.5579937304075235


0.5579937304075235

In [90]:
dev_data[0]['word']

'board'

In [91]:
ratios1[0]

tensor([0.2518, 0.2513, 0.2304, 0.2665], grad_fn=<SliceBackward0>)

In [92]:
dev_data[0]['sentence1']

'Room and board .'

In [93]:
ratios2[0]

tensor([0.0083, 0.9028, 0.0624, 0.0143, 0.0030, 0.0048, 0.0045],
       grad_fn=<SliceBackward0>)

In [94]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [95]:
def LSTM_attn2_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn2_target(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [96]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn2_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6852519724628713
 val loss  0.723383046514977
 train accuracy  0.5537646787934607
 val accuracy  0.4488479262672811
------Early stopping after epoch: 8 ---------
 train loss  0.6013444992300829
 val loss  0.7148447063112039
 train accuracy  0.6783329495740271
 val accuracy  0.5382488479262673
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6806315141643161
 val loss  0.7068355437247984
 train accuracy  0.5493898227032006
 val accuracy  0.49953917050691243
---------epoch: 10 ---------
 train loss  0.5687789056203949
 val loss  0.7040557158158122
 train accuracy  0.6886944508404329
 val accuracy  0.5751152073732719
------Early stopping after epoch: 12 ---------
 train loss  0.5331153737875605
 val loss  0.7353909400201613
 train accuracy  0.7195486990559521
 val accuracy  0.5806451612903226
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6803192412718743
 val loss  0.6966179773005472
 train accur

# 1-layer multi-head attention

In [97]:
class LSTM_attn_target(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn_target, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.multihead = nn.MultiheadAttention(2*lstm_dim, 1)
            
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            # self.softmax = nn.Softmax(dim=0)
            # self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.multihead = nn.MultiheadAttention(lstm_dim, 1)

            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            # self.softmax = nn.Softmax(dim=0)
            # self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        context1,weights1 = self.multihead(lstm_out1, lstm_out1, lstm_out1)
        # print('context1',context1.shape)

        # condense1 = self.condense(context1)
        # print('condense1',condense1.shape)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        context2, weights2 = self.multihead(lstm_out2, lstm_out2, lstm_out2)

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        try:
            cat_rep = torch.cat((context1[idx1,:].unsqueeze(0), context2[idx2,:].unsqueeze(0)),1)
            ratio1, ratio2 = weights1[idx1,:], weights2[idx2,:]
        except:
            cat_rep = torch.cat((context1[-1,:].unsqueeze(0), context2[-1,:].unsqueeze(0)),1)
            ratio1, ratio2 = weights1[-1,:], weights2[-1,:]
            # in total of 8 wrong indexing samples
            # print('wrong index')



        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [98]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [99]:
lr0 = 0.001
epochs = 20
model = LSTM_attn_target(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6826701266148903
 val loss  0.698405155193843
 train accuracy  0.5373986735445836
 val accuracy  0.5031347962382445
------Early stopping after epoch: 5 ---------
 train loss  0.6369852896727616
 val loss  0.719779035514425
 train accuracy  0.6254605747973471
 val accuracy  0.5250783699059561


0.5250783699059561

In [100]:
dev_data[0]['word']

'board'

In [101]:
ratios1[0]

tensor([0.2572, 0.2505, 0.2476, 0.2448], grad_fn=<SliceBackward0>)

In [102]:
dev_data[0]['sentence1']

'Room and board .'

In [103]:
ratios2[0]

tensor([0.0253, 0.7720, 0.0816, 0.0693, 0.0182, 0.0160, 0.0176],
       grad_fn=<SliceBackward0>)

In [104]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [105]:
def LSTM_attn_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn_target(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [106]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6852999798526365
 val loss  0.7107114730342742
 train accuracy  0.5691918029012204
 val accuracy  0.5013824884792627
---------epoch: 10 ---------
 train loss  0.5840586346743323
 val loss  0.7394361926663306
 train accuracy  0.6758001381533503
 val accuracy  0.5188940092165899
------Early stopping after epoch: 10 ---------
 train loss  0.5840586346743323
 val loss  0.7394361926663306
 train accuracy  0.6758001381533503
 val accuracy  0.5188940092165899
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6802585293503914
 val loss  0.7049229775705645
 train accuracy  0.5629749021413769
 val accuracy  0.5271889400921659
---------epoch: 10 ---------
 train loss  0.5728284469980428
 val loss  0.7314239361319125
 train accuracy  0.700437485609026
 val accuracy  0.5585253456221199
------Early stopping after epoch: 10 ---------
 train loss  0.5728284469980428
 val loss  0.7314239361319125
 train accuracy  0.700437485609

In [None]:
# TODO: Testing loop
# Write predictions (F or T) for each test example into test.pred.txt
# One line per each example, in the same order as test.data.txt.

# score = 0
# for i in range(len(test_data)):
#     sample = test_data[i]
#     # a) calculate probs / get an output
#     if init_word_embs == "glove":
#         s1 = sen2glove(sample["sentence1"],glove_embs)
#         s2 = sen2glove(sample["sentence2"],glove_embs)
#     else:
#         s1 = sen2vec(sample["sentence1"])
#         s2 = sen2vec(sample["sentence2"])

#     y_raw = model(s1,s2)
#     result = True if y_raw >= 0.5 else False
#     if bool(result) == sample["label"]:
#         score += 1

#     output = "T" if y_raw >= 0.5 else "F"
#     test_output.append(result)

# print(" test accuracy ",score/len(test_data))

# with open('test.pred.txt', 'w') as f:
#     for line in test_output:
#         f.write(f"{line}\n")