In [1]:
# pip install torch
# pip install gensim
# pip install pandas


# preprocessing

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import argparse
import random
random.seed(577)

import numpy as np
np.random.seed(577)

import torch
torch.set_default_tensor_type(torch.FloatTensor)
torch.use_deterministic_algorithms(True)
torch.manual_seed(577)
torch_device = torch.device("cpu")

'''
NOTE: Do not change any of the statements above regarding random/numpy/pytorch.
You can import other built-in libraries (e.g. collections) or pre-specified external libraries
such as pandas, nltk and gensim below. 
Also, if you'd like to declare some helper functions, please do so in utils.py and
change the last import statement below.
'''
from torch import nn
from torch import optim
from torch import tensor

import gensim.downloader as api

from torch.utils.data import Subset

# from neural_archs import DAN, RNN, LSTM
# from utils import WiCDataset, sen2vec, sen2glove



In [2]:
# from utils.py
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from torch import tensor


import nltk
# import tempfile
#
# nltk.download('punkt', download_dir=tempfile.gettempdir())
# nltk.download('averaged_perceptron_tagger', download_dir=tempfile.gettempdir())
# nltk.download('tagsets', download_dir=tempfile.gettempdir())
# nltk.data.path.append(tempfile.gettempdir())

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

from nltk.data import load
paras = load('taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')

from nltk import pos_tag

ss = 'NN NNS NNP NNPS VB VBD VBG VBN VBP VBZ JJ JJR JJS RBR BR RBS WDT WP WP$ PRP PRP$ DT CD UH SYM FW  LS'

selected_tags = ss.split()
pos_dict = {}
i = 0
for pos in selected_tags:
    pos_dict[pos] = i
    i += 1
n = len(pos_dict)

# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
class WiCDataset(Dataset):

    def __init__(self, mode='train', root_dir='./WiC_dataset/'):

        self.root_dir = root_dir
        self.mode = mode

        file_name = root_dir + self.mode + '/'+ self.mode+'.data.txt'
        with open(file_name,'r', encoding = 'cp850') as file:    
            self.data = file.readlines()
           
        file_name = root_dir + self.mode + '/'+ self.mode +'.gold.txt'
        with open(file_name,'r', encoding = 'cp850') as file:
            self.labels = file.readlines()

    def __len__(self):
        return len(self.labels)
        

    def __getitem__(self, idx):

        line = self.data[idx]
        sample = {}
        parts = line.replace("\n","\t").strip().split("\t")
        sample['word'] = parts[0]

        if parts[1] == "F":
            sample['label'] = False
        else:
            sample['label'] = True

        sample['sentence1'] = parts[3]
        sample['sentence2'] = parts[4]

        idxs = parts[2].split('-')
        sample['idx1'] = idxs[0]
        sample['idx2'] = idxs[1]


        line = self.labels[idx]
        if line.split()[0] == 'F':
            sample['label'] = False
        else:
            sample['label'] = True

        return sample

    def get_vocab(self):  
        vocab = {"<UNK>":0}
        for line in self.data:
            parts = line.replace("\n","\t").strip().split("\t")

            #create vocabulary from all unique words in all sentences
            sentence = parts[3] + " " + parts[4]
            words = sentence.replace("'s","").lower().split()
            #add if not already in vocab
            for word in words:
                if word not in vocab:
                    #add word to vocab dict
                    vocab[word] = len(vocab)
        return vocab,len(vocab)
    


def sen2vec(s,vocab):
    v = []
    words = s.replace("'s","").lower().split()
    for word in words:
        try:
            v.append(vocab[word])
        except:
            v.append(vocab["<UNK>"])
    return tensor(v).unsqueeze(0)


def sen2glove(s,glove_embs):
    v = []
    words = s.replace("'s","").lower().split()
    for word in words:
        try:
            v.append(glove_embs.get_index(word, default=None))
        except:
            v.append(40000)
    return tensor(v).unsqueeze(0)

def sen2pos(s,pos_dict):
    v = []
    words = s.replace("'s","").lower().split()
    pos_tags = pos_tag(words)

    for word, pos in pos_tags:
        if pos in pos_dict.keys():
            embed = np.zeros((n,), dtype=np.float32)
            embed[pos_dict[pos]]=1
        else:
            # print(pos)
            embed = np.zeros((n,), dtype=np.float32)

        v.append(embed)
    return tensor(np.array(v)).unsqueeze(0)

[nltk_data] Downloading package punkt to
[nltk_data]     \\nas01.itap.purdue.edu\puhome\ecn.data\bad\nltk_data.
[nltk_data]     ..
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     \\nas01.itap.purdue.edu\puhome\ecn.data\bad\nltk_data.
[nltk_data]     ..
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     \\nas01.itap.purdue.edu\puhome\ecn.data\bad\nltk_data.
[nltk_data]     ..
[nltk_data]   Package tagsets is already up-to-date!


In [3]:
init_word_embs = 'glove'
neural_arch = 'lstm'
rnn_bidirect = True

In [4]:
if init_word_embs == "glove":
    # TODO: Feed the GloVe embeddings to NN modules appropriately
    # for initializing the embeddings
    glove_embs = api.load("glove-wiki-gigaword-50")
    all_weights = glove_embs.get_normed_vectors()
    avg_wegihts = np.mean(all_weights,axis=0)
    update_weights = np.vstack((all_weights,avg_wegihts))
    weights = torch.FloatTensor(update_weights)
else:
    # vocab size is 7459 based on experiment
    weights = torch.FloatTensor(np.random.rand(7459, 50))

In [5]:
# TODO: Read off the WiC dataset files from the `WiC_dataset' directory
# (will be located in /homes/cs577/WiC_dataset/(train, dev, test))
# and initialize PyTorch dataloader appropriately
# Take a look at this page
# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
# and implement a PyTorch Dataset class for the WiC dataset in
# utils.py
root = 'D:/OneDrive - purdue.edu/Courses/CS577_NLP/hw/hw2/WiC_dataset/'
train_data = WiCDataset(root_dir=root)
vocab, _ = train_data.get_vocab()

test_data = WiCDataset('test',root_dir=root)
dev_data = WiCDataset('dev',root_dir=root)

# LSTM baseline

In [7]:
class LSTM(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)

        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0)



In [8]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw = model(s1,s2)

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw = model(s1,s2)

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw = model(s1,s2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc



In [9]:
lr0 = 0.001
epochs = 20
model = LSTM(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc = training(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.683376506223517
 val loss  0.6998250835741575
 train accuracy  0.5315033161385408
 val accuracy  0.512539184952978
------Early stopping after epoch: 7 ---------
 train loss  0.6455698371724162
 val loss  0.7151527524368143
 train accuracy  0.6191967575534266
 val accuracy  0.5329153605015674


0.5329153605015674

In [10]:
def LSTM_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM(*inputs).to(torch_device)
        
        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc = training(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [11]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.686278341224384
 val loss  0.7294650170110887
 train accuracy  0.5450149666129404
 val accuracy  0.4248847926267281
---------epoch: 10 ---------
 train loss  0.6551460548943703
 val loss  0.7263396794894873
 train accuracy  0.6099470412157495
 val accuracy  0.5235023041474655
------Early stopping after epoch: 10 ---------
 train loss  0.6551460548943703
 val loss  0.7263396794894873
 train accuracy  0.6099470412157495
 val accuracy  0.5235023041474655
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6835212329826733
 val loss  0.7043602728074597
 train accuracy  0.548238544784711
 val accuracy  0.4838709677419355
------Early stopping after epoch: 7 ---------
 train loss  0.6434677795086922
 val loss  0.7299592611427131
 train accuracy  0.620769053649551
 val accuracy  0.5253456221198156
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6816758154285633
 val loss  0.6973822387132776
 train accuracy

In [79]:
# plt.plot(train_loss,label = "train_loss")
# plt.plot(val_loss,label = "val_loss")
# plt.xlabel('epoch')
# plt.ylabel('loss')
# plt.ylim(bottom = 0)
# plt.legend()

In [80]:
# plt.plot(train_acc,label = "train_acc")
# plt.plot(dev_acc,label = "val_acc")
# plt.xlabel('epoch')
# plt.ylabel('accuracy')
# plt.ylim(bottom = 0,top = 1)

# plt.legend()

# 1-layer target attention

In [12]:
class LSTM_attn_target(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn_target, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=0)
            self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=0)
            self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        ws1 = self.ws(lstm_out1)
        # print('ws1',ws1.shape)

        similar1 = torch.bmm(ws1.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))
        # print('similar1',similar1.shape)
        
        attn1 = self.softmax(torch.squeeze(similar1))
        # print('attn1',attn1.shape)

        context1 = torch.squeeze(torch.bmm(attn1.unsqueeze(0), lstm_out1.unsqueeze(0)))
        # print('context1',context1.shape)

        # condense1 = self.condense(context1)
        # print('condense1',condense1.shape)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        ws2 = self.ws(lstm_out2)
        similar2 = torch.bmm(ws2.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))    
        attn2 = self.softmax(torch.squeeze(similar2))
        context2 = torch.squeeze(torch.bmm(attn2.unsqueeze(0), lstm_out2.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        try:
            cat_rep = torch.cat((context1[idx1,:].unsqueeze(0), context2[idx2,:].unsqueeze(0)),1)
            ratio1, ratio2 = attn1[idx1,:], attn2[idx2,:]
        except:
            cat_rep = torch.cat((context1[-1,:].unsqueeze(0), context2[-1,:].unsqueeze(0)),1)
            ratio1, ratio2 = attn1[-1,:], attn2[-1,:]
            # in total of 8 wrong indexing samples
            # print('wrong index')



        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [14]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [15]:
lr0 = 0.001
epochs = 20
model = LSTM_attn_target(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6792453661857498
 val loss  0.7003196788058386
 train accuracy  0.5604274134119381
 val accuracy  0.5141065830721003
------Early stopping after epoch: 5 ---------
 train loss  0.6295064815108005
 val loss  0.7242459383877841
 train accuracy  0.6201179071481209
 val accuracy  0.54858934169279


0.54858934169279

In [34]:
dev_data[0]['word']

'board'

In [26]:
ratios1[0]

tensor([0.2430, 0.2457, 0.2434, 0.2512], grad_fn=<SliceBackward0>)

In [33]:
dev_data[0]['sentence1']

'Room and board .'

In [35]:
ratios2[0]

tensor([0.1607, 0.1533, 0.1840, 0.1766, 0.2200, 0.2268, 0.1577],
       grad_fn=<SliceBackward0>)

In [36]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

# correct 1-layer intra-attn + global attn + target

In [78]:
class LSTM_attn_global(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn_global, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)
            self.wg = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)
            self.wg = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        ws1 = self.ws(lstm_out1)
        # print('ws1',ws1.shape)

        similar1 = torch.bmm(ws1.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))
        # print('similar1',similar1.shape)
        
        attn1 = self.softmax(torch.squeeze(similar1))
        # print('attn1',attn1.shape)

        context1 = torch.squeeze(torch.bmm(attn1.unsqueeze(0), lstm_out1.unsqueeze(0)))
        # print('context1',context1.shape)

        # condense1 = self.condense(context1)
        # print('condense1',condense1.shape)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        ws2 = self.ws(lstm_out2)
        similar2 = torch.bmm(ws2.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))    
        attn2 = self.softmax(torch.squeeze(similar2))
        context2 = torch.squeeze(torch.bmm(attn2.unsqueeze(0), lstm_out2.unsqueeze(0)))


        # ws2 = self.ws(lstm_out2)
        wg1 = self.wg(context1)
        # print('wg1',wg1.shape)   

        similarg1 = torch.bmm(wg1.unsqueeze(0), torch.transpose(context2,0,1).unsqueeze(0))
        # print('similarg1',similarg1.shape) 

        attng1 = self.softmax(torch.squeeze(similarg1))
        # print('attng1',attng1.shape)

        contextg1 = torch.squeeze(torch.bmm(attng1.unsqueeze(0), context2.unsqueeze(0)))

        wg2 = self.wg(context2)
        similarg2 = torch.bmm(wg2.unsqueeze(0), torch.transpose(context1,0,1).unsqueeze(0))    
        attng2 = self.softmax(torch.squeeze(similarg2))
        contextg2 = torch.squeeze(torch.bmm(attng2.unsqueeze(0), context1.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        try:
            cat_rep = torch.cat((contextg1[idx1,:].unsqueeze(0), contextg2[idx2,:].unsqueeze(0)),1)
            ratio1, ratio2 = attng1[idx1,:], attng2[idx2,:]
        except:
            cat_rep = torch.cat((contextg1[-1,:].unsqueeze(0), contextg2[-1,:].unsqueeze(0)),1)
            ratio1, ratio2 = attng1[-1,:], attng2[-1,:]
            # in total of 8 wrong indexing samples
            # print('wrong index')



        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)
        # print(ratio1)
        # print(ratio2)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [79]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [80]:
lr0 = 0.001
epochs = 20
model = LSTM_attn_global(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.680542396989568
 val loss  0.6978128845788842
 train accuracy  0.5591378039793663
 val accuracy  0.512539184952978
------Early stopping after epoch: 7 ---------
 train loss  0.6242624056541314
 val loss  0.705047607421875
 train accuracy  0.644620486366986
 val accuracy  0.5454545454545454


0.5454545454545454

In [81]:
dev_data[0]['word']

'board'

In [82]:
dev_data[0]['sentence1']

'Room and board .'

In [83]:
ratios1[0]

tensor([0.1474, 0.1477, 0.1431, 0.1414, 0.1399, 0.1408, 0.1397],
       grad_fn=<SliceBackward0>)

In [84]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [85]:
ratios2[0]

tensor([0.2477, 0.2510, 0.2502, 0.2511], grad_fn=<SliceBackward0>)

In [86]:
dev_data[0]['sentence1']

'Room and board .'

In [87]:
def LSTM_attn_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn_global(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [88]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6856305225362652
 val loss  0.7239318003852246
 train accuracy  0.5417913884411697
 val accuracy  0.4313364055299539
---------epoch: 10 ---------
 train loss  0.5928865977758749
 val loss  0.7163166380148329
 train accuracy  0.6647478701358508
 val accuracy  0.5382488479262673
------Early stopping after epoch: 10 ---------
 train loss  0.5928865977758749
 val loss  0.7163166380148329
 train accuracy  0.6647478701358508
 val accuracy  0.5382488479262673
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6814514623927872
 val loss  0.7025360248055875
 train accuracy  0.5516923785401796
 val accuracy  0.5152073732718894
---------epoch: 10 ---------
 train loss  0.5793510435255008
 val loss  0.7318060668382776
 train accuracy  0.6928390513469952
 val accuracy  0.5612903225806452
------Early stopping after epoch: 11 ---------
 train loss  0.5712640469397594
 val loss  0.7482421312463998
 train accuracy  0.69721390743

# hidden

In [89]:
class LSTM_attn_global(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn_global, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)
            self.wg = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)
            self.wg = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        ws1 = self.ws(lstm_out1)
        # print('ws1',ws1.shape)

        similar1 = torch.bmm(ws1.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))
        # print('similar1',similar1.shape)
        
        attn1 = self.softmax(torch.squeeze(similar1))
        # print('attn1',attn1.shape)

        context1 = torch.squeeze(torch.bmm(attn1.unsqueeze(0), lstm_out1.unsqueeze(0)))
        # print('context1',context1.shape)

        # condense1 = self.condense(context1)
        # print('condense1',condense1.shape)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        ws2 = self.ws(lstm_out2)
        similar2 = torch.bmm(ws2.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))    
        attn2 = self.softmax(torch.squeeze(similar2))
        context2 = torch.squeeze(torch.bmm(attn2.unsqueeze(0), lstm_out2.unsqueeze(0)))


        # ws2 = self.ws(lstm_out2)
        wg1 = self.wg(context1)
        # print('wg1',wg1.shape)   

        similarg1 = torch.bmm(wg1.unsqueeze(0), torch.transpose(context2,0,1).unsqueeze(0))
        # print('similarg1',similarg1.shape) 

        attng1 = self.softmax(torch.squeeze(similarg1))
        # print('attng1',attng1.shape)

        contextg1 = torch.squeeze(torch.bmm(attng1.unsqueeze(0), context2.unsqueeze(0)))

        wg2 = self.wg(context2)
        similarg2 = torch.bmm(wg2.unsqueeze(0), torch.transpose(context1,0,1).unsqueeze(0))    
        attng2 = self.softmax(torch.squeeze(similarg2))
        contextg2 = torch.squeeze(torch.bmm(attng2.unsqueeze(0), context1.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)

        cat_rep = torch.cat((contextg1[-1,:].unsqueeze(0), contextg2[-1,:].unsqueeze(0)),1)
        ratio1, ratio2 = attng1[-1,:], attng2[-1,:]
            # in total of 8 wrong indexing samples
            # print('wrong index')

        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [90]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [91]:
lr0 = 0.001
epochs = 20
model = LSTM_attn_global(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6814837866876612
 val loss  0.6976343710968114
 train accuracy  0.5427413411938099
 val accuracy  0.5203761755485894
---------epoch: 10 ---------
 train loss  0.5805995550415669
 val loss  0.744503631113465
 train accuracy  0.6739130434782609
 val accuracy  0.5470219435736677
------Early stopping after epoch: 10 ---------
 train loss  0.5805995550415669
 val loss  0.744503631113465
 train accuracy  0.6739130434782609
 val accuracy  0.5470219435736677


0.5470219435736677

In [92]:
dev_data[0]['word']

'board'

In [93]:
dev_data[0]['sentence1']

'Room and board .'

In [94]:
ratios1[0]

tensor([0.1033, 0.1320, 0.1352, 0.1479, 0.1616, 0.1605, 0.1595],
       grad_fn=<SliceBackward0>)

In [95]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [96]:
ratios2[0]

tensor([0.2487, 0.2503, 0.2508, 0.2502], grad_fn=<SliceBackward0>)

In [97]:
dev_data[0]['sentence1']

'Room and board .'

In [98]:
def LSTM_attn_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn_global(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [99]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6862405087029415
 val loss  0.7237621395269297
 train accuracy  0.548238544784711
 val accuracy  0.4433179723502304
------Early stopping after epoch: 9 ---------
 train loss  0.6070733435988948
 val loss  0.7088069511448732
 train accuracy  0.6654386368869445
 val accuracy  0.5327188940092166
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.680666535948653
 val loss  0.708086173765121
 train accuracy  0.545705733364034
 val accuracy  0.5013824884792627
---------epoch: 10 ---------
 train loss  0.5923635758152487
 val loss  0.7103230981782834
 train accuracy  0.6875431729219433
 val accuracy  0.5695852534562212
------Early stopping after epoch: 12 ---------
 train loss  0.5672015199566832
 val loss  0.7258657429075461
 train accuracy  0.6949113516002763
 val accuracy  0.5631336405529954
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6805457304678506
 val loss  0.6950185186851958
 train accuracy 

# mean

In [100]:
class LSTM_attn_global(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn_global, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)
            self.wg = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)
            self.wg = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        ws1 = self.ws(lstm_out1)
        # print('ws1',ws1.shape)

        similar1 = torch.bmm(ws1.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))
        # print('similar1',similar1.shape)
        
        attn1 = self.softmax(torch.squeeze(similar1))
        # print('attn1',attn1.shape)

        context1 = torch.squeeze(torch.bmm(attn1.unsqueeze(0), lstm_out1.unsqueeze(0)))
        # print('context1',context1.shape)

        # condense1 = self.condense(context1)
        # print('condense1',condense1.shape)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        ws2 = self.ws(lstm_out2)
        similar2 = torch.bmm(ws2.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))    
        attn2 = self.softmax(torch.squeeze(similar2))
        context2 = torch.squeeze(torch.bmm(attn2.unsqueeze(0), lstm_out2.unsqueeze(0)))


        # ws2 = self.ws(lstm_out2)
        wg1 = self.wg(context1)
        # print('wg1',wg1.shape)   

        similarg1 = torch.bmm(wg1.unsqueeze(0), torch.transpose(context2,0,1).unsqueeze(0))
        # print('similarg1',similarg1.shape) 

        attng1 = self.softmax(torch.squeeze(similarg1))
        # print('attng1',attng1.shape)

        contextg1 = torch.squeeze(torch.bmm(attng1.unsqueeze(0), context2.unsqueeze(0)))

        wg2 = self.wg(context2)
        similarg2 = torch.bmm(wg2.unsqueeze(0), torch.transpose(context1,0,1).unsqueeze(0))    
        attng2 = self.softmax(torch.squeeze(similarg2))
        contextg2 = torch.squeeze(torch.bmm(attng2.unsqueeze(0), context1.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        cat_rep = torch.cat((contextg1.mean(dim = 0).unsqueeze(0), contextg2.mean(dim = 0).unsqueeze(0)),1)
        ratio1, ratio2 = None, None


        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [101]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [102]:
lr0 = 0.001
epochs = 20
model = LSTM_attn_global(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6829961272136377
 val loss  0.6980776413107367
 train accuracy  0.5316875460574797
 val accuracy  0.5156739811912225
---------epoch: 10 ---------
 train loss  0.582010200292465
 val loss  0.7271281753587873
 train accuracy  0.6807295504789977
 val accuracy  0.5642633228840125
------Early stopping after epoch: 11 ---------
 train loss  0.5697074558423914
 val loss  0.7366528167246278
 train accuracy  0.6903095062638173
 val accuracy  0.5642633228840125


0.5642633228840125

In [103]:
dev_data[0]['word']

'board'

In [104]:
dev_data[0]['sentence1']

'Room and board .'

In [105]:
ratios1[0]

In [106]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [107]:
ratios2[0]

In [108]:
dev_data[0]['sentence1']

'Room and board .'

In [109]:
def LSTM_attn_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn_global(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [110]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6844481578653868
 val loss  0.7210863245247696
 train accuracy  0.5539949343771586
 val accuracy  0.4552995391705069
---------epoch: 10 ---------
 train loss  0.5943760635829208
 val loss  0.7211715487291187
 train accuracy  0.6808657609947041
 val accuracy  0.552073732718894
------Early stopping after epoch: 11 ---------
 train loss  0.5836519210151393
 val loss  0.7235842656430012
 train accuracy  0.6829380612479853
 val accuracy  0.5502304147465438
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6818286633123705
 val loss  0.702240468389977
 train accuracy  0.5537646787934607
 val accuracy  0.511520737327189
------Early stopping after epoch: 8 ---------
 train loss  0.6177898971764909
 val loss  0.7209388275849654
 train accuracy  0.6573796914575178
 val accuracy  0.5456221198156682
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6817931355953546
 val loss  0.6931335730486751
 train accuracy

# multihead

In [111]:
class LSTM_attn_target(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn_target, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.multihead = nn.MultiheadAttention(2*lstm_dim, 1)
            
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            # self.softmax = nn.Softmax(dim=0)
            # self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.multihead = nn.MultiheadAttention(lstm_dim, 1)

            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            # self.softmax = nn.Softmax(dim=0)
            # self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        context1,weights1 = self.multihead(lstm_out1, lstm_out1, lstm_out1)
        # print('context1',context1.shape)

        # condense1 = self.condense(context1)
        # print('condense1',condense1.shape)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        context2, weights2 = self.multihead(lstm_out2, lstm_out2, lstm_out2)

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        try:
            cat_rep = torch.cat((context1[idx1,:].unsqueeze(0), context2[idx2,:].unsqueeze(0)),1)
            ratio1, ratio2 = weights1[idx1,:], weights2[idx2,:]
        except:
            cat_rep = torch.cat((context1[-1,:].unsqueeze(0), context2[-1,:].unsqueeze(0)),1)
            ratio1, ratio2 = weights1[-1,:], weights2[-1,:]
            # in total of 8 wrong indexing samples
            # print('wrong index')



        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0),ratio1, ratio2

In [112]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [113]:
lr0 = 0.001
epochs = 20
model = LSTM_attn_target(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6843245526667281
 val loss  0.7005896971890919
 train accuracy  0.5532424465733236
 val accuracy  0.5188087774294671
------Early stopping after epoch: 5 ---------
 train loss  0.6338639958605841
 val loss  0.7201122893808778
 train accuracy  0.6363301400147384
 val accuracy  0.5407523510971787


0.5407523510971787

In [114]:
dev_data[0]['word']

'board'

In [115]:
ratios1[0]

tensor([0.2492, 0.2793, 0.2362, 0.2352], grad_fn=<SliceBackward0>)

In [116]:
dev_data[0]['sentence1']

'Room and board .'

In [117]:
ratios2[0]

tensor([0.1131, 0.2738, 0.1340, 0.1479, 0.1295, 0.1010, 0.1008],
       grad_fn=<SliceBackward0>)

In [118]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [121]:
def LSTM_attn_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn_target(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _, _ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [122]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6840382399658934
 val loss  0.7199507330969181
 train accuracy  0.5678102693990329
 val accuracy  0.48110599078341015
---------epoch: 10 ---------
 train loss  0.5803371625316601
 val loss  0.7269385553175404
 train accuracy  0.6919180290122036
 val accuracy  0.5529953917050692
------Early stopping after epoch: 10 ---------
 train loss  0.5803371625316601
 val loss  0.7269385553175404
 train accuracy  0.6919180290122036
 val accuracy  0.5529953917050692
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6820844403889881
 val loss  0.7047137704313076
 train accuracy  0.5399493437715864
 val accuracy  0.4894009216589862
---------epoch: 10 ---------
 train loss  0.5777366685614782
 val loss  0.7008340721306163
 train accuracy  0.6905364955100162
 val accuracy  0.5594470046082949
------Early stopping after epoch: 10 ---------
 train loss  0.5777366685614782
 val loss  0.7008340721306163
 train accuracy  0.6905364955

# baseline global attention: target

In [40]:
class LSTM_attn_global(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn_global, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)
            self.wg = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)
            self.wg = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)


        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        


        # ws2 = self.ws(lstm_out2)
        wg1 = self.wg(lstm_out1)
        # print('wg1',wg1.shape)   

        similarg1 = torch.bmm(wg1.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))
        # print('similarg1',similarg1.shape) 

        attng1 = self.softmax(torch.squeeze(similarg1))
        # print('attng1',attng1.shape)

        contextg1 = torch.squeeze(torch.bmm(attng1.unsqueeze(0), lstm_out2.unsqueeze(0)))

        wg2 = self.wg(lstm_out2)
        similarg2 = torch.bmm(wg2.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))    
        attng2 = self.softmax(torch.squeeze(similarg2))
        contextg2 = torch.squeeze(torch.bmm(attng2.unsqueeze(0), lstm_out1.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        try:
            cat_rep = torch.cat((contextg1[idx1,:].unsqueeze(0), contextg2[idx2,:].unsqueeze(0)),1)
            ratio1, ratio2 = attng1[idx1,:], attng2[idx2,:]
        except:
            cat_rep = torch.cat((contextg1[-1,:].unsqueeze(0), contextg2[-1,:].unsqueeze(0)),1)
            ratio1, ratio2 = attng1[-1,:], attng2[-1,:]
            # in total of 8 wrong indexing samples
            # print('wrong index')


        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)
        # print(ratio1)
        # print(ratio2)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [41]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [42]:
lr0 = 0.001
epochs = 20
model = LSTM_attn_global(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6820347672801906
 val loss  0.6880005743810002
 train accuracy  0.5530582166543847
 val accuracy  0.5376175548589341
------Early stopping after epoch: 5 ---------
 train loss  0.6275072540530582
 val loss  0.7048903318781837
 train accuracy  0.6400147383935151
 val accuracy  0.5360501567398119


0.5360501567398119

In [59]:
s1 = sen2glove(dev_data[0]["sentence1"],glove_embs)
s2 = sen2glove(dev_data[0]["sentence2"],glove_embs)
y_raw,ratio1, ratio2 = model(s1,s2,int(dev_data[0]['idx1']),int(dev_data[0]['idx2']))
y_raw >= 0.5

tensor(False)

In [46]:
dev_data[0]["label"]

False

In [47]:
dev_data[0]["sentence2"]

'He nailed boards across the windows .'

In [None]:
dev_data[0]['word']

'board'

In [10]:
dev_data[0]['sentence1']

'Room and board .'

In [11]:
ratios1[0]

tensor([0.1492, 0.1612, 0.1519, 0.1452, 0.1465, 0.1306, 0.1155],
       grad_fn=<SliceBackward0>)

In [12]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [13]:
ratios2[0]

tensor([0.0824, 0.3849, 0.3054, 0.2273], grad_fn=<SliceBackward0>)

In [14]:
dev_data[0]['sentence1']

'Room and board .'

In [88]:
wrong_idx = []
wrong_ratio1 = []
wrong_ratio2 = []

for i in range(len(train_data)):
    sample = train_data[i]

    # a) calculate probs / get an output
    if init_word_embs == "glove":
        s1 = sen2glove(sample["sentence1"],glove_embs)
        s2 = sen2glove(sample["sentence2"],glove_embs)
    else:
        s1 = sen2vec(sample["sentence1"])
        s2 = sen2vec(sample["sentence2"])

    
    y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))



    result = True if y_raw >= 0.5 else False

    if bool(result) != sample["label"]:
        wrong_idx.append(i)
        if ratio1.shape[0] != s2.shape[1]:
            print('stop')
        wrong_ratio1.append(ratio1)
        wrong_ratio2.append(ratio2)
        

In [89]:
i = 0
print(i)
sample = train_data[wrong_idx[i]]
print(sample["label"])
print(sample["sentence2"])
print(wrong_ratio1[i])
print(sample['word'])
print(sample['sentence1'])
print(wrong_ratio2[i])


0
False
Do you think the sofa will go through the door ?
tensor([0.2249, 0.2059, 0.1078, 0.0671, 0.0596, 0.0874, 0.0771, 0.0399, 0.0341,
        0.0520, 0.0441], grad_fn=<SliceBackward0>)
go
Messages must go through diplomatic channels .
tensor([0.2377, 0.1857, 0.2862, 0.1282, 0.0642, 0.0554, 0.0426],
       grad_fn=<SliceBackward0>)


In [15]:
def LSTM_attn_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn_global(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [16]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6854151638592275
 val loss  0.7222273725518433
 train accuracy  0.5440939442781487
 val accuracy  0.4359447004608295
---------epoch: 10 ---------
 train loss  0.5926936125661985
 val loss  0.6991351571500576
 train accuracy  0.6622150587151738
 val accuracy  0.5419354838709678
------Early stopping after epoch: 14 ---------
 train loss  0.5380422586453776
 val loss  0.7562424620175692
 train accuracy  0.7013585079438176
 val accuracy  0.5336405529953917
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6818069644219146
 val loss  0.7067108857466878
 train accuracy  0.5489293115358047
 val accuracy  0.5059907834101383
---------epoch: 10 ---------
 train loss  0.5745680684074949
 val loss  0.6918045254896313
 train accuracy  0.6781026939903293
 val accuracy  0.6018433179723502
------Early stopping after epoch: 10 ---------
 train loss  0.5745680684074949
 val loss  0.6918045254896313
 train accuracy  0.67810269399

# baseline global attention: hidden

In [18]:
class LSTM_attn_global(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn_global, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)
            self.wg = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)
            self.wg = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)


        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        

        # ws2 = self.ws(lstm_out2)
        wg1 = self.wg(lstm_out1)
        # print('wg1',wg1.shape)   

        similarg1 = torch.bmm(wg1.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))
        # print('similarg1',similarg1.shape) 

        attng1 = self.softmax(torch.squeeze(similarg1))
        # print('attng1',attng1.shape)

        contextg1 = torch.squeeze(torch.bmm(attng1.unsqueeze(0), lstm_out2.unsqueeze(0)))

        wg2 = self.wg(lstm_out2)
        similarg2 = torch.bmm(wg2.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))    
        attng2 = self.softmax(torch.squeeze(similarg2))
        contextg2 = torch.squeeze(torch.bmm(attng2.unsqueeze(0), lstm_out1.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)

        cat_rep = torch.cat((contextg1[-1,:].unsqueeze(0), contextg2[-1,:].unsqueeze(0)),1)
        ratio1, ratio2 = attng1[-1,:], attng2[-1,:]
            # in total of 8 wrong indexing samples
            # print('wrong index')

        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [19]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [20]:
lr0 = 0.001
epochs = 20
model = LSTM_attn_global(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6816699056829173
 val loss  0.6951772755589979
 train accuracy  0.548268238761975
 val accuracy  0.5282131661442007
------Early stopping after epoch: 9 ---------
 train loss  0.5938608258106116
 val loss  0.7226690692961403
 train accuracy  0.6764922623434045
 val accuracy  0.5438871473354232


0.5438871473354232

In [21]:
dev_data[0]['word']

'board'

In [22]:
dev_data[0]['sentence1']

'Room and board .'

In [23]:
ratios1[0]

tensor([7.5036e-01, 2.4189e-01, 7.1304e-03, 2.2705e-04, 7.9730e-05, 3.6344e-05,
        2.7488e-04], grad_fn=<SliceBackward0>)

In [24]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [25]:
ratios2[0]

tensor([0.1432, 0.3175, 0.3788, 0.1606], grad_fn=<SliceBackward0>)

In [26]:
dev_data[0]['sentence1']

'Room and board .'

In [27]:
def LSTM_attn_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn_global(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [28]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6845730108076215
 val loss  0.7241532145557316
 train accuracy  0.5562974902141377
 val accuracy  0.4663594470046083
---------epoch: 10 ---------
 train loss  0.5784011268132627
 val loss  0.7443871282762097
 train accuracy  0.6928390513469952
 val accuracy  0.5751152073732719
------Early stopping after epoch: 11 ---------
 train loss  0.5636738200300772
 val loss  0.7623820362003169
 train accuracy  0.7022795302786092
 val accuracy  0.5658986175115207
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6816285950451877
 val loss  0.704836178265409
 train accuracy  0.548468800368409
 val accuracy  0.5023041474654378
------Early stopping after epoch: 8 ---------
 train loss  0.614458948847283
 val loss  0.7162808607250865
 train accuracy  0.6635965922173612
 val accuracy  0.552073732718894
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6822004676167108
 val loss  0.697111321374568
 train accuracy  

# baseline global attention: mean

In [29]:
class LSTM_attn_global(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(LSTM_attn_global, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)
            self.wg = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=1)
            self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)
            self.wg = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

       

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
       

        # ws2 = self.ws(lstm_out2)
        wg1 = self.wg(lstm_out1)
        # print('wg1',wg1.shape)   

        similarg1 = torch.bmm(wg1.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))
        # print('similarg1',similarg1.shape) 

        attng1 = self.softmax(torch.squeeze(similarg1))
        # print('attng1',attng1.shape)

        contextg1 = torch.squeeze(torch.bmm(attng1.unsqueeze(0), lstm_out2.unsqueeze(0)))

        wg2 = self.wg(lstm_out2)
        similarg2 = torch.bmm(wg2.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))    
        attng2 = self.softmax(torch.squeeze(similarg2))
        contextg2 = torch.squeeze(torch.bmm(attng2.unsqueeze(0), lstm_out1.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        cat_rep = torch.cat((contextg1.mean(dim = 0).unsqueeze(0), contextg2.mean(dim = 0).unsqueeze(0)),1)
        ratio1, ratio2 = None, None


        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [30]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_attn(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc, ratios1, ratios2


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc,ratios1, ratios2



In [31]:
lr0 = 0.001
epochs = 20
model = LSTM_attn_global(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc,ratios1, ratios2 = training_attn(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6825534086852892
 val loss  0.6936790442391997
 train accuracy  0.5490051584377302
 val accuracy  0.5344827586206896
------Early stopping after epoch: 8 ---------
 train loss  0.5942335135811994
 val loss  0.7131174021753771
 train accuracy  0.6923360353721444
 val accuracy  0.5595611285266457


0.5595611285266457

In [32]:
dev_data[0]['word']

'board'

In [33]:
dev_data[0]['sentence1']

'Room and board .'

In [34]:
ratios1[0]

In [35]:
dev_data[0]['sentence2']

'He nailed boards across the windows .'

In [36]:
ratios2[0]

In [37]:
dev_data[0]['sentence1']

'Room and board .'

In [38]:
def LSTM_attn_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = LSTM_attn_global(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc, _,_ = training_attn(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [39]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = LSTM_attn_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.684605390499079
 val loss  0.7173834312896026
 train accuracy  0.5542251899608566
 val accuracy  0.45069124423963136
------Early stopping after epoch: 8 ---------
 train loss  0.6219827299317868
 val loss  0.7248472152217742
 train accuracy  0.6444853787704352
 val accuracy  0.5059907834101383
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6820965265585425
 val loss  0.7036602530061924
 train accuracy  0.5454754777803362
 val accuracy  0.5023041474654378
------Early stopping after epoch: 8 ---------
 train loss  0.6101898326869387
 val loss  0.7015468052455357
 train accuracy  0.6513930462813723
 val accuracy  0.5539170506912442
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.6780434998668835
 val loss  0.6966852566064228
 train accuracy  0.5429426663596593
 val accuracy  0.535483870967742
------Early stopping after epoch: 9 ---------
 train loss  0.5926661235573049
 val loss  0.69718838880688

In [17]:
# baseline global attention: hidden

# BERT

In [None]:
class BERT_mean(torch.nn.Module):
    def __init__(self, rnn_bidirect, glove, load_weights, lstm_dim=10, layer_num=1, hidden_dim = 20,p_drop=0.1):

        # TODO: Declare LSTM model architecture
        super(BERT_mean, self).__init__()

        if glove == 'glove':
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=True)
        else:
            self.embedding = nn.Embedding.from_pretrained(load_weights, freeze=False)

        if rnn_bidirect:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False, bidirectional = True) # one layer, bidirectional
            self.hidden_layer = nn.Linear(4*lstm_dim,hidden_dim)
            
            self.softmax = nn.Softmax(dim=0)
            self.ws = nn.Linear(2*lstm_dim, 2*lstm_dim, bias=False)

        else:
            self.lstm = nn.LSTM(50, lstm_dim, num_layers=layer_num, bias=False) # two layers
            self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

            self.softmax = nn.Softmax(dim=0)
            self.ws = nn.Linear(lstm_dim, lstm_dim, bias=False)

        # final hidden layer
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, s1, s2, idx1, idx2):
        # TODO: Implement LSTM forward pass
        embed1 = self.embedding(s1).squeeze(0)
		# print("embedding_squeeze",self.embedding(s1).squeeze(0).shape)
        lstm_out1,(_,_) = self.lstm(embed1)
        # print('lstm',lstm_out1.shape)

        ws1 = self.ws(lstm_out1)
        # print('ws1',ws1.shape)

        similar1 = torch.bmm(ws1.unsqueeze(0), torch.transpose(lstm_out1,0,1).unsqueeze(0))
        # print('similar1',similar1.shape)
        
        attn1 = self.softmax(torch.squeeze(similar1))
        # print('attn1',attn1.shape)

        context1 = torch.squeeze(torch.bmm(attn1.unsqueeze(0), lstm_out1.unsqueeze(0)))
        # print('context1',context1.shape)

        # condense1 = self.condense(context1)
        # print('condense1',condense1.shape)

        embed2 = self.embedding(s2).squeeze(0)
        lstm_out2,(_,_) = self.lstm(embed2)
        
        ws2 = self.ws(lstm_out2)
        similar2 = torch.bmm(ws2.unsqueeze(0), torch.transpose(lstm_out2,0,1).unsqueeze(0))    
        attn2 = self.softmax(torch.squeeze(similar2))
        context2 = torch.squeeze(torch.bmm(attn2.unsqueeze(0), lstm_out2.unsqueeze(0)))

        # cat_rep = torch.cat((lstm_out1[-1,:].unsqueeze(0), lstm_out2[-1,:].unsqueeze(0)),1)
        cat_rep = torch.cat((context1.mean(dim = 0).unsqueeze(0), context2.mean(dim = 0).unsqueeze(0)),1)
        ratio1, ratio2 = attn1[idx1,:], attn2[idx2,:]

        hidden_rep = self.hidden_layer(cat_rep)
        relu_rep = self.relu(hidden_rep)
		# print("hidden",hidden_rep.shape)
        drop = self.dropout(relu_rep)
		# print("drop",drop.shape)
        output = self.output_layer(drop)
        # print("ouput",output.shape)

        output = self.sigmoid(output)
        # print("sigmoid",output.shape)

        return output.squeeze(0).squeeze(0),ratio1, ratio2



In [None]:
# TODO: Training and validation loop here

lr0 = 0.001
epochs = 20
patience = 5

def training_BERT_mean(model,train_dataset, valid_dataset,lr0,epochs):
    ce = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr0)

    train_acc = []
    dev_acc = []
    train_loss = []
    val_loss = []
    # test_output = []

    best_val_loss = 1

    for epoch in range(epochs):

        model.train()

        #print("Epoch:",i)
        total_loss = 0

        for i in range(len(train_dataset)):
            sample = train_dataset[i]
            
            optimizer.zero_grad()

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)

                # print(sample["sentence1"])
                # print(sample["sentence2"])

            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            # print(int(sample['idx1']))
            # print(int(sample['idx2']))

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            

            y = tensor(float(sample["label"]))
            
            # b) compute loss
            loss = ce(y_raw,y)
            total_loss += loss

            # c) get the gradient
            loss.backward()

            # d) update the weights
            optimizer.step()
        train_loss.append(total_loss.item()/len(train_dataset))

        model.eval()

        score = 0
        
        for i in range(len(train_dataset)):
            sample = train_dataset[i]

            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,_, _ = model(s1,s2,int(sample['idx1']),int(sample['idx2']))

            result = True if y_raw >= 0.5 else False

            if bool(result) == sample["label"]:
                score += 1
        
        train_acc.append(score/len(train_dataset))
        

        score = 0
        total_loss = 0
        ratios1 = []
        ratios2 = []
        for i in range(len(valid_dataset)):
            sample = valid_dataset[i]
            # a) calculate probs / get an output
            if init_word_embs == "glove":
                s1 = sen2glove(sample["sentence1"],glove_embs)
                s2 = sen2glove(sample["sentence2"],glove_embs)
            else:
                s1 = sen2vec(sample["sentence1"])
                s2 = sen2vec(sample["sentence2"])

            y_raw,ratio1, ratio2 = model(s1,s2,int(sample['idx1']),int(sample['idx2']))
            
            ratios1.append(ratio1)
            ratios2.append(ratio2)
            
            y = tensor(float(sample["label"]))
            loss = ce(y_raw,y)
            total_loss += loss

            result = True if y_raw >= 0.5 else False
            if bool(result) == sample["label"]:
                score += 1

        val_loss.append(total_loss.item()/len(valid_dataset))
        dev_acc.append(score/len(valid_dataset))



        if epoch% 10 == 0:
            print("---------epoch:",epoch,"---------")
            print(" train loss ", train_loss[-1]) 
            print(" val loss ", val_loss[-1])
            print(" train accuracy ",train_acc[-1])
            print(" val accuracy ",dev_acc[-1])

        # check if validation loss has improvedval loss < best val loss:
        if val_loss[-1] < best_val_loss:
            best_val_loss = val_loss[-1] 
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("------Early stopping after epoch:",epoch,"---------")
                print(" train loss ", train_loss[-1]) 
                print(" val loss ", val_loss[-1]) 
                print(" train accuracy ",train_acc[-1])
                print(" val accuracy ",dev_acc[-1])
                return train_loss,val_loss, train_acc,dev_acc


    print("---------endng epoch:",epoch,"---------")
    print(" train loss ", train_loss[-1]) 
    print(" val loss ", val_loss[-1]) 
    print(" train accuracy ",train_acc[-1])
    print(" val accuracy ",dev_acc[-1])
    return train_loss,val_loss, train_acc,dev_acc



In [None]:
lr0 = 0.001
epochs = 20
model = BERT_mean(rnn_bidirect,init_word_embs, weights).to(torch_device)
train_loss,val_loss,train_acc,dev_acc = training_BERT_mean(model,train_data, dev_data,lr0,epochs)
dev_acc[-1]

---------epoch: 0 ---------
 train loss  0.6821564777686302
 val loss  0.7007296092831603
 train accuracy  0.5477155490051584
 val accuracy  0.5109717868338558
------Early stopping after epoch: 5 ---------
 train loss  0.6278748143307848
 val loss  0.7451646380289969
 train accuracy  0.614406779661017
 val accuracy  0.5407523510971787


0.5407523510971787

In [None]:
def BERT_mean_kFold(k,epochs,lr0,*inputs):
    
    num_val_samples = len(train_data)//k
    cv_score = []
    for i in range(k):
        print('Processing fold: ', i + 1)
        """%%%% Initiate new model %%%%""" #in every fold
        model = BERT_mean(*inputs).to(torch_device)

        valid_idx = np.arange(len(train_data))[i * num_val_samples:(i + 1) * num_val_samples]
        train_idx = np.concatenate([np.arange(len(train_data))[:i * num_val_samples], np.arange(len(train_data))[(i + 1) * num_val_samples:]], axis=0)
        
        train_dataset = Subset(train_data, train_idx)
        valid_dataset = Subset(train_data, valid_idx)

        
        _,_,_,valid_acc = training_BERT_mean(model,train_dataset, valid_dataset,lr0,epochs)
        cv_score.append(valid_acc[-1])
    
    print('cv_score: ',sum(cv_score)/len(cv_score))

    return sum(cv_score)/len(cv_score)

In [None]:
epochs = 20
k = 5

params = {}
params['lr0'] = [0.001]

params['lstm_dim'] = [10]
params['layer_num'] = [1]
params['hidden_dim'] = [20]
params['p_drop'] = [0.1]

result = []

best_params = {}
best_score = 0
for lr0 in params['lr0']:
    for layer_num in params['layer_num']:
        for lstm_dim in params['lstm_dim']:
            for hidden_dim in params['hidden_dim']:
                for p_drop in params['p_drop']:
                    inputs = rnn_bidirect, init_word_embs, weights, lstm_dim, layer_num, hidden_dim,p_drop
                    score = BERT_mean_kFold(k, epochs,lr0,*inputs)
                    result.append(score)

                    print('current setting is ','lr0',lr0,'layer_num',layer_num,'lstm_dim',lstm_dim,'hidden_dim',hidden_dim,'p_drop',p_drop)
                    print('current score is',score)

                    if score>best_score:
                        best_score = score
                        best_params['lr0'] = lr0
                        best_params['layer_num'] = layer_num
                        best_params['lstm_dim'] = lstm_dim
                        best_params['hidden_dim'] = hidden_dim
                        best_params['p_drop'] = p_drop

print('best score is', best_score)
print('best_parameters are', best_params)
                    



Processing fold:  1
---------epoch: 0 ---------
 train loss  0.6854976871006505
 val loss  0.7224369734663019
 train accuracy  0.5512318673727837
 val accuracy  0.4589861751152074
---------epoch: 10 ---------
 train loss  0.5755603148205446
 val loss  0.779713601670507
 train accuracy  0.6658991480543404
 val accuracy  0.5502304147465438
------Early stopping after epoch: 10 ---------
 train loss  0.5755603148205446
 val loss  0.779713601670507
 train accuracy  0.6658991480543404
 val accuracy  0.5502304147465438
Processing fold:  2
---------epoch: 0 ---------
 train loss  0.6815547288740502
 val loss  0.6971167217201901
 train accuracy  0.5629749021413769
 val accuracy  0.5345622119815668
------Early stopping after epoch: 8 ---------
 train loss  0.6026027538028149
 val loss  0.7726247479838709
 train accuracy  0.6431038452682477
 val accuracy  0.567741935483871
Processing fold:  3
---------epoch: 0 ---------
 train loss  0.679816512833151
 val loss  0.6972530804471486
 train accuracy 

# test

In [8]:
# TODO: Testing loop
# Write predictions (F or T) for each test example into test.pred.txt
# One line per each example, in the same order as test.data.txt.

# score = 0
# for i in range(len(test_data)):
#     sample = test_data[i]
#     # a) calculate probs / get an output
#     if init_word_embs == "glove":
#         s1 = sen2glove(sample["sentence1"],glove_embs)
#         s2 = sen2glove(sample["sentence2"],glove_embs)
#     else:
#         s1 = sen2vec(sample["sentence1"])
#         s2 = sen2vec(sample["sentence2"])

#     y_raw = model(s1,s2)
#     result = True if y_raw >= 0.5 else False
#     if bool(result) == sample["label"]:
#         score += 1

#     output = "T" if y_raw >= 0.5 else "F"
#     test_output.append(result)

# print(" test accuracy ",score/len(test_data))

# with open('test.pred.txt', 'w') as f:
#     for line in test_output:
#         f.write(f"{line}\n")