In [0]:
import torch

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd drive/My Drive

/content/drive/My Drive


In [4]:
cd MSRP

/content/drive/My Drive/MSRP


In [0]:
from data import Data
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.metrics import f1_score
import sklearn

In [6]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
use_cuda = torch.cuda.is_available()

In [0]:
data_file = "./dataset/train.tsv"
data_test_file = "./dataset/test.tsv"
training_ratio = 0.999999
max_len = 20
tracking_pair = False
hidden_size = 50
batch_size = 16
num_iters = 10
learning_rate = 0.001

In [9]:
data = Data(data_file,data_test_file,training_ratio,max_len)

Number of Training Samples   : 4075
Number of Validation Samples   : 1


In [10]:
len(data.word2index)

15545

In [11]:
print('Number of training samples        :', len(data.x_train))
print('Number of validation samples      :', len(data.x_val))
print('Maximum sequence length           :', max_len)

Number of training samples        : 4075
Number of validation samples      : 1
Maximum sequence length           : 20


In [0]:
embd_file = "./glove-global-vectors-for-word-representation/glove.6B.100d.txt"

In [0]:
from embedding_helper2 import Get_Embedding

In [14]:
embedding = Get_Embedding(embd_file, data.word2index)
embedding_size = embedding.embedding_matrix.shape[1]

<class 'numpy.ndarray'>
Paragrams done


In [15]:
embedding_size

125

In [16]:
len(embedding.embedding_matrix)

15546

In [0]:
import torch.nn as nn
from torch import Tensor
from torch import optim
import torch.nn.functional as F

In [18]:
# Stop words already removed
'''stops = set(stopwords.words('english'))
stopNos = []
for i in range(len(data.index2word)):
    if data.index2word[i] in stops:
        stopNos.append(i)
        
stopNos
print(len(stopNos))'''

"stops = set(stopwords.words('english'))\nstopNos = []\nfor i in range(len(data.index2word)):\n    if data.index2word[i] in stops:\n        stopNos.append(i)\n        \nstopNos\nprint(len(stopNos))"

In [0]:
def commonWords(sen_1, sen_2):
  d = np.empty(len(data.word2index), dtype=int)
  for i in range(len(d)):
    d[i] = -1
    
  listPairs = []
  list1 = []
  list2 = []
  for i in range(len(sen_1)):
    d[sen_1[i]] = i
    
  for i in range(len(sen_2)):
    if d[sen_2[i]] > -1 and sen_2[i] != 0 :
      list1.append(d[sen_2[i]])
      list2.append(i)
    
  list1 = list(dict.fromkeys(list1))
  list2 = list(dict.fromkeys(list2))
  
  listPairs.append(list1)
  listPairs.append(list2)
  return listPairs

In [0]:
def max_pool(e_list):
  e_list = np.array(e_list)
  
  for i in range(len(e_list)):
    e_list[i] = e_list[i].data.cpu().numpy()
  mp = []
  for i in range(100):
    m = e_list[0][i]
    for j in range(len(e_list)):
      m = max(m, e_list[j][i])
    mp.append(m)
      
  #print("Length of mp = " + str(len(mp)))
  return torch.cuda.FloatTensor(mp)

In [0]:
class Manhattan_LSTM(nn.Module):
    def __init__(self, hidden_size, embedding, train_embedding = False):
        super(Manhattan_LSTM, self).__init__()
        self.use_cuda = torch.cuda.is_available()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(embedding.shape[0], embedding.shape[1])
        self.embedding.weight = nn.Parameter(embedding)
        self.input_size = embedding.shape[1]
        
        self.embedding.weight.requires_grad = train_embedding
        
        self.lstm_1 = nn.LSTM(self.input_size, self.hidden_size, num_layers=1, bidirectional=True)
        self.lstm_2 = nn.LSTM(self.input_size, self.hidden_size, num_layers=1, bidirectional=True)
        
    def exponent_neg_manhattan_distance(self, x1, x2):
        return torch.exp(-torch.sum(torch.abs(x1 - x2), dim=1))
    
    def forward(self, input, hidden):
        
        #print(input[0])
        #print(input[1])
        
        ip0 = input[0].t()
        ip1 = input[1].t()
        
        commonList = []
        
        for i in range(batch_size):
            listPairs = commonWords(ip0[i], ip1[i])
            commonList.append(listPairs)
    
        commonList = np.array(commonList)
        
        
        
        #print(commonList)
        embedded_1 = self.embedding(input[0])
        embedded_2 = self.embedding(input[1])
        
        bs = embedded_1.size()[1]
        outputs_1, hidden_1 = self.lstm_1(embedded_1, hidden)
        outputs_2, hidden_2 = self.lstm_1(embedded_2, hidden)
        
        max_pool_1 = F.adaptive_avg_pool1d(outputs_1.permute(1,2,0),1).view(batch_size,-1)
        max_pool_2 = F.adaptive_avg_pool1d(outputs_2.permute(1,2,0),1).view(batch_size,-1)
        #print(max_pool_1.shape)
        
        ehs_1 = []
        for i in range(batch_size):
            e_list = []
            for j in range(len(commonList[i][0])):
                x = commonList[i][0][j]
              
                e_list.append(outputs_1[x][i])
            if len(e_list) > 0:
                mp1 = max_pool(e_list)
            else:
                mp1 = torch.zeros(100)
              
            ehs_1.append(mp1.cuda())
        
        
        ehs_2 = []
        for i in range(batch_size):
            e_list = []
            for j in range(len(commonList[i][1])):
                x = commonList[i][1][j]
              
                e_list.append(outputs_2[x][i])
            if len(e_list) > 0:
                mp2 = max_pool(e_list)
            else:
                mp2 = torch.zeros(100)
              
            ehs_2.append(mp2.cuda())

        #print(len(ehs_1[0]))
        
        ths_1 = torch.zeros(batch_size, 200)
        for i in range(batch_size):
            ths_1[i] = torch.cat((max_pool_1[i], ehs_1[i]),0)
          
        ths_2 = torch.zeros(batch_size, 200)
        for i in range(batch_size):
            ths_2[i] = torch.cat((max_pool_2[i], ehs_2[i]),0)
          
        
        similarity_scores = self.exponent_neg_manhattan_distance(ths_1.cuda(), ths_2.cuda())
        #similarity_scores = self.exponent_neg_manhattan_distance(max_pool_1, max_pool_2)
        
        return similarity_scores
    
    def init_weights(self):
        for name_1, param_1 in self.lstm_1.named_parameters():
            if 'bias' in name_1:
                nn.init.constant_(param_1, 0.0)
            elif 'weight' in name_1:
                nn.init.xavier_normal_(param_1)

        lstm_1 = self.lstm_1.state_dict()
        lstm_2 = self.lstm_2.state_dict()

        for name_1, param_1 in lstm_1.items():
            # Backwards compatibility for serialized parameters.
            if isinstance(param_1, torch.nn.Parameter):
                param_1 = param_1.data

            lstm_2[name_1].copy_(param_1)

    def init_hidden(self, batch_size):
        # Hidden dimensionality : 2 (h_0, c_0) x Num. Layers * Num. Directions x Batch Size x Hidden Size
        result = torch.zeros(2, 2, batch_size, self.hidden_size)
        result = tuple(result)

        if self.use_cuda: 
            result = (result[0].cuda(), result[1].cuda())
            return result
        else: return result

In [0]:
model = Manhattan_LSTM(hidden_size, embedding.embedding_matrix, train_embedding=False)
if use_cuda: model = model.cuda()
model.init_weights()

In [0]:
import time
import random
from torch import optim
import torch.nn.utils.rnn as rnn

In [24]:
x_train = data.x_train
x_val = data.x_val
y_train = data.y_train
y_val = data.y_val
x_test = data.x_test
y_test = data.y_test
train_samples = len(x_train)
val_samples = len(x_val)
test_samples = len(x_test)
test_samples

1725

In [29]:
input_variables = x_train[0:batch_size]
print(x_train[0])

sequences_1 = [sequence[0] for sequence in input_variables]
sequences_2 = [sequence[1] for sequence in input_variables]
batch_size = len(sequences_1)

temp = rnn.pad_sequence(sequences_1 + sequences_2)
sequences_1 = temp[:, :batch_size]
sequences_2 = temp[:, batch_size:]

ip1 = sequences_1.t()
ip2 = sequences_2.t()

print(len(ip1[4]))
commonList = []

for i in range(batch_size):
    listPairs = commonWords(ip1[i], ip2[i])
    commonList.append(listPairs)
    
commonList = np.array(commonList)
print(commonList)

[tensor([ 497, 4260, 1918, 1922,  158, 2329,  113,  762,  113,  497, 4260],
       device='cuda:0'), tensor([1918,  596,  158, 2329,  762,  113,  482,   59,  497, 4260,  113,  915,
        2293,  436,  467], device='cuda:0')]
23
[[list([2, 4, 5, 7, 8, 9, 10, 22])
  list([0, 2, 3, 4, 5, 8, 9, 10, 15, 16, 17, 18, 19, 20, 21, 22])]
 [list([3, 4, 5, 6, 7, 22])
  list([4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])]
 [list([1, 2, 4, 22])
  list([4, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])]
 [list([3, 4, 5, 6, 7, 14, 9, 10, 11, 12, 13, 15, 17, 18, 22])
  list([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22])]
 [list([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 22])
  list([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22])]
 [list([1, 4, 2, 3, 7, 8, 10, 22])
  list([2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])]
 [list([3, 4, 5, 6, 8, 9, 10, 11, 12, 22])
  list

In [0]:
criterion = nn.BCELoss()
print_every = 1
print_loss_total = 0.0
train_loss = 0.0
val_loss = 0.0

In [0]:
model_trainable_parameters = tuple(filter(lambda p: p.requires_grad, model.parameters()))
model_optimizer = optim.Adam(model_trainable_parameters, lr=learning_rate)

In [32]:
hidden = model.init_hidden(batch_size)
len(hidden[0][0][0])

50

In [0]:
from helper import Helper
help_fn = Helper()

In [34]:
start = time.time()
print('Beginning Model Training.\n')

for epoch in range(0, num_iters):
    for i in range(0, train_samples, batch_size):
        input_variables = x_train[i:i+batch_size]
        similarity_scores = y_train[i:i+batch_size]
        
        sequences_1 = [sequence[0] for sequence in input_variables]
        sequences_2 = [sequence[1] for sequence in input_variables]
        batch_size = len(sequences_1)

        temp = rnn.pad_sequence(sequences_1 + sequences_2)
        sequences_1 = temp[:, :batch_size]
        sequences_2 = temp[:, batch_size:]

        if model_optimizer: model_optimizer.zero_grad()
        loss = 0.0

        hidden = model.init_hidden(batch_size)
        output_scores = model([sequences_1, sequences_2], hidden).view(-1)

        loss += criterion(output_scores, similarity_scores)
        
        loss.backward()
        model_optimizer.step()
        
        train_loss = loss
        print_loss_total += loss
     
    
    if (epoch+1) % 5:
        learning_rate *= 0.5
        model_optimizer = optim.Adam(model_trainable_parameters, lr=learning_rate)
    
    a_scores = []
    p_scores = []
    corr = 0
    for i in range(0, test_samples, batch_size):
        input_variables = x_test[i:i+batch_size]
        actual_scores = y_test[i:i+batch_size]

        sequences_1 = [sequence[0] for sequence in input_variables]
        sequences_2 = [sequence[1] for sequence in input_variables]
        batch_size = len(sequences_1)

        temp = rnn.pad_sequence(sequences_1 + sequences_2)
        sequences_1 = temp[:, :batch_size]
        sequences_2 = temp[:, batch_size:]

        loss = 0.0
        

        hidden = model.init_hidden(batch_size)
        output_scores = model([sequences_1, sequences_2], hidden).view(-1)
        
        loss += criterion(output_scores, actual_scores)
        
        val_loss = loss
        
        
        for j in range(0, batch_size):
          acts = actual_scores[j].data.cpu().numpy()
          preds = output_scores[j].data.cpu().numpy()
          a_scores.append(acts)

          p = 0
          if preds >= 0.5:
            p = 1
          if acts == p:
            corr = corr + 1
          p_scores.append(p)
    
    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print('%s (%d) %.4f' % (help_fn.time_slice(start, (epoch+1) / num_iters), epoch, print_loss_avg))
        print("Training loss    " + str(train_loss.data.cpu().numpy()) + "    Test loss:    " + str(val_loss.data.cpu().numpy()))
        print(" Test Accuracy    " + str(corr/len(a_scores)) + "    f1 score    " + str(f1_score(p_scores, a_scores)))


Beginning Model Training.

0m 27s (- 4m 6s) (0) 217.0579
Training loss    0.58490187    Test loss:    0.66612184
 Test Accuracy    0.6330434782608696    f1 score    0.7260926006057984
0m 55s (- 3m 40s) (1) 269.1867
Training loss    0.7497622    Test loss:    0.8407385
 Test Accuracy    0.6701449275362319    f1 score    0.7684167684167683
1m 29s (- 3m 28s) (2) 768.5204
Training loss    0.6956213    Test loss:    0.55205494
 Test Accuracy    0.6828985507246377    f1 score    0.7651352511807642
2m 15s (- 3m 23s) (3) 2236.1389
Training loss    0.6183882    Test loss:    0.47931898
 Test Accuracy    0.6968115942028985    f1 score    0.7862689006947282
3m 1s (- 3m 1s) (4) 2165.9490
Training loss    0.6113146    Test loss:    0.46470895
 Test Accuracy    0.6985507246376812    f1 score    0.788617886178862
3m 48s (- 2m 32s) (5) 2140.8367
Training loss    0.61456966    Test loss:    0.4518956
 Test Accuracy    0.7026086956521739    f1 score    0.7918864097363083
4m 34s (- 1m 57s) (6) 2116.7263


In [0]:
# For test data

sum_diff = 0.0
a_scores = []
p_scores = []
corr = 0
for i in range(0, test_samples, batch_size):
    input_variables = x_test[i:i+batch_size]
    actual_scores = y_test[i:i+batch_size]
    
    sequences_1 = [sequence[0] for sequence in input_variables]
    sequences_2 = [sequence[1] for sequence in input_variables]
    batch_size = len(sequences_1)

    temp = rnn.pad_sequence(sequences_1 + sequences_2)
    sequences_1 = temp[:, :batch_size]
    sequences_2 = temp[:, batch_size:]

    if model_optimizer: model_optimizer.zero_grad()
    loss = 0.0

    hidden = model.init_hidden(batch_size)
    output_scores = model([sequences_1, sequences_2], hidden).view(-1)

    #loss += criterion(output_scores, similarity_scores)
    
    
    
    for j in range(0, batch_size):
        acts = actual_scores[j].data.cpu().numpy()
        preds = output_scores[j].data.cpu().numpy()
        a_scores.append(acts)
        
        p = 0
        if preds >= 0.5:
          p = 1
        print("Actual score:    " + str(acts) + "    Predicted score:    " + str(preds))
        
        if acts == p:
          corr = corr + 1
        
        p_scores.append(p)
        sum_diff+=abs(acts-preds)
        
    

print(len(a_scores))
print(f1_score(p_scores,a_scores))
print(corr/len(a_scores))
#print(spearmanr(p_scores,a_scores))
#print(sklearn.metrics.mean_squared_error(p_scores, a_scores))

Actual score:    1.0    Predicted score:    0.71425426
Actual score:    1.0    Predicted score:    0.61202294
Actual score:    1.0    Predicted score:    0.9509639
Actual score:    0.0    Predicted score:    0.52050894
Actual score:    0.0    Predicted score:    0.44496173
Actual score:    1.0    Predicted score:    0.6712246
Actual score:    0.0    Predicted score:    0.3904715
Actual score:    1.0    Predicted score:    0.7126737
Actual score:    1.0    Predicted score:    0.46509853
Actual score:    0.0    Predicted score:    0.057343215
Actual score:    1.0    Predicted score:    0.61287564
Actual score:    1.0    Predicted score:    0.6223024
Actual score:    1.0    Predicted score:    0.5585758
Actual score:    0.0    Predicted score:    0.5354292
Actual score:    1.0    Predicted score:    0.9013651
Actual score:    0.0    Predicted score:    0.50529057
Actual score:    1.0    Predicted score:    0.7450318
Actual score:    1.0    Predicted score:    0.76365274
Actual score:    1

In [0]:
print(learning_rate)

5.0331648000000016e-05
