In [2]:
from Preprocess import get_words_and_embeddings
from Preprocess import questionID_to_questionData
from Preprocess import training_id_to_similar_different
from Preprocess import devTest_id_to_similar_different
from get_q_matrices_functions import get_question_matrix
from get_q_matrices_functions import padded_q_matrix
import numpy as np
import torch
from torch.autograd import Variable
from torch.nn.modules.distance import CosineSimilarity
import scipy.stats

In [3]:
# load/pre-process data
word2vec = get_words_and_embeddings()
id2Data = questionID_to_questionData(py35 = True)
training_data = training_id_to_similar_different(py35 = True)
dev_data = devTest_id_to_similar_different(dev=True)
test_data = devTest_id_to_similar_different(dev=False)

In [4]:
# LSTM parameters
input_size = len(word2vec[list(word2vec.keys())[0]])
hidden_size = 100
num_layers = 1
bias = True
batch_first = True
dropout = 0
bidirectional = False

# LSTM model
lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional)

# initial hidden layer and cell state
h0 = Variable(torch.zeros(1, 1, 100))
c0 = Variable(torch.zeros(1, 1, 100))

# loss function
loss_function = torch.nn.MarginRankingLoss(margin=0.2, size_average=False)

# cosine similarity distance metric
cosSim = CosineSimilarity()

# number of negative examples to sample for each question
sample_size = 20

# number of questions before calling .backward()
batch_size = 100

# adam optimizer
weight_decay = 0.001
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.01, weight_decay=0.001)

# epoch count and number of iterations through train data
epoch_number = 1
num_epochs = 1

In [None]:
for epoch in range(num_epochs):
    for batch in range(1, round(len(training_data.keys())/batch_size) + 2):
        # get questions for current batch
        try:
            q_batch = list(training_data.keys())[:batch_size*batch]
        except IndexError:
            q_batch = list(training_data.keys())[batch_size*(batch-1):]
        
        # zeroing gradient and batch_loss
        optimizer.zero_grad()
        batch_loss = Variable(torch.zeros(1).float())
        count = 0
        for q in q_batch:
            # for tracking
            print(count)
            count += 1
            
            # get question matrix
            q_matrix_3d = Variable(torch.from_numpy(get_question_matrix(q, word2vec, id2Data)))

            # get positive question matrix
            pos_q_matrix_3d = Variable(torch.from_numpy(get_question_matrix(training_data[q][0][0], word2vec, id2Data)))

            # get negative question IDs and convert questions to matrices
            neg_qs = np.random.choice(training_data[q][1], sample_size, replace = False)
            neg_qs_matrix_list = []
            neg_qs_seq_length = []
            for p in neg_qs:
                neg_q_matrix_3d = get_question_matrix(p, word2vec, id2Data)
                neg_qs_matrix_list.append(neg_q_matrix_3d)
                neg_qs_seq_length.append(neg_q_matrix_3d.shape[1])

            # do padding for negative questions
            neg_qs_padded = padded_q_matrix(neg_qs_seq_length, neg_qs_matrix_list, input_size)

            # initial hidden layer and cell state
            h0 = Variable(torch.zeros(1, 1, 100))
            c0 = Variable(torch.zeros(1, 1, 100))

            # get hidden layers for question and average them
            q_hidden = lstm(q_matrix_3d, (h0, c0))[0]
            avg_h_q = torch.sum(q_hidden, dim = 1)/q_matrix_3d.size()[1]

            # get hidden layers for positive question and average them
            pos_q_hidden = lstm(pos_q_matrix_3d, (h0, c0))[0]
            avg_h_pos_q = torch.sum(pos_q_hidden, dim = 1)/pos_q_matrix_3d.size()[1]

            # get cosine similarity between question and positive question; repeat for tensor length 20 for loss function input
            score_pos_q = cosSim.forward(avg_h_q, avg_h_pos_q)
            score_pos_q_vec = Variable(torch.ones(len(neg_qs_seq_length))).float() * score_pos_q

            # get hidden layers for all negative questions and sum then for each sequence
            neg_q_hidden = torch.nn.utils.rnn.pad_packed_sequence(lstm(neg_qs_padded, (h0, c0))[0], batch_first=True)
            sum_h_neg_q = torch.sum(neg_q_hidden[0], dim = 1)

            # average hidden layers for each negative question and get cosine similarities between negative questions and positive question
            score_neg_qs = Variable(torch.zeros(sum_h_neg_q.size()[0]).float())
            for i in range(sum_h_neg_q.size()[0]):
                avg_h_neg_q_i = (sum_h_neg_q[i]/neg_q_hidden[1][i]).unsqueeze(0)
                score_neg_q_i = cosSim.forward(avg_h_q, avg_h_neg_q_i)
                score_neg_qs[i] = score_neg_q_i

            # add loss for question to batch loss
            batch_loss += loss_function.forward(score_pos_q_vec, score_neg_qs, -1.0 * Variable(torch.ones(score_neg_qs.size()[0])))

        # take gradient wrt batch_loss
        batch_loss.backward()
        
        print(batch_loss.data[0])

        # update weights
        optimizer.step()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
404.3222961425781
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199


In [19]:
for epoch in range(num_epochs):
    for q in list(training_data.keys()):
        # zeroing gradient
        optimizer.zero_grad()
        
        # get question matrix
        q_matrix_3d = Variable(torch.from_numpy(get_question_matrix(q, word2vec, id2Data)))

        # get positive question matrix
        pos_q_matrix_3d = Variable(torch.from_numpy(get_question_matrix(training_data[q][0][0], word2vec, id2Data)))

        # get negative question IDs and convert questions to matrices
        neg_qs = np.random.choice(training_data[q][1], sample_size, replace = False)
        neg_qs_matrix_list = []
        neg_qs_seq_length = []
        for p in neg_qs:
            neg_q_matrix_3d = get_question_matrix(p, word2vec, id2Data)
            neg_qs_matrix_list.append(neg_q_matrix_3d)
            neg_qs_seq_length.append(neg_q_matrix_3d.shape[1])

        # do padding for negative questions
        neg_qs_padded = padded_q_matrix(neg_qs_seq_length, neg_qs_matrix_list, input_size)

        # get hidden layers for question and average them
        q_hidden = lstm(q_matrix_3d, (h0, c0))[0]
        avg_h_q = torch.sum(q_hidden, dim = 1)/q_matrix_3d.size()[1]

        # get hidden layers for positive question and average them
        pos_q_hidden = lstm(pos_q_matrix_3d, (h0, c0))[0]
        avg_h_pos_q = torch.sum(pos_q_hidden, dim = 1)/pos_q_matrix_3d.size()[1]

        # get cosine similarity between question and positive question; repeat for tensor length 20 for loss function input
        score_pos_q = cosSim.forward(avg_h_q, avg_h_pos_q)
        score_pos_q_vec = Variable(torch.ones(len(neg_qs_seq_length))).float() * score_pos_q

        # get hidden layers for all negative questions and sum then for each sequence
        neg_q_hidden = torch.nn.utils.rnn.pad_packed_sequence(lstm(neg_qs_padded, (h0, c0))[0], batch_first=True)
        sum_h_neg_q = torch.sum(neg_q_hidden[0], dim = 1)

        # average hidden layers for each negative question and get cosine similarities between negative questions and positive question
        score_neg_qs = Variable(torch.zeros(sum_h_neg_q.size()[0]).float())
        for i in range(sum_h_neg_q.size()[0]):
            avg_h_neg_q_i = (sum_h_neg_q[i]/neg_q_hidden[1][i]).unsqueeze(0)
            score_neg_q_i = cosSim.forward(avg_h_q, avg_h_neg_q_i)
            score_neg_qs[i] = score_neg_q_i

        # add loss for question to batch loss
        batch_loss = loss_function.forward(score_pos_q_vec, score_neg_qs, Variable(torch.ones(score_neg_qs.size()[0])))

        # take gradient wrt batch_loss
        batch_loss.backward()
        
        print(batch_loss.data[0])

        # update weights
        optimizer.step()

Variable containing:
 0.4999
[torch.FloatTensor of size 1]

Variable containing:
 0.5000
[torch.FloatTensor of size 1]

Variable containing:
 0.4999
[torch.FloatTensor of size 1]

Variable containing:
 0.5008
[torch.FloatTensor of size 1]

Variable containing:
 0.5000
[torch.FloatTensor of size 1]

Variable containing:
 0.4999
[torch.FloatTensor of size 1]

Variable containing:
 0.5001
[torch.FloatTensor of size 1]

Variable containing:
 0.4998
[torch.FloatTensor of size 1]

Variable containing:
 0.5000
[torch.FloatTensor of size 1]

Variable containing:
 0.5000
[torch.FloatTensor of size 1]

Variable containing:
 0.4999
[torch.FloatTensor of size 1]

Variable containing:
 0.4999
[torch.FloatTensor of size 1]

Variable containing:
 0.5000
[torch.FloatTensor of size 1]

Variable containing:
 0.5004
[torch.FloatTensor of size 1]

Variable containing:
 0.4999
[torch.FloatTensor of size 1]

Variable containing:
 0.5000
[torch.FloatTensor of size 1]

Variable containing:
 0.4999
[torch.Floa

KeyboardInterrupt: 