-
Notifications
You must be signed in to change notification settings - Fork 0
/
neural_net_helpers.py
299 lines (271 loc) · 8.61 KB
/
neural_net_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
from __future__ import print_function
import numpy as np
import pickle
from keras.preprocessing import sequence
def addUnseenWords(wordsToVecs, wordVecSize, X):
    """Extend *wordsToVecs* in place with vectors for unseen words.

    Every whitespace-separated token appearing in the sentences of X that
    has no entry in wordsToVecs gets a small random vector of length
    wordVecSize (scaled-down standard normal draws).
    """
    for article in X:
        for token in article.split():
            # Already have an embedding for this token — nothing to do.
            if token in wordsToVecs:
                continue
            # Small random init so unseen words start near the origin.
            wordsToVecs[token] = 0.01 * np.random.randn(wordVecSize)
def createEmbeddingMatrix(wordsToVecs, wordsToIndices):
    """Build an embedding matrix from word->vector and word->index maps.

    Row i of the returned matrix holds the vector of the word whose index
    is i.  Row 0 is left as zeros: it is reserved for the padding token
    (see buildDictionary, which starts indices at 1).

    Raises:
        ValueError: if wordsToVecs is empty (vector size is unknowable).
    """
    print("Creating Embedding Matrix...")
    if not wordsToVecs:
        raise ValueError("wordsToVecs is empty; cannot infer vector size")
    # FIX: `wordsToVecs.values()[0]` only works on Python 2 — dict_values
    # is not subscriptable on Python 3.  next(iter(...)) works on both.
    vecSize = len(next(iter(wordsToVecs.values())))
    # +1 row for the reserved padding index 0.
    embeddingMatrix = np.zeros((len(wordsToVecs) + 1, vecSize))
    # Place every word's vector at its assigned row.
    for word, index in wordsToIndices.items():
        embeddingMatrix[index, :] = wordsToVecs[word]
    return embeddingMatrix
def loadGloveVectors(nDim=100):
    """Load pre-trained GloVe embeddings from `glove.6B.<nDim>d.txt`.

    Valid nDim values for the 6B corpus are 50, 100, 200 and 300.
    Returns a dict mapping each word to a float numpy vector of length nDim.
    """
    print('Loading Glove Vector Embeddings...')
    wordsToVecs = {}
    filename = 'glove.6B.' + str(nDim) + 'd.txt'
    # FIX: use a context manager so the file is always closed, and read as
    # UTF-8 explicitly — the GloVe distribution files are UTF-8 encoded and
    # the platform default codec may choke on them.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            # First token is the word, the rest are vector components.
            word = split_line[0]
            vector = np.array([float(x) for x in split_line[1:]])
            wordsToVecs[word] = vector
    return wordsToVecs
def buildDictionary(wordsToVecs, saveToFile=True):
    """Assign a unique positive index to every word in the corpus.

    Indices start at 1: index 0 is reserved for the padding token that
    pad_sequences inserts.  When saveToFile is True the mapping is also
    pickled to 'wordsToInd.p'.
    Returns the word -> index dict.
    """
    print("Mapping corpus words to indices...")
    wordsToIndices = {word: index + 1
                      for index, word in enumerate(wordsToVecs)}
    if saveToFile:
        # FIX: context manager guarantees the file is closed even if
        # pickle.dump raises part-way through.
        with open('wordsToInd.p', 'wb') as f:
            pickle.dump(wordsToIndices, f)
    return wordsToIndices
def convertToIndexSequence(X, wordsToIndices):
    """Convert each article (string) in X to a list of word indices.

    Words missing from wordsToIndices are silently dropped.
    Returns a list of lists of indices, one inner list per article.
    """
    # FIX: the original enumerated X but never used the index, and built
    # each sequence with a manual append loop; a comprehension says the
    # same thing directly.
    return [
        [wordsToIndices[word] for word in article.split()
         if word in wordsToIndices]
        for article in X
    ]
def breakApartInputs(X, Y, subsetSize=7):
    """Split every index sequence in X into chunks of at most subsetSize.

    Each chunk inherits the label of the article it came from, so the
    returned (X_result, Y_result) lists stay aligned.  The final chunk of
    an article may be shorter than subsetSize.
    """
    X_result, Y_result = [], []
    for article, label in zip(X, Y):
        # Slice the article into consecutive windows of subsetSize.
        pieces = [article[start:start + subsetSize]
                  for start in range(0, len(article), subsetSize)]
        X_result.extend(pieces)
        # One copy of the label per chunk produced.
        Y_result.extend([label] * len(pieces))
    return X_result, Y_result
def augmentData(X, Y):
    """Double the dataset by appending the reverse of every sequence.

    X gains a reversed copy of each of its sequences, and Y is extended
    with a copy of itself so labels stay aligned.  Both lists are mutated
    IN PLACE (callers rely on `+=`) and are also returned for convenience.
    """
    print('Augmenting Data...')
    # FIX: the original loop variable was named `sequence`, shadowing the
    # keras.preprocessing `sequence` module imported at the top of this
    # file — renamed to avoid confusion.  The comprehension is evaluated
    # fully before += so we never iterate a list we are growing.
    X += [seq[::-1] for seq in X]
    Y += Y
    return X, Y
def ensembleAccuracy(model, blockLength, X, Y):
    """Compute the ensemble model's accuracy over (X, Y).

    Each article (a sequence of word indices) is split into chunks of
    blockLength, the per-chunk class probabilities are summed, and the
    argmax of that sum is the predicted class.
    Returns the fraction of articles predicted correctly (0.0 for empty Y).
    """
    if not Y:
        # Avoid ZeroDivisionError on empty input.
        return 0.0
    correct = 0
    for artSequence, label in zip(X, Y):
        # Break the article into blockLength-sized chunks.
        chunks = [artSequence[i:i + blockLength]
                  for i in range(0, len(artSequence), blockLength)]
        # FIX: pass maxlen=blockLength, matching ensembleProbs.  Without it,
        # an article shorter than blockLength was only padded to its own
        # length, which a model built for fixed-length input cannot accept.
        chunks = sequence.pad_sequences(chunks, padding='pre',
                                        maxlen=blockLength)
        # NOTE(review): predict_proba is deprecated in newer Keras —
        # model.predict is the modern equivalent; left as-is to match the
        # rest of this file.
        probs = model.predict_proba(chunks, verbose=0)
        # Sum the probability vectors across chunks and take the argmax.
        predClass = np.argmax(np.sum(probs, axis=0))
        if predClass == label:
            correct += 1
    return correct / float(len(Y))
def ensembleProbs(model, blockLength, X):
    """Return normalized per-class probabilities for each article in X.

    Each article is split into blockLength chunks, the chunks are padded
    to blockLength, the model's per-chunk probability vectors are summed,
    and the sum is normalized to 1 before being appended to the result.
    """
    allProbs = []
    for artSequence in X:
        # Carve the index sequence into fixed-size windows.
        pieces = [artSequence[start:start + blockLength]
                  for start in range(0, len(artSequence), blockLength)]
        padded = sequence.pad_sequences(pieces, padding='pre',
                                        maxlen=blockLength)
        # Sum the chunk-level probability vectors into one vector.
        summed = np.sum(model.predict_proba(padded, verbose=0), axis=0)
        # Normalize so the entries form a probability distribution.
        allProbs.append(summed / np.sum(summed))
    return allProbs
def convertToOneHot(Y, num_words):
    """One-hot encode a 2D array of word indices.

    Accepts either a 2D ndarray or a list of equal-length lists of indices
    (the docstring previously promised lists, but the code required an
    ndarray — np.asarray now handles both).  Returns a float array of shape
    (num_sentences, words_per_sentence, num_words) with a 1 at each
    [sentence, position, index] triple.
    """
    Y = np.asarray(Y)
    result = np.zeros((Y.shape[0], Y.shape[1], num_words))
    # Vectorized replacement for the original double Python loop: the two
    # broadcast arange arrays address every (sentence, position) pair and
    # Y supplies the class axis for each.
    rows = np.arange(Y.shape[0])[:, None]
    cols = np.arange(Y.shape[1])
    result[rows, cols, Y] = 1
    return result
def convertYsToIndexSequence(Y):
    """Map each target string in Y to a sequence of word indices.

    Indices are handed out in order of first appearance, starting at 1;
    index 0 is reserved for the padding token.  Returns a 4-tuple:
    (index sequences, word->index map, index->word map, vocabulary size).
    """
    final_result = []
    words_to_inds = {}
    inds_to_words = {0: 'PADDING_TOKEN'}
    next_index = 1
    for target in Y:
        seq = []
        for token in target.split():
            # First sighting: register the token in both maps.
            if token not in words_to_inds:
                words_to_inds[token] = next_index
                inds_to_words[next_index] = token
                next_index += 1
            seq.append(words_to_inds[token])
        final_result.append(seq)
    # next_index - 1 == number of distinct words seen.
    return final_result, words_to_inds, inds_to_words, next_index - 1
def convert_to_word_list(indices, inds_to_words):
    """Reconstruct a sentence string from each index sequence.

    Padding tokens (index 0) are skipped.  Returns a list with one
    space-joined sentence per input sequence.
    """
    # FIX two defects in the original:
    #  1. It looked up inds_to_words[index] BEFORE the padding check (the
    #     value was unused, and it raised KeyError when 0 was absent from
    #     the map).
    #  2. Its "prepend a space unless i == 0" rule produced a leading space
    #     whenever a sequence started with padding tokens — the common case
    #     with padding='pre'.  str.join sidesteps both.
    return [
        ' '.join(inds_to_words[index] for index in ind_list if index != 0)
        for ind_list in indices
    ]