##########################################
# Machine Learning Functions and Classes #
##########################################
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def plot_history(history, title, save=False, fname=None):
    """
    Pass in a history created by training a keras model,
    and a simple string to represent the title of the figure
    you'd like to display. Note that this function displays an
    accuracy graph and is intended for use with classification models.
    """
    # Clear former plots
    plt.clf()
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    plt.figure()
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy [Backlog Assignment] (%)')
    plt.plot(hist['epoch'], hist['accuracy'] * 100,
             label='Train Accuracy')
    plt.plot(hist['epoch'], hist['val_accuracy'] * 100,
             label='Validation Accuracy')
    plt.ylim([0, 100])
    plt.legend()
    if save:
        if fname is None:
            fname = './history.png'
        plt.savefig(fname)
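
# Example usage of plot_history (a minimal sketch with synthetic data; the toy
# model, `X`, and `y` below are illustrative assumptions, not part of this module):
#
#     X = np.random.rand(200, 20)
#     y = np.random.randint(0, 2, 200)
#     toy = keras.Sequential([keras.layers.Dense(2, activation='softmax')])
#     toy.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
#                 metrics=['accuracy'])
#     history = toy.fit(X, y, validation_split=0.2, epochs=10, verbose=0)
#     plot_history(history, 'Toy model', save=True, fname='./toy-history.png')
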
def clear_memory(model=None):
    """
    This simple clear-memory function removes all existing keras models from
    system memory (RAM) and/or GPU memory (VRAM) to prevent memory leaks and
    to ensure you can load another model without error. Run clear_memory every
    time you want to discard a currently loaded model and build a new one.
    If you are running a GPU instance, use the command `nvidia-smi` from the
    bash terminal to determine the amount of occupied VRAM. If you see high
    memory usage, try using this function to clear the memory. If the problem
    persists, a memory leak is the likely culprit, so you will need to kill all
    existing Python/Jupyter kernels. If all else fails, a system restart will
    always succeed in clearing the VRAM.
    """
    K.clear_session()
    if model:
        # Note: this only deletes the local reference; callers should also drop
        # their own reference so the model can be garbage-collected.
        del model
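
# Example usage of clear_memory (sketch; `build_model` is a hypothetical factory
# function for whatever architecture you are experimenting with):
#
#     clear_memory(model)   # free the old graph/session before rebuilding
#     del model             # also drop the caller's own reference
#     model = build_model()
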
def test_with_uncertainty(model, test_features: pd.DataFrame, test_labels: pd.DataFrame, label_to_id: dict, n_iter=10) -> pd.DataFrame:
    """
    USE FOR TESTING
    ---------------
    Parameters:
    - model: a keras model trained with Monte Carlo Dropout
    - test_features: one or more samples (1 sample = 1 vector of
      test_features).
    - test_labels: the ground-truth labels corresponding to the samples
      (one-hot encoded).
    - label_to_id: a dictionary that maps prediction indices (indices of
      the softmax outputs of the model) to label names.
    - n_iter: the number of stochastic forward passes the model performs for
      each test sample. For n_iter=10, the function applies 10 distinct dropout
      masks and generates 10 different softmax outputs from the model. The
      final or 'master' prediction is the index with the highest mean softmax
      probability (i.e. final prediction = average prediction over all 10
      passes), and the uncertainty is the standard deviation of the 10 softmax
      outputs at that index (a more variable softmax output = more uncertainty).
    Returns: preds_df, a dataframe of labels predicted by your model,
    ground-truth labels, and prediction uncertainty values.
    """
    # Define a test-time function that keeps dropout active (learning_phase=1).
    # Note: K.function/K.learning_phase require eager execution to be disabled
    # (tf.compat.v1.disable_eager_execution()).
    f = K.function([model.layers[0].input, K.learning_phase()], [model.layers[-1].output])
    num_classes = len(label_to_id)
    result = np.zeros((n_iter,) + (test_features.shape[0], num_classes))
    # Generate n_iter different softmax outputs by sampling a new dropout mask each pass
    for i in range(n_iter):
        result[i, :, :] = f((test_features, 1))[0]
    predictions = result.mean(axis=0)    # "ultimate prediction"
    uncertainties = result.std(axis=0)   # STD as a proxy for uncertainty
    rows = []
    for i in range(len(predictions)):
        one_hot_label = test_labels.iloc[i]             # Ground-truth one-hot-encoded label for this sample
        predicted_index = np.argmax(predictions[i])     # Predicted index = highest mean softmax probability
        predicted_value = label_to_id[predicted_index]  # Convert predicted index to label name
        actual_value = one_hot_label.idxmax()
        uncertainty = uncertainties[i][predicted_index]
        rows.append({'Prediction': predicted_value, 'Label': actual_value, 'Uncertainty': uncertainty})
    # Build the dataframe in one pass (DataFrame.append was removed in pandas 2.0)
    preds_df = pd.DataFrame(rows, columns=['Prediction', 'Label', 'Uncertainty'])
    return preds_df
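
# Example usage of test_with_uncertainty (sketch; `mc_model`, `X_test`, and the
# one-hot `y_test` DataFrame are assumed to come from your own training pipeline,
# and the label names in `label_to_id` are placeholders):
#
#     tf.compat.v1.disable_eager_execution()   # needed for K.function/K.learning_phase
#     label_to_id = {0: 'team-a', 1: 'team-b', 2: 'team-c'}
#     preds_df = test_with_uncertainty(mc_model, X_test, y_test, label_to_id, n_iter=25)
#     print(preds_df.head())
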
def predict_with_uncertainty(model, test_features: pd.DataFrame, label_to_id: dict, n_iter=10) -> tuple:
    """
    USE FOR DEPLOYMENT
    ------------------
    Whereas test_with_uncertainty is used during the testing phase (measuring
    Monte Carlo accuracies), this function should be used for deployment. Note
    that no ground-truth labels are passed into the function.
    Parameters:
    - model: a keras model trained with Monte Carlo Dropout
    - test_features: a SINGLE sample (1 sample = 1 vector of test_features).
    - label_to_id: a dictionary that maps prediction indices (indices of
      the softmax outputs of the model) to label names.
    - n_iter: the number of stochastic forward passes the model performs for
      the sample. For n_iter=10, the function applies 10 distinct dropout masks
      and generates 10 different softmax outputs from the model. The final or
      'master' prediction is the index with the highest mean softmax
      probability (i.e. final prediction = average prediction over all 10
      passes), and the uncertainty is the standard deviation of the 10 softmax
      outputs at that index (a more variable softmax output = more uncertainty).
    Returns: a tuple of the predicted label and an uncertainty value,
    i.e. (predicted label, uncertainty).
    """
    # Example of loading a saved model (eager execution must be disabled before
    # K.function/K.learning_phase can be used):
    # from tensorflow.keras.models import load_model
    # tf.compat.v1.disable_eager_execution()
    # model = load_model("../saved_models/tfidf-model/tfidf-classifier.h5")

    # Define a test-time function that keeps dropout active (learning_phase=1)
    f = K.function([model.layers[0].input, K.learning_phase()], [model.layers[-1].output])
    num_classes = len(label_to_id)
    test_features = test_features.values.reshape(1, -1)  # Reshape the single sample to a (1, n_features) matrix
    result = np.zeros((n_iter,) + (test_features.shape[0], num_classes))  # Allocate after reshaping so axis 1 has length 1
    # Generate n_iter different softmax outputs by sampling a new dropout mask each pass
    for i in range(n_iter):
        result[i, :, :] = f((test_features, 1))[0]
    predictions = result.mean(axis=0)    # "ultimate prediction"
    uncertainties = result.std(axis=0)   # STD as a proxy for uncertainty
    predicted_index = np.argmax(predictions[0])
    predicted_value = label_to_id[predicted_index]  # Convert predicted index to label name
    uncertainty = uncertainties[0][predicted_index]
    return predicted_value, uncertainty
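
# Example usage of predict_with_uncertainty (sketch; `mc_model`, `X_test`, and
# `label_to_id` are assumed to exist as in the testing example above):
#
#     sample = X_test.iloc[0]   # a single pandas Series of features
#     label, unc = predict_with_uncertainty(mc_model, sample, label_to_id, n_iter=25)
#     print('Predicted', label, 'with uncertainty', round(unc, 4))
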
def get_monte_carlo_accuracy(preds_df: pd.DataFrame, threshold=None) -> tuple:
    """
    Takes as input a preds_df (DataFrame) generated by the test_with_uncertainty
    function. This function assesses the model's accuracy only on test examples
    below `threshold` uncertainty.
    Returns: the percentage of the original dataset retained once the threshold
    is applied, and the accuracy on that retained subset.
    """
    orig_length = len(preds_df)
    retained = 100  # If no data is lost, we've retained 100% of the testing data
    # Evaluate only confident predictions
    if threshold:
        preds_df = preds_df[preds_df['Uncertainty'] < threshold]
        # Avoid a divide-by-zero error when every prediction is discarded
        if len(preds_df) == 0:
            return 0, 0
        retained = len(preds_df) / orig_length * 100
    good_preds = preds_df[preds_df['Prediction'] == preds_df['Label']]
    accuracy = len(good_preds) / len(preds_df) * 100
    return retained, accuracy
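
# Example usage of get_monte_carlo_accuracy (sketch; the 0.15 threshold is an
# arbitrary illustrative value):
#
#     retained, accuracy = get_monte_carlo_accuracy(preds_df, threshold=0.15)
#     print('Accuracy of %.1f%% on %.1f%% of the test set' % (accuracy, retained))
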
def graph_confidence(preds_df, save=False, fname=None):
    """
    Takes as input a preds_df (DataFrame) generated by the test_with_uncertainty
    function. This function plugs a large range of threshold values into the
    `get_monte_carlo_accuracy` function in order to generate and plot a graph of
    the trade-off between accuracy and efficiency.
    Returns the accuracy of the model when it discards about 50% of the test
    data (or None if retention never drops below 50%). This information can be
    useful for evaluating how 'good' a given model is at confidence bounding.
    """
    # Clear former plots
    plt.clf()
    # Logspace generates a better distribution of threshold values: uncertainty
    # values are based on STD, which is distributed more logarithmically than
    # linearly, so this spacing produces a much better graph.
    uncertainties = np.logspace(0, 10, 10000, base=0.5).tolist()
    accuracies = []
    proportions = []
    for uncertainty in uncertainties:
        retained, accuracy = get_monte_carlo_accuracy(preds_df, uncertainty)
        proportions.append(retained)
        accuracies.append(accuracy)
    proportions[0] = 100  # Pin the first point at 100% of the test data retained (the largest threshold keeps everything)
    plt.scatter(proportions, accuracies, s=5, c='c')
    plt.ylabel("Test Accuracy (%)")
    plt.xlabel("Percentage of Testing Data Retained")
    plt.title("Monte Carlo Accuracies")
    # Save for later use:
    if save:
        if fname is None:
            fname = '../figures/monte-carlo-accuracies.png'
        plt.savefig(fname)
    # Return the accuracy at roughly 50% of the test data:
    for i, proportion in enumerate(proportions):
        # Return as soon as we dip below 50% retained
        if proportion < 50:
            return accuracies[i]
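
# Example usage of graph_confidence (sketch; assumes `preds_df` was produced by
# test_with_uncertainty):
#
#     acc_at_half = graph_confidence(preds_df, save=True)
#     print('Accuracy when roughly 50% of the test data is retained:', acc_at_half)
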
def acc_to_uncertainty(preds_df, target_acc):
    """
    Finds the uncertainty value that most closely approximates the desired
    accuracy ('target_acc'), i.e. returns the uncertainty value that throws
    away the minimum amount of test data while still producing the desired
    accuracy.
    ** Pass in 'target_acc' as a value out of 100 (a true percent)!
    """
    uncertainties = np.logspace(0, 10, 10000, base=0.5).tolist()
    for uncertainty in uncertainties:
        retained, accuracy = get_monte_carlo_accuracy(preds_df, uncertainty)
        # Return as soon as we reach the desired accuracy
        if accuracy > target_acc:
            print('% of test data retained:', retained)
            print('Exact accuracy:', accuracy)
            return uncertainty

def proportion_to_uncertainty(preds_df, target_prop):
    """
    Finds the maximum uncertainty value that results in the target proportion
    ('target_prop') of the test data being retained.
    Ex: target_prop = 70
    Returns: the uncertainty value that causes the model to discard approx. 30%
    of the data.
    ** Pass in 'target_prop' as a value out of 100 (a true percent)!
    """
    uncertainties = np.logspace(0, 10, 10000, base=0.5).tolist()
    for uncertainty in uncertainties:
        retained, accuracy = get_monte_carlo_accuracy(preds_df, uncertainty)
        # Return as soon as we drop just below the target proportion
        if retained < target_prop:
            print('% of test data retained:', retained)
            print('Exact accuracy:', accuracy)
            return uncertainty
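
# Example usage of the two threshold-finding helpers (sketch; the 95% accuracy
# and 70% retention targets are arbitrary illustrative values):
#
#     threshold_for_acc = acc_to_uncertainty(preds_df, target_acc=95)
#     threshold_for_prop = proportion_to_uncertainty(preds_df, target_prop=70)
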
# Display training progress by printing a single dot for each completed epoch
class PrintDot(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 100 == 0:
            print('')
        print('.', end='')
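
# Example usage of PrintDot (sketch; `model`, `X_train`, and `y_train` are
# assumed to exist from your own training pipeline):
#
#     model.fit(X_train, y_train, epochs=500, verbose=0, callbacks=[PrintDot()])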