"""
Classes representing cost functions.
Currently, these are primarily used to specify the objective function for
the SGD and BGD training algorithms.
"""
import functools
import logging
import warnings
from theano.compat.six.moves import reduce
import theano.tensor as T
from theano.compat.six.moves import zip as izip
from pylearn2.compat import OrderedDict
from pylearn2.utils import safe_zip
from pylearn2.utils import safe_union
from pylearn2.space import CompositeSpace, NullSpace
from pylearn2.utils.data_specs import DataSpecsMapping
from pylearn2.utils.exc import reraise_as
logger = logging.getLogger(__name__)
class Cost(object):
"""
Represents an objective function to be minimized by some
`TrainingAlgorithm`.
Notes
-----
While cost functions may be represented simply as theano graphs,
this class allows us to add extra functionality. The
`get_gradients` method allows us to use a method other
than `theano.tensor.grad` to compute the gradient of the
cost function. This enables using approximate gradients
of cost functions that are not differentiable or whose
true gradient is computationally intractable. Additionally,
the `get_monitoring_channels` method allows monitoring of
quantities that are useful when training with a given
objective function (such as termination criteria that are
usually used with the function).
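Examples
--------
A minimal sketch of a subclass (the class name and the model's
`reconstruct` method are illustrative assumptions, not part of this
module):
>>> class SquaredReconstructionError(DefaultDataSpecsMixin, Cost):
...     supervised = False
...     def expr(self, model, data, **kwargs):
...         # Check that `data` matches the requested data_specs.
...         self.get_data_specs(model)[0].validate(data)
...         X = data
...         return ((X - model.reconstruct(X)) ** 2).sum(axis=1).mean()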
"""
# TODO: remove this when it is no longer necessary, it should be
# mostly phased out due to the new data_specs interface.
# If True, the data argument to expr and get_gradients must be a
# (X, Y) pair, and Y cannot be None.
supervised = False
def expr(self, model, data, ** kwargs):
"""
Returns a theano expression for the cost function.
Returns a symbolic expression for a cost function applied to the
minibatch of data.
Optionally, may return None. This indicates that the cost function
is intractable but may still be optimized via the `get_gradients`
method.
Parameters
----------
model : a pylearn2 Model instance
data : a batch in cost.get_data_specs() form
kwargs : dict
Optional extra arguments. Not used by the base class.
"""
# Fall back to cost_per_example implementation if possible
try:
per_example = self.cost_per_example(model, data, **kwargs)
except NotImplementedError:
raise NotImplementedError(str(type(self)) + " does not implement "
"expr.")
# Handle explicitly undefined costs
if per_example is None:
return None
assert per_example.ndim == 1
return per_example.mean()
def cost_per_example(self, model, data, ** kwargs):
"""
Returns a theano expression for the cost per example.
This method is optional. Most training algorithms will work without
it.
Parameters
----------
model : Model
data : a batch in cost.get_data_specs() form
kwargs : dict
Optional extra arguments to be used by 3rd party
TrainingAlgorithm classes and/or FixedVarDescr.
Returns
-------
cost_per_example : 1-D Theano tensor
Each element of this vector gives the cost for one example.
The overall cost is the mean of this vector.
"""
raise NotImplementedError(str(type(self)) + "does not implement "
"cost_per_example.")
def get_gradients(self, model, data, ** kwargs):
"""
Provides the gradients of the cost function with respect to the model
parameters.
These are not necessarily the gradients obtained by
`theano.tensor.grad`; you may wish to use approximate or even
intentionally incorrect gradients in some cases.
Parameters
----------
model : a pylearn2 Model instance
data : a batch in cost.get_data_specs() form
kwargs : dict
Optional extra arguments, not used by the base class.
Returns
-------
gradients : OrderedDict
a dictionary mapping from the model's parameters
to their gradients
The default implementation is to compute the gradients
using T.grad applied to the value returned by expr.
However, subclasses may return other values for the gradient.
For example, an intractable cost may return a sampling-based
approximation to its gradient.
updates : OrderedDict
a dictionary mapping shared variables to updates that must
be applied to them each time these gradients are computed.
This is to facilitate computation of sampling-based approximate
gradients.
The parameters should never appear in the updates dictionary.
This would imply that computing their gradient changes
their value, thus making the gradient value outdated.
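Examples
--------
A sketch of an override that rescales the default gradients; real
overrides typically substitute a sampling-based approximation here
(the 0.5 factor is purely illustrative):
>>> def get_gradients(self, model, data, **kwargs):
...     grads, updates = Cost.get_gradients(self, model, data, **kwargs)
...     grads = OrderedDict((p, 0.5 * g) for p, g in grads.items())
...     return grads, updates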
"""
try:
cost = self.expr(model=model, data=data, **kwargs)
except TypeError:
# If anybody knows how to add type(self) to the exception message
# but still preserve the stack trace, please do so
# The current code does neither
message = "Error while calling " + str(type(self)) + ".expr"
reraise_as(TypeError(message))
if cost is None:
raise NotImplementedError(str(type(self)) +
" represents an intractable cost and "
"does not provide a gradient "
"approximation scheme.")
params = list(model.get_params())
grads = T.grad(cost, params, disconnected_inputs='ignore')
gradients = OrderedDict(izip(params, grads))
updates = OrderedDict()
return gradients, updates
def get_monitoring_channels(self, model, data, **kwargs):
"""
.. todo::
WRITEME
.. todo::
how do you do prereqs in this setup? (I think PL changed
it, not sure if there still is a way in this context)
Returns a dictionary mapping channel names to expressions for
channel values.
Parameters
----------
model : Model
the model to use to compute the monitoring channels
data : batch
(a member of self.get_data_specs()[0])
symbolic expressions for the monitoring data
kwargs : dict
used so that custom algorithms can use extra variables
for monitoring.
Returns
-------
rval : dict
Maps channel names to expressions for channel values.
"""
self.get_data_specs(model)[0].validate(data)
return OrderedDict()
def get_fixed_var_descr(self, model, data):
"""
Subclasses should override this if they need variables held
constant across multiple updates to a minibatch.
TrainingAlgorithms that do multiple updates to a minibatch should
respect this. See the FixedVarDescr class for details.
Parameters
----------
model : Model
data : theano.gof.Variable or tuple
A valid member of the Space used to train `model` with this
cost.
Returns
-------
fixed_var_descr : FixedVarDescr
A description of how to hold the necessary variables constant
"""
self.get_data_specs(model)[0].validate(data)
fixed_var_descr = FixedVarDescr()
return fixed_var_descr
def get_data_specs(self, model):
"""
Returns a specification of the Space the data should lie in and
its source (what part of the dataset it should come from).
Parameters
----------
model : Model
The model to train with this cost
Returns
-------
data_specs : tuple
The tuple should be of length two.
The first element of the tuple should be a Space (possibly a
CompositeSpace) describing how to format the data.
The second element of the tuple describes the source of the
data. It probably should be a string or nested tuple of strings.
See Also
--------
For many common cases, rather than implementing this method
yourself, you probably just want to inherit from
`DefaultDataSpecsMixin` or `NullDataSpecsMixin`.
Notes
-----
.. todo::
figure out return format for sure. PL seems to have documented
this method incorrectly.
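Examples
--------
A typical unsupervised implementation returns the model's input
space and input source, mirroring what `DefaultDataSpecsMixin`
does below:
>>> def get_data_specs(self, model):
...     return (model.get_input_space(), model.get_input_source())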
"""
raise NotImplementedError(str(type(self)) + " does not implement " +
"get_data_specs.")
def is_stochastic(self):
"""
Returns True if the cost is stochastic.
Stochastic costs are incompatible with some optimization algorithms
that make multiple updates per minibatch, such as algorithms that
use line searches. These optimizations should raise a TypeError if
given a stochastic Cost, or issue a warning if given a Cost whose
`is_stochastic` method raises NotImplementedError.
Returns
-------
is_stochastic : bool
Whether the cost is stochastic. For example, dropout is
stochastic.
"""
raise NotImplementedError(str(type(self)) + " needs to implement "
"is_stochastic.")
class SumOfCosts(Cost):
"""
Combines multiple costs by summing them.
Parameters
----------
costs : list
List of Cost objects or (coeff, Cost) pairs
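Examples
--------
Bare costs receive a coefficient of 1. A sketch combining an
assumed main cost with a small L2 penalty (`main_cost` and `params`
are placeholders):
>>> total = SumOfCosts([main_cost, (.01, LpPenalty(params, 2))])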
"""
def __init__(self, costs):
"""
Initialize the SumOfCosts object and make sure that the list of costs
contains only Cost instances.
Parameters
----------
costs : list
List of Cost objects or (coeff, Cost) pairs
"""
assert isinstance(costs, list)
assert len(costs) > 0
self.costs = []
self.coeffs = []
for cost in costs:
if isinstance(cost, (list, tuple)):
coeff, cost = cost
else:
coeff = 1.
self.coeffs.append(coeff)
self.costs.append(cost)
if not isinstance(cost, Cost):
raise ValueError("one of the costs is not "
"Cost instance")
# TODO: remove this when it is no longer necessary
self.supervised = any([cost_.supervised for cost_ in self.costs])
def expr(self, model, data, ** kwargs):
"""
Returns the sum of the costs the SumOfCosts instance was given at
initialization.
Parameters
----------
model : pylearn2.models.model.Model
the model for which we want to calculate the sum of costs
data : flat tuple of tensor_like variables.
data has to follow the format defined by self.get_data_specs(),
but this format will always be a flat tuple.
"""
self.get_data_specs(model)[0].validate(data)
composite_specs, mapping = self.get_composite_specs_and_mapping(model)
nested_data = mapping.nest(data)
costs = []
for cost, cost_data in safe_zip(self.costs, nested_data):
costs.append(cost.expr(model, cost_data, **kwargs))
assert len(costs) > 0
if any([cost is None for cost in costs]):
sum_of_costs = None
else:
costs = [coeff * cost
for coeff, cost in safe_zip(self.coeffs, costs)]
assert len(costs) > 0
sum_of_costs = reduce(lambda x, y: x + y, costs)
return sum_of_costs
def get_composite_data_specs(self, model):
"""
Build and return a composite data_specs of all costs.
The returned space is a CompositeSpace, where the components are
the spaces of each of self.costs, in the same order. The returned
source is a tuple of the corresponding sources.
Parameters
----------
model : pylearn2.models.Model
"""
spaces = []
sources = []
for cost in self.costs:
space, source = cost.get_data_specs(model)
spaces.append(space)
sources.append(source)
# Build composite space representing all inputs
composite_space = CompositeSpace(spaces)
sources = tuple(sources)
return (composite_space, sources)
def get_composite_specs_and_mapping(self, model):
"""
Build the composite data_specs and a mapping to flatten it, and
return both.
Build the composite data_specs described in
`get_composite_data_specs`, and build a DataSpecsMapping that can
convert between it and a flat equivalent version. In particular, it
helps build a flat data_specs to request data, and nest this data
back into the composite data_specs, so it can be dispatched among
the different sub-costs.
Parameters
----------
model : pylearn2.models.Model
Notes
-----
This is a helper function used by `get_data_specs` and `get_gradients`,
and possibly other methods.
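Examples
--------
A sketch of the intended round trip (`cost`, `model`, and
`flat_batch` are assumed to exist, with `flat_batch` in the flat
format):
>>> specs, mapping = cost.get_composite_specs_and_mapping(model)
>>> nested = mapping.nest(flat_batch)  # one batch per sub-cost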
"""
composite_space, sources = self.get_composite_data_specs(model)
mapping = DataSpecsMapping((composite_space, sources))
return (composite_space, sources), mapping
def get_data_specs(self, model):
"""
Get a flat data_specs containing all information for all sub-costs.
Parameters
----------
model : pylearn2.models.Model
TODO WRITEME
Notes
-----
This data_specs should be non-redundant. It is built by flattening
the composite data_specs returned by `get_composite_data_specs`.
This is the format that SumOfCosts will request its data in. Then,
this flat data tuple will be nested into the composite data_specs,
in order to dispatch it among the different sub-costs.
"""
composite_specs, mapping = self.get_composite_specs_and_mapping(model)
composite_space, sources = composite_specs
flat_composite_space = mapping.flatten(composite_space)
flat_sources = mapping.flatten(sources)
data_specs = (flat_composite_space, flat_sources)
return data_specs
@functools.wraps(Cost.get_gradients)
def get_gradients(self, model, data, ** kwargs):
indiv_results = []
composite_specs, mapping = self.get_composite_specs_and_mapping(model)
nested_data = mapping.nest(data)
for cost, cost_data in safe_zip(self.costs, nested_data):
result = cost.get_gradients(model, cost_data, ** kwargs)
indiv_results.append(result)
grads = OrderedDict()
updates = OrderedDict()
params = model.get_params()
for coeff, packed in zip(self.coeffs, indiv_results):
g, u = packed
for param in g:
if param not in params:
raise ValueError("A shared variable (" +
str(param) +
") that is not a parameter appeared "
"a cost gradient dictionary.")
for param in g:
assert param.ndim == g[param].ndim
v = coeff * g[param]
if param not in grads:
grads[param] = v
else:
grads[param] = grads[param] + v
assert grads[param].ndim == param.ndim
assert not any([state in updates for state in u])
assert not any([state in params for state in u])
updates.update(u)
return grads, updates
@functools.wraps(Cost.get_monitoring_channels)
def get_monitoring_channels(self, model, data, ** kwargs):
self.get_data_specs(model)[0].validate(data)
rval = OrderedDict()
composite_specs, mapping = self.get_composite_specs_and_mapping(model)
nested_data = mapping.nest(data)
for i, cost in enumerate(self.costs):
cost_data = nested_data[i]
try:
channels = cost.get_monitoring_channels(model, cost_data,
**kwargs)
rval.update(channels)
except TypeError:
reraise_as(Exception('SumOfCosts.get_monitoring_channels '
'encountered TypeError while calling {0}'
'.get_monitoring_channels'.format(
type(cost))))
value = cost.expr(model, cost_data, ** kwargs)
if value is not None:
name = ''
if hasattr(value, 'name') and value.name is not None:
name = '_' + value.name
rval['term_' + str(i) + name] = value
return rval
def get_fixed_var_descr(self, model, data):
"""
.. todo::
WRITEME
Parameters
----------
model : Model
data : theano.gof.Variable or tuple
A valid member of the Space defined by
self.get_data_specs(model)[0]
"""
data_specs = self.get_data_specs(model)
data_specs[0].validate(data)
composite_specs, mapping = self.get_composite_specs_and_mapping(model)
nested_data = mapping.nest(data)
descrs = [cost.get_fixed_var_descr(model, cost_data)
for cost, cost_data in safe_zip(self.costs, nested_data)]
return reduce(merge, descrs)
class NullDataSpecsMixin(object):
"""
Use multiple inheritance with both this object and Cost in order to
obtain a data specification corresponding to not using data at all.
Due to method resolution order, you want Cost to appear after
NullDataSpecsMixin in the superclass list.
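Examples
--------
A sketch of a data-free cost (the class name is illustrative):
>>> class ParamPenalty(NullDataSpecsMixin, Cost):  # mixin comes first
...     def expr(self, model, data, **kwargs):
...         self.get_data_specs(model)[0].validate(data)
...         return sum(abs(p).sum() for p in model.get_params())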
"""
def get_data_specs(self, model):
"""
Provides an implementation of `Cost.get_data_specs`.
Returns data specifications corresponding to not using any
data at all.
Parameters
----------
model : pylearn2.models.Model
"""
return (NullSpace(), '')
class DefaultDataSpecsMixin(object):
"""
Use multiple inheritance with both this object and Cost in order to
obtain the default data specification.
Due to method resolution order, you want Cost to appear after
DefaultDataSpecsMixin in the superclass list.
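Examples
--------
A sketch of the intended use (the class name is illustrative):
>>> class MySupervisedCost(DefaultDataSpecsMixin, Cost):  # mixin first
...     supervised = True  # requests (features, targets) pairs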
"""
def get_data_specs(self, model):
"""
Provides a default data specification.
The cost requests input features from the model's input space and
input source. `self` must contain a bool field called `supervised`.
If this field is True, the cost requests targets as well.
Parameters
----------
model : pylearn2.models.Model
TODO WRITEME
"""
if self.supervised:
space = CompositeSpace([model.get_input_space(),
model.get_target_space()])
sources = (model.get_input_source(), model.get_target_source())
return (space, sources)
else:
return (model.get_input_space(), model.get_input_source())
class LpPenalty(NullDataSpecsMixin, Cost):
"""
L-p penalty of the tensor variables provided.
Parameters
----------
variables : list
list of tensor variables to be regularized
p : int
p in "L-p penalty"
"""
def __init__(self, variables, p):
"""
Parameters
----------
variables : list
list of tensor variables to be regularized
p : int
p in "L-p penalty"
"""
self.variables = variables
self.p = p
def expr(self, model, data, **kwargs):
"""
Return the L-p penalty term. The optional parameters are never used;
they're only there to provide an interface that's consistent with
the Cost superclass.
Parameters
----------
model : a pylearn2 Model instance
data : a batch in cost.get_data_specs() form
kwargs : dict
Optional extra arguments. Not used by the base class.
"""
# This Cost does not depend on any data, and get_data_specs does not
# ask for any data, so we should not be provided with some.
self.get_data_specs(model)[0].validate(data)
penalty = 0
for var in self.variables:
# Absolute value handles odd-valued p cases
penalty = penalty + abs(var ** self.p).sum()
return penalty
class MethodCost(Cost):
"""
A cost specified via the string name of a method of the model.
Parameters
----------
method : a string specifying the name of the method of the model
that should be called to generate the objective function.
data_specs : a string specifying the name of a method/property of
the model that describes the data specs required by
`method`
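Examples
--------
Delegating the objective to a model method by name, e.g. the
`cost_from_X` method that pylearn2 MLPs define:
>>> cost = MethodCost('cost_from_X')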
"""
def __init__(self, method, data_specs=None):
"""
.. todo::
WRITEME
Parameters
----------
method : a string specifying the name of the method of the model
that should be called to generate the objective function.
data_specs : a string specifying the name of a method/property of
the model that describes the data specs required by
`method`
"""
self.method = method
self.data_specs = data_specs
def expr(self, model, data, *args, **kwargs):
"""
Passes the call through to a user-specified method of the model.
Parameters
----------
model : pylearn2.models.model.Model
the model whose method is called to compute the cost
data : flat tuple of tensor_like variables.
data has to follow the format defined by self.get_data_specs(),
but this format will always be a flat tuple.
"""
self.get_data_specs(model)[0].validate(data)
fn = getattr(model, self.method)
return fn(data, *args, **kwargs)
@functools.wraps(Cost.get_data_specs)
def get_data_specs(self, model):
if self.data_specs is not None:
fn = getattr(model, self.data_specs)
else:
# To be compatible with earlier scripts,
# try (self.method)_data_specs
fn = getattr(model, '%s_data_specs' % self.method)
if callable(fn):
return fn()
else:
return fn
def _no_op(data):
"""
An on_load_batch callback that does nothing.
"""
class FixedVarDescrDataSpecsError(TypeError):
"""
An error raised when code attempts to use the unused
data_specs field of FixedVarDescr
"""
class FixedVarDescr(object):
"""
An object used to describe variables that influence the cost but that
should be held fixed for each minibatch, even if the learning algorithm
makes multiple changes to the parameters on this minibatch, e.g.,
during a line search.
Attributes
----------
fixed_vars : dict
Maps string names to shared variables or some sort of data
structure surrounding shared variables.
Any learning algorithm that does multiple updates on the same
minibatch should pass fixed_vars to the cost's expr and
get_gradients methods as keyword arguments.
on_load_batch : list
A list of callable objects that the learning algorithm should
call with input data.
All of these callables must take an argument with the same
(space, source) format as the cost used for training.
TODO: it can be hard for a human user to know the right format
ahead of time when using SumOfCosts; provide a better way of
handling this.
PL had added a data_specs field to this class which
was meant to define the (space, source) format for each of
the members of on_load_batch, but the doc was internally
inconsistent, none of the TrainingAlgorithms obeyed it,
and the Cost's handling of it was buggy. IG removed this
broken functionality so that at least singleton costs can
use FixedVarDescr, but it would be good to restore this functionality
to composite costs.
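Examples
--------
A sketch of filling in a FixedVarDescr (`scale` and `callback` are
illustrative names):
>>> descr = FixedVarDescr()
>>> descr.fixed_vars['scale'] = scale  # an assumed shared variable
>>> descr.on_load_batch.append(callback)  # called with each data batch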
"""
def __init__(self):
self.fixed_vars = {}
self.on_load_batch = []
def _data_specs_err(self, x=None):
raise FixedVarDescrDataSpecsError("The data_specs field of "
"FixedVarDescr has been removed. "
"While this field existed and was "
"documented at one time, no "
"TrainingAlgorithm respected it. "
"The data_specs of all members of "
"on_load_batch must match those of "
"the cost.")
data_specs = property(_data_specs_err, _data_specs_err)
def merge(left, right):
"""
Combine two FixedVarDescrs
Parameters
----------
left : FixedVarDescr
right : FixedVarDescr
Returns
-------
merged : FixedVarDescr
a new FixedVarDescr describing all variables and operations
described by `left` and `right`
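Examples
--------
Assuming `descr_a` and `descr_b` are distinct FixedVarDescr
instances with disjoint `fixed_vars`:
>>> combined = merge(descr_a, descr_b)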
"""
# We assume aliasing is a bug
assert left is not right
assert left.fixed_vars is not right.fixed_vars
assert left.on_load_batch is not right.on_load_batch
merged = FixedVarDescr()
for key in left.fixed_vars:
if key in right.fixed_vars:
raise ValueError("Can't merge these FixedVarDescrs, "
"both contain " + key)
assert not any([key in left.fixed_vars for key in right.fixed_vars])
merged.fixed_vars.update(left.fixed_vars)
merged.fixed_vars.update(right.fixed_vars)
merged.on_load_batch = safe_union(left.on_load_batch,
right.on_load_batch)
return merged