#ifndef CAFFE_MULTI_SIGMOID_CROSS_ENTROPY_LOSS_LAYER_HPP_
#define CAFFE_MULTI_SIGMOID_CROSS_ENTROPY_LOSS_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/loss_layer.hpp"
#include "caffe/layers/sigmoid_layer.hpp"

namespace caffe {

/**
* @brief Computes the cross-entropy (logistic) loss @f$
* E = \frac{-1}{N} \sum\limits_{n=1}^N \left[
* p_n \log \hat{p}_n +
* (1 - p_n) \log(1 - \hat{p}_n)
* \right]
* @f$, often used for predicting targets interpreted as probabilities.
*
* This layer is implemented rather than separate
* SigmoidLayer + CrossEntropyLayer
* as its gradient computation is more numerically stable.
* At test time, this layer can be replaced simply by a SigmoidLayer.
*
* @param bottom input Blob vector (length 2)
* -# @f$ (N \times C \times H \times W) @f$
* the scores @f$ x \in [-\infty, +\infty]@f$,
* which this layer maps to probability predictions
* @f$ \hat{p}_n = \sigma(x_n) \in [0, 1] @f$
* using the sigmoid function @f$ \sigma(.) @f$ (see SigmoidLayer).
* -# @f$ (N \times C \times H \times W) @f$
* the targets @f$ y \in [0, 1] @f$
* @param top output Blob vector (length 1)
* -# @f$ (1 \times 1 \times 1 \times 1) @f$
* the computed cross-entropy loss: @f$
* E = \frac{-1}{N} \sum\limits_{n=1}^N \left[
* p_n \log \hat{p}_n + (1 - p_n) \log(1 - \hat{p}_n)
* \right]
* @f$
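*
* As a sketch of why the fused computation is stable (this is the standard
* log-sum-exp rewrite, not necessarily this fork's literal code), the
* per-element loss can be evaluated without overflow for large @f$ |x| @f$:
* @code
* // x: raw score (logit), p: target probability in [0, 1].
* // Equivalent to -[p * log(sigmoid(x)) + (1 - p) * log(1 - sigmoid(x))],
* // but exp() is only ever applied to a non-positive argument.
* Dtype loss = std::max(x, Dtype(0)) - x * p +
*              std::log(Dtype(1) + std::exp(-std::abs(x)));
* @endcode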
*/
template <typename Dtype>
class MultiSigmoidCrossEntropyLossLayer : public LossLayer<Dtype> {
public:
explicit MultiSigmoidCrossEntropyLossLayer(const LayerParameter& param)
: LossLayer<Dtype>(param),
sigmoid_layer_(new SigmoidLayer<Dtype>(param)),
sigmoid_output_(new Blob<Dtype>()) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual inline const char* type() const {
return "MultiSigmoidCrossEntropyLoss";
}
protected:
/// @copydoc MultiSigmoidCrossEntropyLossLayer
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
/**
* @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the
* predictions.
*
* Gradients cannot be computed with respect to the target inputs (bottom[1]),
* so this method ignores bottom[1] and requires !propagate_down[1], crashing
* if propagate_down[1] is set.
*
* @param top output Blob vector (length 1), providing the error gradient with
* respect to the outputs
* -# @f$ (1 \times 1 \times 1 \times 1) @f$
* This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
* as @f$ \lambda @f$ is the coefficient of this layer's output
* @f$\ell_i@f$ in the overall Net loss
* @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
* @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
* (*Assuming that this top Blob is not used as a bottom (input) by any
* other layer of the Net.)
* @param propagate_down see Layer::Backward.
* propagate_down[1] must be false as gradient computation with respect
* to the targets is not implemented.
* @param bottom input Blob vector (length 2)
* -# @f$ (N \times C \times H \times W) @f$
* the predictions @f$ x @f$; Backward computes diff
* @f$ \frac{\partial E}{\partial x_n} =
* \frac{1}{N} (\hat{p}_n - p_n)
* @f$
* -# @f$ (N \times C \times H \times W) @f$
* the targets -- ignored, as we can't compute their error gradients
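*
* A minimal sketch of the resulting per-element update (assumed from the
* stock Caffe pattern; `scale` folds in loss_weight and the normalizer):
* @code
* // sigmoid_output[i] = sigmoid(x_i), target[i] = p_i.
* bottom_diff[i] = scale * (sigmoid_output[i] - target[i]);
* @endcode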
*/
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
/// Read the normalization mode parameter and compute the normalizer based
/// on the blob size. If normalization_mode is VALID, the count of valid
/// outputs will be read from valid_count, unless it is -1 in which case
/// all outputs are assumed to be valid.
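/// A sketch of the usual mode-to-normalizer mapping (stock Caffe
/// behavior; this fork may differ):
/// @code
/// // FULL:       outer_num_ * inner_num_
/// // VALID:      valid_count (falls back to FULL when valid_count == -1)
/// // BATCH_SIZE: outer_num_
/// // NONE:       1
/// // The result is clamped to at least 1 to avoid dividing by zero.
/// @endcode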
virtual Dtype get_normalizer(
LossParameter_NormalizationMode normalization_mode, int valid_count);
/// The internal SigmoidLayer used to map predictions to probabilities.
shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;
/// sigmoid_output stores the output of the SigmoidLayer.
shared_ptr<Blob<Dtype> > sigmoid_output_;
/// bottom vector holder to call the underlying SigmoidLayer::Forward
vector<Blob<Dtype>*> sigmoid_bottom_vec_;
/// top vector holder to call the underlying SigmoidLayer::Forward
vector<Blob<Dtype>*> sigmoid_top_vec_;
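// How the two holders above are typically wired in LayerSetUp (a sketch
// of the stock Caffe pattern, not necessarily this fork's exact code):
//   sigmoid_bottom_vec_.clear();
//   sigmoid_bottom_vec_.push_back(bottom[0]);
//   sigmoid_top_vec_.clear();
//   sigmoid_top_vec_.push_back(sigmoid_output_.get());
//   sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_);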
/// Whether to ignore instances with a certain label.
bool has_ignore_label_;
/// The label indicating that an instance should be ignored.
int ignore_label_;
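// Sketch of how the ignore label is typically honored per element in
// Forward/Backward (stock Caffe pattern; variable names are illustrative):
//   const int label = static_cast<int>(target[i]);
//   if (has_ignore_label_ && label == ignore_label_) { continue; }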
/// How to normalize the loss.
LossParameter_NormalizationMode normalization_;
/// The normalizer used to scale the loss (see get_normalizer).
Dtype normalizer_;
/// Leading (e.g. batch) and trailing (e.g. spatial) dimension counts.
int outer_num_, inner_num_;
};
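// Example prototxt usage (a sketch; the type string must match whatever
// name the accompanying .cpp registers for this layer):
//
//   layer {
//     name: "loss"
//     type: "MultiSigmoidCrossEntropyLoss"
//     bottom: "scores"
//     bottom: "targets"
//     top: "loss"
//     loss_param { normalization: VALID }
//   }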
} // namespace caffe
#endif // CAFFE_MULTI_SIGMOID_CROSS_ENTROPY_LOSS_LAYER_HPP_