// svrg.cc
#include <assert.h>
#include <iostream>

#include "gd.h"
#include "vw.h"

using namespace std;
using namespace LEARNER;

namespace SVRG {

#define W_INNER 0       // working "inner-loop" weights, updated per example
#define W_STABLE 1      // stable weights, updated per stage
#define W_STABLEGRAD 2  // gradient corresponding to stable weights
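// Weights are stored strided: svrg_setup (below) requests
// all.reg.stride_shift = 2, so each feature owns 1 << 2 = 4 consecutive
// floats. Slots 0-2 are used as defined above; slot 3 is unused.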
struct svrg
{
  int stage_size;         // Number of data passes per stage.
  int prev_pass;          // To detect that we're in a new pass.
  int stable_grad_count;  // Number of data points that contributed to the
                          // stable-gradient calculation.

  vw* all;                // The VW process' global state.
};
// Mimic GD::inline_predict, but templated on an offset so a prediction can
// be made with either the stable or the inner weights.
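// GD::foreach_feature hands vec_add a reference to the first float of each
// feature's stride block, so ws[offset] selects the requested weight copy.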
template<int offset>
inline void vec_add(float& p, const float x, float& w)
{
  float* ws = &w;
  p += x * ws[offset];
}

template<int offset>
inline float inline_predict(vw& all, example& ec)
{
  float acc = ec.l.simple.initial;
  GD::foreach_feature<float, vec_add<offset> >(all, ec, acc);
  return acc;
}
// -- Prediction, using inner vs. stable weights --

float predict_stable(const svrg& s, example& ec)
{
  return GD::finalize_prediction(s.all->sd, inline_predict<W_STABLE>(*s.all, ec));
}

void predict(svrg& s, base_learner&, example& ec)
{
  ec.partial_prediction = inline_predict<W_INNER>(*s.all, ec);
  ec.pred.scalar = GD::finalize_prediction(s.all->sd, ec.partial_prediction);
}
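// Scalar part of the loss gradient at prediction `pred`: the loss's first
// derivative, scaled by the example's importance weight.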
float gradient_scalar(const svrg& s, const example& ec, float pred)
{
  return s.all->loss->first_derivative(s.all->sd, pred, ec.l.simple.label)
      * ec.l.simple.weight;
}
// -- Updates, taking inner steps vs. accumulating a full gradient --

struct update
{
  float g_scalar_stable;
  float g_scalar_inner;
  float eta;
  float norm;
};
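// Per-feature SVRG step: w <- w - eta * ((g_inner - g_stable) * x + mu),
// where mu = ws[W_STABLEGRAD] / norm is this coordinate of the full
// gradient averaged over the examples seen during the stable pass.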
inline void update_inner_feature(update& u, float x, float& w)
{
  float* ws = &w;
  w -= u.eta * ((u.g_scalar_inner - u.g_scalar_stable) * x + ws[W_STABLEGRAD] / u.norm);
}

inline void update_stable_feature(float& g_scalar, float x, float& w)
{
  float* ws = &w;
  ws[W_STABLEGRAD] += g_scalar * x;
}
void update_inner(const svrg& s, example& ec)
{
  update u;
  // |ec| already holds the prediction made with the inner weights.
  u.g_scalar_inner = gradient_scalar(s, ec, ec.pred.scalar);
  u.g_scalar_stable = gradient_scalar(s, ec, predict_stable(s, ec));
  u.eta = s.all->eta;
  u.norm = (float)s.stable_grad_count;
  GD::foreach_feature<update, update_inner_feature>(*s.all, ec, u);
}

void update_stable(const svrg& s, example& ec)
{
  float g = gradient_scalar(s, ec, predict_stable(s, ec));
  GD::foreach_feature<float, update_stable_feature>(*s.all, ec, g);
}
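// One stage spans stage_size + 1 passes: whenever pass % (stage_size + 1)
// == 0, the inner weights are committed as the new stable point and a full
// pass accumulates the exact gradient there; the remaining stage_size
// passes take variance-reduced inner steps against that gradient.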
void learn(svrg& s, base_learner& base, example& ec)
{
  assert(ec.in_use);
  predict(s, base, ec);

  const int pass = (int)s.all->passes_complete;

  if (pass % (s.stage_size + 1) == 0) { // Compute exact gradient
    if (s.prev_pass != pass) {
      // Commit the stable point regardless of quiet mode; only the
      // progress messages are suppressed.
      if (!s.all->quiet) {
        cout << "svrg pass " << pass << ": committing stable point" << endl;
      }
      for (uint32_t j = 0; j < VW::num_weights(*s.all); j++) {
        float w = VW::get_weight(*s.all, j, W_INNER);
        VW::set_weight(*s.all, j, W_STABLE, w);
        VW::set_weight(*s.all, j, W_STABLEGRAD, 0.f);
      }
      s.stable_grad_count = 0;
      if (!s.all->quiet) {
        cout << "svrg pass " << pass << ": computing exact gradient" << endl;
      }
    }
    update_stable(s, ec);
    s.stable_grad_count++;
  } else { // Perform updates
    if (s.prev_pass != pass && !s.all->quiet) {
      cout << "svrg pass " << pass << ": taking steps" << endl;
    }
    update_inner(s, ec);
  }
  s.prev_pass = pass;
}
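// Persist or restore the model. With save_resume set, the full GD online
// state is written so training can continue across invocations; otherwise
// only the regressor weights are stored.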
void save_load(svrg& s, io_buf& model_file, bool read, bool text)
{
  if (read) {
    initialize_regressor(*s.all);
  }
  if (model_file.files.size() > 0) {
    bool resume = s.all->save_resume;
    char buff[512];
    uint32_t text_len = (uint32_t)sprintf(buff, ":%d\n", resume);
    bin_text_read_write_fixed(model_file, (char*)&resume, sizeof(resume), "",
                              read, buff, text_len, text);
    if (resume) {
      GD::save_load_online_state(*s.all, model_file, read, text);
    } else {
      GD::save_load_regressor(*s.all, model_file, read, text);
    }
  }
}
}  // namespace SVRG

using namespace SVRG;

base_learner* svrg_setup(vw& all)
{
  if (missing_option(all, false, "svrg", "Streaming Stochastic Variance Reduced Gradient")) {
    return NULL;
  }
  new_options(all, "SVRG options")
      ("stage_size", po::value<int>()->default_value(1), "Number of passes per SVRG stage");
  add_options(all);

  svrg& s = calloc_or_die<svrg>();
  s.all = &all;
  s.stage_size = all.vm["stage_size"].as<int>();
  s.prev_pass = -1;
  s.stable_grad_count = 0;

  // Request more parameter storage (4 floats per feature).
  all.reg.stride_shift = 2;

  learner<svrg>& l = init_learner(&s, learn, 1 << all.reg.stride_shift);
  l.set_predict(predict);
  l.set_save_load(save_load);
  return make_base(l);
}
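// Example invocation (a sketch: file names are placeholders, and VW needs
// a cache file whenever --passes > 1):
//
//   vw --svrg --stage_size 2 --passes 9 --cache_file train.cache train.vw
//
// With --stage_size 2, passes 0, 3, and 6 recompute the exact gradient at
// the freshly committed stable point, and the passes in between take
// variance-reduced inner steps.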