-
Notifications
You must be signed in to change notification settings - Fork 0
/
structuredNMF_VB.m
233 lines (218 loc) · 10.3 KB
/
structuredNMF_VB.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
function [E_t E_v a_tm b_tm a_ve b_ve M] = structuredNMF_VB(x, a_tm, b_tm, a_ve, b_ve, a_utility, u_dirichlet, adjacency, ...
EPOCH, start_optimizing_hyperparameters_after, tie_v, tie_a_tm, tie_b_tm, print_period)
% STRUCTUREDNMF_VB Variational-Bayes inference for a structured NMF model.
%
% Factorizes the observed entries of x as T*V with gamma-distributed
% factors, where the columns/rows of V are additionally tied together by
% gamma Markov chains / DAGs (structure given via the adjacency input).
% Inference alternates closed-form updates of the gamma sufficient
% statistics of T, the auxiliary "utility" nodes, V, and the discrete
% mixture-selection variables (with Dirichlet priors) for nodes that have
% multiple parents in the DAG.
%
% Inputs:
%   x            - observed data matrix; only observed entries are used
%                  (the sparse observation mask M is built by the
%                  generate_initial_data script)
%   a_tm, b_tm   - shape and MEAN parameters of the gamma prior on T
%                  (note: b_tm and b_ve are not scales - those are means)
%   a_ve, b_ve   - shape and MEAN parameters of the gamma priors on V
%   a_utility    - shape parameter(s) of the auxiliary inverse-gamma
%                  "utility" nodes inserted to keep the gamma chains
%                  conjugate
%   u_dirichlet  - cell array of Dirichlet prior parameters for the
%                  mixture-selection variables
%   adjacency    - adjacency matrix describing the DAG over V's nodes
%   EPOCH        - number of VB sweeps to run
%   start_optimizing_hyperparameters_after, tie_v, tie_a_tm, tie_b_tm
%                - hyperparameter-optimization controls; NOTE(review): the
%                  code consuming these is currently commented out below,
%                  so they have no effect in this version
%   print_period - the variational bound is computed and printed every
%                  print_period epochs (and at the last epoch)
%
% Outputs:
%   E_t, E_v     - posterior expectations of the factors T and V
%   a_tm, b_tm, a_ve, b_ve - hyperparameters (unchanged here, since the
%                  optimization block is commented out)
%   M            - sparse observation mask (created by generate_initial_data)
%
% NOTE(review): this function relies on SCRIPTS (generate_initial_data,
% get_messages_utility, get_messages_V, get_messages_discrete,
% bound_procedure) that read from and write to this function's workspace.
% Many variables used below (x_raw, M, L_t, L_v, rows_mask, cols_mask,
% linear_mask, W, K, first_s, network_size, last_v, expectations_markov,
% temp_log_markov, expectations_discrete, gamma_chain_adjacency, EtEv_raw,
% sse, bound, relevant_indeces, ...) are created by those scripts, so
% renaming any local variable here is unsafe.
% note that b_tm and b_ve are not scales - those are means...
generate_initial_data
save('graphlab_test_matlab.mat'); % save input data, to be passed to prepare_for_graphlab
relevant_indeces = uint32(relevant_indeces);
LtLv_raw = zeros(size(x_raw));
calculate_bound = true;
boundOld = -Inf;
% per-epoch wall-clock times, filled by tic/toc inside the main loop
tictocs = zeros(1,EPOCH);
% mean of the observed training entries; used as the baseline predictor
% in the relative error (rrse_train) printed alongside the bound
mean_train = sum(sum(x_raw))./full(sum(sum(M)));
% gammaln(x+1) terms of the Poisson likelihood, precomputed once
gammalnX_raw = gammaln(double(x_raw) + 1);
start_from = 1;
bounds = [];
for e=start_from:EPOCH,
tic
% only pay for the bound every print_period epochs (and at the end)
calculate_bound = rem(e, print_period)==0 | e==EPOCH;
%LtLv = L_t*L_v; % this can potentially be a large matrix - but we only
%need entries on same locations where x is observed
LtLv_raw = sum(L_t(rows_mask,:).*L_v(:,cols_mask)',2);
% scatter the ratio x/(Lt*Lv) back into a sparse W-by-K matrix so the
% sufficient statistics below can be formed with sparse products
tmp = sparse(W,K);
tmp(linear_mask) = x_raw./LtLv_raw;
Sig_t = L_t.*(tmp*L_v');
Sig_v = L_v.*(L_t'*tmp);
% Updating the expectations of T
alpha_tm = a_tm + Sig_t;
beta_tm = 1./( a_tm./b_tm + M*E_v' );
E_t = alpha_tm.*beta_tm;
masked_sum_over_E_t = E_t'*M;
entropies = 0;
% Updating the utility, inverse-gamma nodes, which have been added to
% make the gamma chains conjugate.
for(current = first_s:network_size)
get_messages_utility;
natural_parameters = sum(messages);
% convert natural parameters back to (shape a, scale b)
a = natural_parameters(:,:,2)+1;
b = -1./natural_parameters(:,:,1);
% having fully obtained the messages, the updated natural parameter
% vector is now used to reestimate sufficient statistics
expectations_markov(current,:,1) = a.*b;
expectations_markov(current,:,2) = psi(a)+log(b);
%
if(calculate_bound)
% store entropies of utility gammas
psi_a_ = expectations_markov(current,:,2)-log(b);
entropies = entropies + sum(-(a-1).*psi_a_+log(b)+ a + gammaln(a));
end
end
% Updating the expectations of V
for(current = 1:last_v)
get_messages_V; % "messages" are info coming from the Markov blanket of the currently processed node in the graphical model. The so called "message passing" framework
natural_parameters = sum(messages);
a = natural_parameters(:,:,2)+1;
b = -1./natural_parameters(:,:,1);
% having fully obtained the messages, updated natural parameter
% vector is used to reestimate sufficient statistics
expectations_markov(current,:,1) = a.*b;
% psi(a)+log(b) is staged in temp_log_markov and only written into
% expectations_markov AFTER the bound is computed (see below)
temp_log_markov(current,:) = psi(a)+log(b);
% expectations_markov(current,:,2) = psi(a)+log(b);
% psi_a_ = expectations_markov(current,:,2)-log(b);
% entropies = part2 + sum(-(a-1).*psi_a_+log(b)+ a + gammaln(a));
end
E_v = expectations_markov(1:last_v,:,1)';
% Now it's a good moment to efficiently calculate bound. Note that Lt
% and Lv and expectations_markov have not yet been refreshed
if rem(e, 2)==0, fprintf(1, '*'); end;
if calculate_bound,
bound_procedure;
diff = bound-boundOld;
boundOld = bound;
fprintf(1, '\nBound = %f\tdiff = %f\tlast_elapsed = %f\n', boundOld, diff, tictocs(e));
% relative squared error vs. predicting the training mean everywhere
rrse_train = sse(x_raw - EtEv_raw)./sse(x_raw - ones(size(x_raw,1),1)*mean_train );
fprintf(1, '\trrse_train = %f\n', rrse_train);
bounds = [bounds, bound];
if( diff<0 )
% NOTE(review): a decreasing VB bound indicates a bug, but the
% check is deliberately disabled here
% error('Convergence violated')
end
if e>1
figure(1); plot(bounds(2:end))
end
end
expectations_markov(1:last_v,:,2) = temp_log_markov; % delaying the assignment due to bound calculation
% refresh the geometric-mean factors exp(E[log T]) and exp(E[log V])
L_t = exp(psi(alpha_tm)).*beta_tm;
L_v = exp(expectations_markov(1:last_v,:,2)');
% Updating the mixture selection models, alongside with their Dirichlet
% priors. Mixture models is added to gamma nodes which have multiple
% parents, according to the specified adjacency matrix.
numparents = sum(gamma_chain_adjacency);
relevant_indices = find(numparents > 1);
for(current = relevant_indices)
% A safe way to get the current's parents:
parents = find( gamma_chain_adjacency(:, current) );
nrparents = size(parents,1);
% Dirichlet distributions
%
% we get a Dirichlet distribution with following parameters u:
u_parameters = expectations_discrete{current}( : ,:) + ...
u_dirichlet{current};
% ...and the expectation of such a Dirichlet is
expectations_dirichlet{current} = psi(u_parameters) - ...
ones(size(u_parameters))*diag( psi(sum(u_parameters,1)) );
%
% Discrete distributions:
get_messages_discrete;
natural_parameters = sum(messages,3);
%
% Updating expectations:
% They are different from other exponential family distributions in
% the way log-partition function gets determined; here we require
% natural parameters to form discrete likelihoods
% NOTE(review): Z is superseded by the log-sum-exp computation below
% and appears unused (only the commented-out line references it)
Z = sum(exp(natural_parameters));
% log-sum-exp trick
temp = max(natural_parameters);
logZ = temp+log(sum(exp(natural_parameters-repmat( temp, [size(natural_parameters,1), 1] ) )));
expectations_discrete{current} = exp(natural_parameters - repmat(logZ,size(natural_parameters,1),1));
% log-sum-exp trick done
% this is how the term would look like without the log-sum-exp trick:
% expectations_discrete{current} = exp(natural_parameters)./repmat(Z,size(natural_parameters,1),1);
end
% if(e>start_optimizing_hyperparameters_after)
% %% Hyperparameter optimization
% %.
% switch tie_b_tm,
% case 'free',
% b_tm = E_t;
% case 'rows',
% b_tm = repmat(sum(a_tm.*E_t, 1)./sum(a_tm,1), [W 1]);
% case 'cols',
% b_tm = repmat(sum(a_tm.*E_t, 2)./sum(a_tm,2), [1 I]);
% case 'tie_all',
% b_tm = sum(sum(a_tm.*E_t))./sum(a_tm(:)).*ones(W, I);
% % case 'clamp', do nothing
% end;
%
% % Newton-Raphson numerical optimization for shape parameters
% if ~strcmp( tie_a_tm, 'clamp'),
% Z = E_t./b_tm - (log(L_t) - log(b_tm));
% switch tie_a_tm,
% case 'free',
% a_tm = gnmf_solvebynewton(Z, a_tm);
% case 'rows',
% a_tm = gnmf_solvebynewton(sum(Z,1)/W, a_tm);
% case 'cols',
% a_tm = gnmf_solvebynewton(sum(Z,2)/I, a_tm);
% case 'tie_all',
% a_tm = gnmf_solvebynewton(sum(Z(:))/(W*I), a_tm);
% % case 'clamp', do nothing
% end;
% end;
%
% % Updating the shapes and scales related to factor V
% % For a_ve, b_ve optimization has to be done by traversing the DAGs.
% % This code makes 2 options available:
% % 1. clamped a_ve, clamped b_ve -> 'clamp'
% % 2. rowwise tied a_ve (as dictated by parameter_mappings_V), free b_ve -> 'free'
% % 3. clamped a_ve, free b_ve -> 'free'
% switch(tie_v)
% case 'free',
% % TODO: more extensive testing - are 10 iterations of
% % Newton-Raphson enough?
% nonzero_bs = (b_ve(:,1) ~= 0)'; % supposing graphical models have the same structure for each row of V
% for( iter = 1:size(parameter_mappings_V,1) )
% currentDAG = parameter_mappings_V(iter,:);
% mask = currentDAG & ~nonzero_bs;
%
% indeces = find(mask);
% Z=0;
% for(i = indeces)
% parent_indices = find( gamma_chain_adjacency(:,i) );
% if( isempty(expectations_discrete{i}) )
% Z = Z + ...
% expectations_markov(i,:,1).*sum(expectations_markov(parent_indices,:,1) ,1) ...
% -expectations_markov(i,:,2) ...
% -sum(expectations_markov(parent_indices,:,2) ,1);
% else
% Z = Z + ...
% expectations_markov(i,:,1).*sum(expectations_markov(parent_indices,:,1).*expectations_discrete{i}(:,:) ,1) ...
% -expectations_markov(i,:,2) ...
% -sum(expectations_markov(parent_indices,:,2).*expectations_discrete{i}(:,:) ,1);
% end
% end
% % accumulating from non-topmost nodes done
% mask = currentDAG & nonzero_bs;
% indeces = find(mask);
% % b_ve (topmost only)
% b_ve(indeces,:) = expectations_markov(indeces,:,1);
% % topmost done.
% Z = Z + sum( expectations_markov(indeces,:,1)./b_ve(indeces,:) ...
% -expectations_markov(indeces,:,2) + ...
% log(b_ve(indeces,:)), 1);
%
% Z = Z/sum(parameter_mappings_V(iter,:),2); % to get the form prepared for solving
% indeces = find(currentDAG);
% a_ve(indeces,:) = gnmf_solvebynewton(Z, a_ve(indeces,:));
% end
% case 'semifree',
% nonzero_bs = (b_ve(:,1) ~= 0)'; % supposing graphical models have the same structure for each row of V
% for( iter = 1:size(parameter_mappings_V,1) )
% currentDAG = parameter_mappings_V(iter,:);
%
% mask = currentDAG & nonzero_bs; % TODO: don't calculate this at each iteration - store it
% indeces = find(mask);
% % b_ve (topmost only)
% b_ve(indeces,:) = expectations_markov(indeces,:,1);
%
% end
% % free parameters end
% case 'clamp'
% % do nothing
% end
%
% end
%
tictocs(1,e) = toc;
end
% NOTE(review): the three unterminated expressions below echo Sig_t,
% Sig_v and full(tmp) to the console on every call - presumably debug
% leftovers; confirm with the author before removing.
Sig_t
Sig_v
full(tmp)