Permalink
Browse files

added source

  • Loading branch information...
jacobeisenstein committed Jul 10, 2011
1 parent d71cb49 commit 7b146cd1d21bce18763f0afdb34ff8ad1f8db96b
View
@@ -0,0 +1,12 @@
function [bound word_score lv_score] = scoreDoc(counts,beta,phi,x,sigma,alpha,e_log_theta)
%function [bound word_score lv_score] = scoreDoc(counts,beta,phi,x,sigma,alpha,e_log_theta)
%
% Variational lower bound for a single document.
%
% counts      : word counts for this document
% beta        : (expected log) topic-word parameters, passed to scoreWords
% phi         : variational topic assignment distribution q(z)
% x           : word observations used to weight the topic terms
% sigma       : variational Dirichlet parameters of q(theta)
% alpha       : Dirichlet prior on theta; scalar means symmetric prior
% e_log_theta : optional precomputed E[log theta] under q(theta)
%
% bound      = word_score + lv_score (the full per-document ELBO)
% word_score = E[log p(w | z, beta)] (from scoreWords)
% lv_score   = E[log p(z|theta)] - E[log q(z)] + E[log p(theta|alpha)] - E[log q(theta)]
word_score = scoreWords(counts,beta);
% Derive E[log theta] from sigma when the caller did not supply it.
if nargin < 7,
 e_log_theta = digamma(sigma) - digamma(sum(sigma));
end
% Expand a scalar (symmetric) alpha into a vector matching sigma.
if numel(alpha) == 1 && numel(sigma) > 1, alpha = repmat(alpha,1,numel(sigma)); end
lv_score = e_log_theta * phi' * x';% ... %E[log p(z | theta)]
lv_score = lv_score - x * sum(log(phi).*phi,2); %E[log q(phi)] (negative entropy of q(z))
lv_score = lv_score + gammaln(sum(alpha)) - sum(gammaln(alpha)) + e_log_theta * (alpha' - 1); % ... %E[log p(theta | alpha)]
lv_score = lv_score - gammaln(sum(sigma)) + sum(gammaln(sigma)) - e_log_theta * (sigma' - 1); %E[log q(theta)]
bound = word_score + lv_score;
end
View
@@ -0,0 +1,4 @@
function score = scoreWords(counts,beta)
%function score = scoreWords(counts,beta)
% Expected word score: the elementwise inner product of the count matrix
% with the (expected log) topic-word parameters, computed via linear
% indexing rather than a double sum.
score = counts(:)' * beta(:);% - logsumexp(beta') * sum(counts,2);
end
View
@@ -0,0 +1,16 @@
function [perplex ll ll_per_word] = computePerplexity(docs,topics,alpha,varargin)
%function [perplex ll ll_per_word] = computePerplexity(docs,topics,alpha,varargin)
%calls out to slow 3rd-party code (Wallach, Murray, Salakhutdinov, Mimno)
%for computing perplexity, the right way
%
% docs   : D x W matrix of term counts, one row per document
% topics : topic-word parameters, passed through to ldae_chibms
% alpha  : Dirichlet hyperparameter, passed through to ldae_chibms
% option : 'num-its' (default 100) -- iterations for the Chib-style estimator
%
% perplex     : exp(-ll_per_word)
% ll          : total log-likelihood summed over all documents
% ll_per_word : total log-likelihood divided by the total token count

[num_its ] =process_options(varargin,'num-its',100);
doc_log_prob = 0;
fprintf('computing perplexity');
for i = 1:size(docs,1)
 % termCountsToWordList expands a count row into a token list;
 % ldae_chibms estimates that document's marginal log-likelihood.
 doc_log_prob = doc_log_prob + ldae_chibms(termCountsToWordList(docs(i,:)),topics,alpha,num_its);
 if rem(i,10)==0, fprintf('='); end % progress marker every 10 docs
end
fprintf('\n');
% BUGFIX: the second output `ll` was declared but never assigned, so
% calling [perplex ll] = computePerplexity(...) raised an error.
ll = doc_log_prob;
ll_per_word = doc_log_prob / sum(sum(docs));
perplex = exp(-ll_per_word);
end
View
Binary file not shown.
View
Binary file not shown.
@@ -0,0 +1,24 @@
function [e_log_beta score lv_score word_score] = computeBetaDirichlet(ecounts,eta)
% function [e_log_beta score lv_score word_score] = computeBetaDirichlet(ecounts,eta)
%
% standard variational Bayesian treatment of E[log beta], where
% beta ~ Dirichlet(eta)
%
% E[log(beta)] = digamma(counts + eta) - digamma(sum(counts + eta))
% accepts multiple rows of ecounts
% eta can be a vector or a scalar (indicating a symmetric prior)
%
% ecounts : K x W matrix of expected counts, one row per topic
% eta     : prior; scalar (symmetric) or 1 x W vector; defaults to 0
% e_log_beta : K x W matrix of E[log beta]
% score      : lv_score + word_score (only meaningful when nargout > 1)
% word_score : sum_k ecounts(k,:) * E[log beta_k]'
% lv_score   : -sum_k KL( Dirichlet(ecounts(k,:)+eta) || Dirichlet(eta) )
%
% NOTE(review): with the default eta = 0, digamma is evaluated at 0 for any
% zero entry of ecounts -- presumably callers pass eta > 0 or strictly
% positive counts; verify against callers.
if nargin == 1, eta= 0; end
[K W] = size(ecounts);
if isscalar(eta), eta = repmat(eta,1,W); end

word_score = 0; lv_score = 0;
e_log_beta = zeros(size(ecounts));
for k = 1:K
 e_log_beta(k,:) = digamma(ecounts(k,:) + eta) - digamma(sum(ecounts(k,:)+eta));
 if nargout > 1
 % scores are accumulated only when requested, avoiding the KL computation
 word_score = word_score + ecounts(k,:)*e_log_beta(k,:)';
 lv_score = lv_score - kldirichlet(ecounts(k,:)+eta,eta);
 end
end
score = lv_score + word_score;
end
@@ -0,0 +1,31 @@
function eta = computeBetaSparseVariational(ecounts,eq_m,varargin)
%function eta = computeBetaSparseVariational(ecounts,eq_m,varargin)
% newton optimization, variational EM for tau.
%
% Alternates (variational EM) between:
%   E-step for tau : eq_inv_tau = 1 ./ eta.^2, capped at max-inv-tau
%   M-step for eta : Newton optimization of evalLogNormal
% until the delta iterator converges or 'max-its' is reached.
%
% ecounts : W x K expected counts (note: rows index words here)
% eq_m    : expected background log term; exp(eq_m) is passed to evalLogNormal
% options : 'max-its' (1), 'verbose' (false), 'init-eta' ([]),
%           'min-eta' (1e-20), 'max-inv-tau' (1e5)
[max_its verbose init_eta min_eta max_inv_tau] = ...
 process_options(varargin,'max-its',1,...
 'verbose',false,'init-eta',[],'min-eta',1e-20,...
 'max-inv-tau',1e5);
[W K] = size(ecounts); %eta = zeros(size(ecounts));

if isempty(init_eta),
 eta = zeros(W,1);
 eq_inv_tau = ones(size(eta));
else
 eta = init_eta;
 % clamp tiny entries away from zero (sign-preserving) so that
 % 1./(eta.^2) below cannot blow up
 eta(eta.^2<min_eta.^2) = sign(eta(eta.^2<min_eta^2))*min_eta;
 eq_inv_tau = 1./(eta.^2);
end

if ~verbose, fprintf('.'); end

em_iter = newDeltaIterator(max_its,'debug',verbose,'thresh',1e-4);

exp_eq_m = exp(eq_m); % hoisted: constant across EM iterations
while ~(em_iter.done)
 eta = newton(@evalLogNormal,eta,{ecounts,exp_eq_m,eq_inv_tau},'debug',verbose==1,'alpha',.1,'max-its',10000);
 eq_inv_tau = 1./(eta.^2);
 % cap E[1/tau] for numerical stability when eta is near zero
 eq_inv_tau(eq_inv_tau >= max_inv_tau) = max_inv_tau;
 em_iter = updateDeltaIterator(em_iter,eta);
end
end
View
@@ -0,0 +1,20 @@
function [l g step] = evalLogNormal(eta,counts,exp_eq_m,invsigsq)
%function [l g step] = evalLogNormal(eta,counts,exp_eq_m,invsigsq)
% Negated objective (and gradient / approximate Newton step) for the
% sparse deviation parameters eta under a Gaussian prior with elementwise
% precisions invsigsq. Negated so it can be minimized by newton().
%
% eta      : W x 1 current parameter vector
% counts   : W x K expected counts
% exp_eq_m : W x K exponentiated expected background term
% invsigsq : W x 1 prior precisions, E[1/tau]
% l    : objective value
% g    : gradient w.r.t. eta (computed only when nargout > 1)
% step : quasi-Newton step (computed only when nargout > 2)
Ck = sum(counts,1); C = sum(Ck); [W K] = size(counts);
denom = repmat(exp(eta),1,K).*exp_eq_m;
l = -(sum(eta' * counts) - Ck * log ( sum(denom) )' - 0.5 * trace(eta' * spdiag(invsigsq) * eta));
if nargout > 1
 beta = Ck * normalize_rows(denom') / (C + 1e-10); %expected beta; +1e-10 guards C == 0
 g = -(sum(counts,2) - C * beta' - invsigsq .* eta);
 if nargout > 2
 % Diagonal-plus-rank-one Hessian approximation, inverted with what
 % appears to be the Sherman-Morrison identity -- TODO confirm against
 % the derivation; the commented line is an equivalent earlier form.
 avec = -1./ (C * beta' + invsigsq);
 a_times_g = (-g) .* avec;
 c_times_a_times_beta = C * avec .* beta';
 %step = -a_times_g + c_times_a_times_beta ./ (1 + beta * c_times_a_times_beta) * (beta * a_times_g);
 step = -a_times_g + c_times_a_times_beta .* (beta * a_times_g ./ (1 + beta * c_times_a_times_beta));
 end
end
end
+
+%these are slower
+%l = -(sum(eta' * counts) - Ck * log ( exp(eta)' * exp(eq_m) )' - 0.5 * sum(eta.*eta.*invsigsq));
+%l = -(sum(eta' * counts) - Ck * log ( exp(eta)' * exp(eq_m) )' - 0.5 * (eta.^2)'*invsigsq);
@@ -0,0 +1,27 @@
function [l g step] = evalQTauLogAMinus1(log_a_minus_1,b,etasq)
%function [l g step] = evalQTauLogAMinus1(log_a_minus_1,b,etasq)
%
%Q(tau) = Gamma(a,b)
%eta \sim N(0,tau)
%tau \sim 1/tau
%
% Negated variational objective in the Gamma shape parameter a,
% reparameterized as log(a - 1) so the optimizer works unconstrained
% while a > 1 is enforced.
%
%a is Wx1 (recovered from log_a_minus_1 below)
%b is Wx1
%eta is Wx1 (enters only through eta.^2, passed as etasq)

a = exp(log_a_minus_1) + 1;

%l = -.5 E[log tau] - .5 eta.^2 E[1/tau] - E[log tau] - E[log Q(tau)]
log_b = log(b);
l = -(a + .5).*(digamma(a) + log_b) - .5 * etasq ./ ((a-1).*b) + a + gammaln(a) + a .* log_b;
g = -(a + .5).*(trigamma(a)) + .5 * etasq ./ (b .* (a-1) .* (a-1)) + 1;

l = -sum(l);
% negate for minimization; the (a - 1) factor is the chain rule for the
% log(a-1) reparameterization: da/dlog(a-1) = a - 1
g = -g .* (a - 1);

step = 0; %do this later
if nargout > 2
 error('newton step size not yet implemented');
end

end
View
@@ -0,0 +1,10 @@
function makeTopicReport(eta,vocab,varargin)
%function makeTopicReport(eta,vocab,varargin)
% Print the top words of each topic, ranked by how far each word's weight
% deviates from a background distribution (default: columnwise mean of eta).
[N background] = process_options(varargin,'N',10,'background',mean(eta));
num_topics = size(eta,1);
for topic = 1:num_topics
    % label each line with its topic index when there is more than one topic
    if (num_topics>1), fprintf('%d. ',topic); end
    deviation = eta(topic,:) - background;
    % report only words whose deviation from background is non-negligible
    num_legit = sum(abs(deviation)>1e-3);
    top_words = sortidxs(deviation,2,'descend',min(N,num_legit));
    fprintf('%s ',vocab{top_words})
    fprintf('\n');
end
end
View
@@ -0,0 +1,25 @@
function [tr_words te_words widx tr_idx te_idx] = preprocess(counts,varargin)
%function [tr_words te_words widx tr_idx te_idx] = preprocess(counts,varargin)
% Split a document-term matrix into train/test folds and restrict the
% vocabulary to the most frequent training words.
%
% counts  : N x W document-term count matrix
% options : 'debug' (0), 'max-words' (5000), 'num-folds' (5), 'fold' (1),
%           'holdout' (1, passed to holdoutWords)
% tr_words/te_words : count matrices restricted to the selected vocabulary
% widx              : indices of the retained words
% tr_idx/te_idx     : document indices of each split (empty docs dropped)
[debug max_words num_folds fold holdout] = process_options(varargin,'debug',0,'max-words',5000,'num-folds',5,'fold',1,'holdout',1);
[N W] = size(counts);

counts = holdoutWords(counts,holdout);

% every num_folds-th document, starting at `fold`, goes to the test set
te_idx = fold:num_folds:N;
tr_idx = setdiff_sorted(1:N,te_idx);
if debug
 % debug mode: subsample to at most 1000 train / 100 test documents
 tr_idx = tr_idx(randsample(numel(tr_idx),min(1000,numel(tr_idx))));
 te_idx = te_idx(randsample(numel(te_idx),min(100,numel(te_idx))));
end
% document frequency of each word within the training split
count_sums = sum(counts(tr_idx,:)>0);
max_words = min(max_words,sum(count_sums>0));

% keep the max_words most frequent training words
widx = sortidxs(count_sums,2,'descend',max_words);
% drop documents left empty by the vocabulary restriction
tr_idx = tr_idx(sum(counts(tr_idx,widx),2)>0);
te_idx = te_idx(sum(counts(te_idx,widx),2)>0);
tr_words = [counts(tr_idx,widx)]; %sum(ap_full(tr_idx,nonwords),2)];
te_words = [counts(te_idx,widx)]; %sum(ap_full(te_idx,nonwords),2)];

%tr_words = holdoutWords(tr_words,holdout);
%te_words = holdoutWords(te_words,holdout);
end
View
@@ -0,0 +1,32 @@
function runLDA(K,seed,W,varargin)
%function runLDA(K,seed,W,varargin)
% Experiment driver: load a dataset, preprocess it, run sparseTAM in
% non-sparse (LDA-like) and/or sparse (SAGE) mode, and save the workspace.
%
% K    : number of topics
% seed : random seed, also embedded in the output filename
% W    : maximum vocabulary size passed to preprocess
% options: 'debug' (false), 'dataset' ('20news'),
%          'do-non-sparse' (1), 'do-sparse' (1)
[debug dataset do_non_sparse do_sparse] = process_options(varargin,'debug',false,'dataset','20news','do-non-sparse',1,'do-sparse',1);

directory = sprintf('traces.lda.%s',dataset);
load(sprintf('data/%s.mat',dataset));

% NOTE(review): the variables used below (tr_data, counts, words, vocab)
% are assumed to come from the loaded .mat file -- in particular, the
% '20news' branch never assigns vocab; verify the .mat files provide it.
if strcmp(dataset,'20news')
 words = tr_data;
 [tr_words te_words widx] = preprocess(words,'max-words',W,'debug',debug,'num-folds',10);
else
 [tr_words te_words widx] = preprocess(counts','max-words',W,'holdout',0.1,'debug',debug,'num-folds',50);
 vocab = words;
end

if ~exist(directory,'dir')
 mkdir(directory);
end

options = {'seed',seed,'te-x',te_words,'vocab',vocab(widx),'max-mstep-its',1000};
if (debug), options = cat(2,options,'max-its',10); end % short run in debug mode

basename = sprintf('%s/out.%d.%d',directory,K,seed);

if do_non_sparse
% sparse=0: standard (dense) topic model baseline
[ig theta_lda eta_lda] = sparseTAM(tr_words,K,'sparse',0,'compute-perplexity',1000,options{:});
end
if do_sparse
% sparse=1: sparse additive (SAGE-style) model
[ig theta_sage eta_sage] = sparseTAM(tr_words,K,'sparse',1,'compute-perplexity',50,options{:});
end
save(sprintf('%s.final.mat',basename));
end
Oops, something went wrong.

0 comments on commit 7b146cd

Please sign in to comment.