Permalink
Browse files

Added --min-postprob option to modulate --postprob reporting of posterior probabilities of all DP matrix cells in xrate
  • Loading branch information...
1 parent 8a118f1 commit 7d536ef824b071a0e827eba6454a9ce4f710b809 Ian Holmes committed May 3, 2010
View
@@ -73,7 +73,6 @@ AC_DEFUN([GUILE_PROGS],
AC_PATH_PROG(GUILE_TOOLS,guile-tools)
AC_SUBST(GUILE_TOOLS)
AC_SUBST(GUILE_INCLUDED,[1])
- AC_DEFINE([GUILE_INCLUDED], [], [Guile is available.])
AC_SUBST(GUILE_DEPS,[guile])
fi
fi
View
@@ -1,13 +1,13 @@
#include <stdio.h>
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
#include <libguile.h>
#include "guile/stockholm-type.h"
#include "guile/newick-type.h"
#include "ecfg/guile-ecfg.h"
#endif /* GUILE_INCLUDED */
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
static void inner_main (void *closure, int argc, char **argv)
{
init_stockholm_type();
@@ -20,7 +20,7 @@ static void inner_main (void *closure, int argc, char **argv)
int main (int argc, char **argv)
{
SExpr_Scheme_evaluator::mark_guile_initialized(); // hack to avoid initializing guile twice
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
scm_boot_guile (argc, argv, inner_main, 0);
#endif /* GUILE_INCLUDED */
printf ("Guile unavailable - try installing guile and rebuilding\n");
View
@@ -590,7 +590,9 @@ void ECFG_scores::make_GFF (GFF_list& gff_list,
// record CYK score
sstring cyk_val, cyk_tag;
- cyk_tag << ECFG_GFF_LogCYK_tag << '(' << info.name << ')';
+ cyk_tag << ECFG_GFF_LogCYK_tag;
+ if (record_probs_of_all_states)
+ cyk_tag << '(' << info.name << ')';
cyk_val << Nats2Bits (cyk_ll);
group_key_val[cyk_tag] = cyk_val;
@@ -600,7 +602,9 @@ void ECFG_scores::make_GFF (GFF_list& gff_list,
if (s == state || record_probs_of_all_states)
{
sstring pp_val, pp_tag;
- pp_tag << ECFG_GFF_LogPostProb_tag << '(' << state_info[s].name << ')';
+ pp_tag << ECFG_GFF_LogPostProb_tag;
+ if (record_probs_of_all_states)
+ pp_tag << '(' << state_info[s].name << ')';
pp_val << Nats2Bits (pp_calc->post_state_ll (s, subseq));
group_key_val[pp_tag] = pp_val;
}
@@ -611,7 +615,9 @@ void ECFG_scores::make_GFF (GFF_list& gff_list,
if (s == state || record_probs_of_all_states)
{
sstring ins_val, ins_tag;
- ins_tag << ECFG_GFF_LogInsideProb_tag << '(' << state_info[s].name << ')';
+ ins_tag << ECFG_GFF_LogInsideProb_tag;
+ if (record_probs_of_all_states)
+ ins_tag << '(' << state_info[s].name << ')';
ins_val << Nats2Bits (ins_calc->state_inside_ll (s, subseq));
group_key_val[ins_tag] = ins_val;
}
View
@@ -3,7 +3,7 @@
#include "util/vector_output.h"
#include "ecfg/ecfgsexpr.h"
-#ifdef BEAGLE_INCLUDED
+#if defined(BEAGLE_INCLUDED) && BEAGLE_INCLUDED
#include "libhmsbeagle/beagle.h"
#endif /* BEAGLE_INCLUDED */
@@ -438,7 +438,7 @@ void ECFG_EM_matrix::use_precomputed_phyloemit (Emit_loglike_matrix& phyloemit)
void ECFG_EM_matrix::compute_phylo_likelihoods_with_beagle()
{
-#ifdef BEAGLE_INCLUDED
+#if defined(BEAGLE_INCLUDED) && BEAGLE_INCLUDED
// set up the outputs & inputs
const int subseqs = env.subseqs();
@@ -1202,7 +1202,7 @@ void ECFG_inside_outside_matrix::annotate (Stockholm& stock, GFF_list& gff_list,
stock.set_gc_annot (logpostprob_tag, ppbycol);
}
-void ECFG_inside_outside_matrix::annotate_all_post_state_ll (GFF_list& gff_list, const sstring& seqname, const ECFG_cell_score_map& annot, const sstring& annot_tag) const
+void ECFG_inside_outside_matrix::annotate_all_post_state_ll (GFF_list& gff_list, const sstring& seqname, const ECFG_cell_score_map& annot, const sstring& annot_tag, double min_postprob) const
{
for (int subseq_idx = 0; subseq_idx < inside.env.subseqs(); ++subseq_idx)
{
@@ -1212,32 +1212,49 @@ void ECFG_inside_outside_matrix::annotate_all_post_state_ll (GFF_list& gff_list,
{
ECFG_cell_score_map::const_iterator annot_state_iter = annot.find (ECFG_subseq_state (subseq, *s));
const bool annot_found = annot_state_iter != annot.end();
-
- annotate_post_prob (gff_list, seqname, *s, subseq);
- if (annot_found)
- gff_list.back().set_value (annot_tag.c_str(), "1");
+ const Prob pp = Nats2Prob (post_state_ll (*s, subseq_idx));
+ if (pp >= min_postprob || annot_found)
+ {
+ map<sstring,sstring> extra;
+ if (annot_found)
+ extra[annot_tag] = sstring("1");
+ annotate_post_prob (gff_list, seqname, *s, subseq, &extra);
+ }
}
}
}
-void ECFG_inside_outside_matrix::annotate_post_prob (GFF_list& gff_list, const sstring& seqname, int state, const Subseq_coords& subseq) const
+void ECFG_inside_outside_matrix::annotate_post_prob (GFF_list& gff_list, const sstring& seqname, int state, const Subseq_coords& subseq, const map<sstring,sstring>* extra_annotations) const
{
const int subseq_idx = inside.env.find_subseq_idx (subseq.start, subseq.len);
+ const Log2 inout_lg = Nats2Bits (post_state_ll (state, subseq_idx));
+ const Log2 inside_lg = Nats2Bits (inside.cell (subseq_idx, state));
GFF gff;
gff.seqname = seqname.size() ? seqname : inside.stock.get_name();
gff.source = inside.ecfg.name;
gff.feature = inside.ecfg.state_info[state].name;
gff.start = subseq.start + 1;
gff.end = subseq.end();
- gff.score = Nats2Bits (inside.cell (subseq_idx, state));
+ gff.score = inout_lg;
const sstring unique_id = gff_list.create_unique_id();
gff.set_value (GFF_ID_tag, unique_id.c_str());
- sstring score_str;
- score_str << Nats2Bits (post_state_ll (state, subseq_idx));
- gff.set_value (ECFG_GFF_LogPostProb_tag, score_str.c_str());
+ sstring inout_tag, inout_score_str;
+ inout_score_str << inout_lg;
+ gff.set_value (ECFG_GFF_LogPostProb_tag, inout_score_str.c_str());
+
+ sstring inside_tag, inside_score_str;
+ inside_score_str << inside_lg;
+ gff.set_value (ECFG_GFF_LogInsideProb_tag, inside_score_str.c_str());
+
+ if (extra_annotations)
+ {
+ typedef map<sstring,sstring> AnnotMap;
+ for_const_contents (AnnotMap, *extra_annotations, tv)
+ gff.set_value (tv->first.c_str(), tv->second.c_str());
+ }
gff_list.push_back (gff);
}
View
@@ -217,8 +217,8 @@ struct ECFG_inside_outside_matrix : Grammar_state_enum, ECFG_posterior_probabili
// Annotation methods
void annotate (Stockholm& stock, GFF_list& gff_list, const sstring& seqname, const ECFG_cell_score_map& annot, const sstring& logpostprob_tag) const;
- void annotate_all_post_state_ll (GFF_list& gff_list, const sstring& seqname, const ECFG_cell_score_map& annot, const sstring& annot_tag) const;
- void annotate_post_prob (GFF_list& gff_list, const sstring& seqname, int state, const Subseq_coords& subseq) const;
+ void annotate_all_post_state_ll (GFF_list& gff_list, const sstring& seqname, const ECFG_cell_score_map& annot, const sstring& annot_tag, double min_postprob = 0.) const;
+ void annotate_post_prob (GFF_list& gff_list, const sstring& seqname, int state, const Subseq_coords& subseq, const map<sstring,sstring>* extra_annotations = 0) const;
void annotate_hidden_classes (Stockholm& stock, const ECFG_cell_score_map& annot);
};
View
@@ -17,7 +17,7 @@
#define LOGPOSTPROB_TAG_PREFIX "PP"
// GFF tags for posterior probabilities
-#define CYK_STATE_LABEL "CYK" /* indicates that GFF postprob line is part of CYK trace */
+#define CYK_STATE_LABEL "inCYKParse" /* indicates that GFF postprob line is part of CYK trace */
// tags for training meta-information in grammar
#define TRAINING_INFO "training-info"
@@ -152,7 +152,8 @@ void ECFG_main::init_opts (const char* desc)
opts.add ("s -score", report_sumscore = false, "report Inside log-likelihood, corresponding to a sum over all parse trees");
opts.add ("c -confidence", report_confidence = false, "report Inside-Outside posterior log-probabilities of nodes in CYK parse tree");
opts.add ("pp -postprob", report_postprob = false, "report Inside-Outside posterior log-probabilities of all possible parse tree nodes");
- opts.add ("hc -hidden-classes", report_hidden_classes = false, "impute ML hidden classes at each site (for substitution models with hidden classes)");
+ opts.add ("mpp -min-postprob", min_postprob = .001, "minimum posterior probability to report for --postprob option");
+ opts.add ("hc -hidden-classes", report_hidden_classes = false, "impute ML hidden classes at each site & each taxon (for substitution models with hidden classes)");
opts.newline();
opts.print_title ("Annotation output");
@@ -163,7 +164,7 @@ void ECFG_main::init_opts (const char* desc)
opts.print_title ("Acceleration of annotation DP algorithms (experimental)");
opts.add ("fp -fast-prune", use_fast_prune = false, "attempt pruning algorithm in probability-space, rather than log-space (caveat: prone to underflow)");
-#ifdef BEAGLE_INCLUDED
+#if defined(BEAGLE_INCLUDED) && BEAGLE_INCLUDED
opts.add ("bgl -beagle", use_beagle = false, "use Beagle GPU library to do pruning");
#else /* BEAGLE_INCLUDED */
opts.add ("bgl -beagle", use_beagle = false); // if Beagle not compiled, then allow this option, but don't advertise it (using it will cause a warning)
@@ -702,24 +703,32 @@ void ECFG_main::annotate_alignments (ostream* align_stream)
{
if (report_postprob)
{
+ CTAG(6,XRATE) << "Annotating posterior probabilities for all subsequences/states\n";
sstring pp_tag;
pp_tag << LOGPOSTPROB_TAG_PREFIX << '_' << ecfg_name;
sstring trace_tag (CYK_STATE_LABEL);
- inout_mx->annotate_all_post_state_ll (gff_list, align_id, cyk_trace, trace_tag);
+ inout_mx->annotate_all_post_state_ll (gff_list, align_id, cyk_trace, trace_tag, min_postprob);
}
if (report_confidence)
{
+ CTAG(6,XRATE) << "Annotating posterior probabilities for CYK parse\n";
sstring pp_tag;
pp_tag << CONFIDENCE_TAG_PREFIX << '_' << ecfg_name;
inout_mx->annotate (*stock, gff_list, align_id, cyk_trace, pp_tag);
}
if (want_hidden_classes)
- inout_mx->annotate_hidden_classes (*stock, cyk_trace);
+ {
+ CTAG(6,XRATE) << "Annotating hidden class labels for CYK parse\n";
+ inout_mx->annotate_hidden_classes (*stock, cyk_trace);
+ }
if (want_ancestral_reconstruction)
- inout_mx->inside.reconstruct_MAP (*stock, cyk_trace, CYK_MAP_reconstruction_tag, ancrec_CYK_MAP, ancrec_postprob);
+ {
+ CTAG(6,XRATE) << "Annotating ancestral reconstruction for CYK parse\n";
+ inout_mx->inside.reconstruct_MAP (*stock, cyk_trace, CYK_MAP_reconstruction_tag, ancrec_CYK_MAP, ancrec_postprob);
+ }
}
}
View
@@ -31,6 +31,7 @@ struct ECFG_main
bool report_confidence;
bool report_postprob;
bool report_hidden_classes;
+ double min_postprob;
int max_subseq_len;
sstring grammars_filename;
sstring tree_grammar_filename;
@@ -4,7 +4,7 @@
#include "util/svisitor.h"
#include "seq/pkeywords.h"
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
#include "ecfg/guile-ecfg.h"
#endif /* GUILE_INCLUDED */
@@ -4,7 +4,7 @@
#include "tree/tree_alignment.h"
#include "util/sexpr.h"
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
#include <libguile.h>
#include "guile/guile-keywords.h"
#include "guile/stockholm-type.h"
@@ -15,7 +15,7 @@
#include "ecfg/ecfgsexpr.h"
#include "ecfg/guile-ecfg.h"
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
static void get_alphgram_sexpr (SCM alphabet_and_grammar_scm,
SExpr*& top_level_sexpr,
SExpr*& alphabet_and_grammar_sxpr)
@@ -193,7 +193,7 @@ void init_xrate_primitives (void)
// ECFG_Scheme_evaluator
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
static void*
register_grammar_functions (void* data)
{
@@ -213,7 +213,7 @@ register_grammar_functions (void* data)
ECFG_Scheme_evaluator::ECFG_Scheme_evaluator (const Stockholm* stock)
: stock(stock)
{
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
register_functions = &register_grammar_functions;
data = (void*) this;
#endif /* GUILE_INCLUDED */
@@ -5,17 +5,17 @@
#include "seq/stockholm.h"
#include "ecfg/ecfg.h"
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
#include "guile/guile-defs.h"
#endif /* GUILE_INCLUDED */
// ECFG
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
SCM ecfg_to_scm (const ECFG_scores& ecfg, const ECFG_counts* counts = 0);
#endif /* GUILE_INCLUDED */
// xrate functions
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
void init_xrate_primitives (void);
#endif /* GUILE_INCLUDED */
View
@@ -1,7 +1,7 @@
#ifndef GFF_INCLUDED
#define GFF_INCLUDED
-#include <list>
+#include <deque>
#include "util/sstring.h"
#include "util/Regexp.h"
#include "seq/biosequence.h"
@@ -71,7 +71,7 @@ struct GFF : GFF_enum, NSE
friend istream& operator>> (istream& ins, GFF& gff);
};
-struct GFF_list : list<GFF>, GFF_enum
+struct GFF_list : deque<GFF>, GFF_enum
{
// basic IO
friend ostream& operator<< (ostream& out, const GFF_list& gff_list);
@@ -422,7 +422,7 @@ SExpr_Scheme_evaluator::SExpr_Scheme_evaluator()
void SExpr_Scheme_evaluator::initialize()
{
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
if (!initialized)
scm_with_guile (register_functions, data);
write_proc = scm_variable_ref (scm_c_lookup("write"));
@@ -484,7 +484,7 @@ SExpr SExpr_Scheme_evaluator::evaluate (SExpr& sexpr) const
THROWEXPR("SExpr_Scheme_evaluator::evaluate called on an atom");
if (sexpr.is_empty_list())
THROWEXPR("SExpr_Scheme_evaluator::evaluate called on an empty list");
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
sstring result_string;
// iterate over all but the first child
SExpr::SExprIter iter = sexpr.child.begin();
View
@@ -4,7 +4,7 @@
#include "util/sexpr.h"
#include <map>
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
#include <libguile.h>
#endif /* GUILE_INCLUDED */
@@ -123,7 +123,7 @@ class SExpr_Scheme_evaluator
void* (*register_functions)(void *); // pointer to function that registers functions
void* data; // data that will be passed to *register_functions
static bool initialized;
-#ifdef GUILE_INCLUDED
+#if defined(GUILE_INCLUDED) && GUILE_INCLUDED
SCM write_proc;
#endif /* GUILE_INCLUDED */
public:

0 comments on commit 7d536ef

Please sign in to comment.