Skip to content
Browse files

Placement algorithm functional

  • Loading branch information...
1 parent ad17f65 commit e3e6cd1f120db406f560e9b36592ea1ef7dd1506 @breadbaron breadbaron committed
Showing with 88 additions and 67 deletions.
  1. +15 −6 python/visualizeReconstruction.py
  2. +30 −23 src/protpal/ReadProfileScore.cc
  3. +39 −35 src/protpal/protpal.cc
  4. +4 −3 src/protpal/reconstruction.cc
View
21 python/visualizeReconstruction.py
@@ -2,7 +2,7 @@
from math import exp, log
from parseSexpr import str2sexpr
from SExpr import *
-from extra_functions import float2str
+from extra_functions import float2str, get_cmd_args
allReqsFound = True
try:
from weblogolib import *
@@ -97,16 +97,25 @@ def write_PNG(filename, weights, alphabet, type="weblogo"):
node = profile.get_value_for_tag("node")
incoming_alphabet = list("arndcqeghilkmfpstwyv".upper()) # eventually get this from the profile file
alphabet = str(unambiguous_protein_alphabet)
-viterbi_path = profile.find_all_with_tag("viterbi_path")[0]
+try:
+ viterbi_path = profile.find_all_with_tag("viterbi_path")[0]
+except:
+ sys.stderr.write("Viterbi path not found...continuing\n")
+ viterbi_path = []
outString = "digraph profile_node_"+node+"{\nedge[arrowsize=0.5];\n rankdir=LR\n"
colors = r.rainbow(20);
sys.stderr.write("Creating PNGs for each state in profile...\n")
-maxPostProb = max( [ getfloat(state.get_value_for_tag("postprob")) for state in profile.find_all_with_tag("state") \
- if not state.get_value_for_tag("type") in ['start', 'end','wait']])
-minPostProb = min( [ getfloat(state.get_value_for_tag("postprob")) for state in profile.find_all_with_tag("state") \
+try:
+ maxPostProb = max( [ getfloat(state.get_value_for_tag("postprob")) for state in profile.find_all_with_tag("state") \
+ if not state.get_value_for_tag("type") in ['start', 'end','wait']])
+ minPostProb = min( [ getfloat(state.get_value_for_tag("postprob")) for state in profile.find_all_with_tag("state") \
if not state.get_value_for_tag("type") in ['start', 'end','wait']])
+except:
+ sys.stderr.write("Postprobs not found...continuing\n")
+ maxPostProb = minPostProb = 1
+
statecount = 0
@@ -173,7 +182,7 @@ def write_PNG(filename, weights, alphabet, type="weblogo"):
dotFileName = 'test.dot'
-outputFileName = 'output.pdf'
+outputFileName = sys.argv[1]
DOTcmd = "dot -Tpdf %s > %s"%(dotFileName, outputFileName)
fh=open(dotFileName,'w')
View
53 src/protpal/ReadProfileScore.cc
@@ -62,7 +62,7 @@ bfloat ReadProfileScore::get_score(const Read& read, bool viterbi)
dummy_ostream, // hmmoc filestream
false, // only write hmmoc file, don't do actual DP
false, // keep backPointers - for finding viterbi traceback
- true); // display logging messages
+ false); // display logging messages
value = get_forward_value();
}
clear_DP_matrix();
@@ -188,8 +188,8 @@ void ReadProfileScore::fill_DP_matrix(const Read& read, ostream& hmmoc, bool hmm
cerr<<"\tInvestigating transition from : "<< pairHMM.state_name[*kPrime] <<endl;
iPrime = get_incoming_read_state(i, *kPrime);
// We can be sure this DP cell will be -inf, so we continue
- // if (iPrime == -1 && pairHMM.state_type[*kPrime] != "start")
- // continue;
+ if (iPrime == -1 && pairHMM.state_type[*kPrime] != "start")
+ continue;
hmm_transition_weight = pairHMM.get_transition_weight(*kPrime, *k);
if (logging)
cerr<<"\tTransition weight between HMM states: " << hmm_transition_weight << endl;
@@ -216,7 +216,7 @@ void ReadProfileScore::fill_DP_matrix(const Read& read, ostream& hmmoc, bool hmm
toAdd = get_DP_cell(iPrime, *jPrime, *kPrime);
if (! toAdd > 0.0)
{
- cerr<<"\tThis incoming trans had 1e-inf Forward value\n";
+ // cerr<<"\tThis incoming trans had 1e-inf Forward value\n";
continue;
}
if (*j != *jPrime)
@@ -324,8 +324,8 @@ bfloat ReadProfileScore::get_viterbi_value(void)
inline int ReadProfileScore::get_incoming_read_state(int readIndex, state hmm_state)
{
- if (readIndex == 0)
- cerr<<"Warning: incoming on start of read called!\n";
+ // if (readIndex == 0)
+ // cerr<<"Warning: incoming on start of read called!\n";
if (pairHMM.state_type[hmm_state] == "insert")
return readIndex;
@@ -351,7 +351,7 @@ inline vector<state> ReadProfileScore::get_incoming_profile_states(state profile
ReadProfileModel::ReadProfileModel(void) //Alphabet& alphabet_in, Irrev_EM_matrix& rate_matrix_in)
{
// eventually we'd like to optimize this value!
- branch_length = 0.1;
+ branch_length = 0.01;
// Build up the HMM
num_states=-1;
@@ -472,30 +472,41 @@ inline bfloat ReadProfileModel::get_emission_weight(int readIndex, state profile
emissionWeight = 0.0;
// cerr<< "Size of read profile: " << read_profile.size() << endl;
// cerr<< "Index queried: " << readIndex << endl;
- // toks = sub_alphabet.tokens()
+ vector<sstring> toks = sub_alphabet.tokens();
+
+ // Match state: emit to both read and profile
if (state_type[hmm_state] == "match")
- for (symbolIter = read_profile[readIndex].begin();
- symbolIter != read_profile[readIndex].end(); symbolIter++)
- for (alphIdx = 0; alphIdx < alphabet_size; alphIdx++)
- emissionWeight += equilibrium_dist[alphIdx] *
- conditional_sub_matrix(alphIdx,symbolIter->first) * symbolIter->second;
+ {
+ for (symbolIter = read_profile[readIndex].begin();
+ symbolIter != read_profile[readIndex].end(); symbolIter++)
+ for (alphIdx = 0; alphIdx < alphabet_size; alphIdx++)
+ {
+ emissionWeight += equilibrium_dist[alphIdx]* // prior on this character
+ profile->get_absorb_weight(profileState, alphIdx)* //absorption by profile
+ conditional_sub_matrix(alphIdx,symbolIter->first)* // substitute to read's character
+ symbolIter->second; // read's affinity for the given character
+ }
+ //cerr<<"\n\nThe emission weight was: " << emissionWeight << endl;
+ }
+ // Insert state: emit only to profile - product of equilibrium prob and absorb weight
else if ( state_type[hmm_state] == "insert" )
for (alphIdx = 0; alphIdx < alphabet_size; alphIdx++)
- emissionWeight += profile->get_absorb_weight(profileState, alphIdx) *
- equilibrium_dist[alphIdx];
+ emissionWeight += profile->get_absorb_weight(profileState, alphIdx) * // absorption by profile
+ equilibrium_dist[alphIdx]; // prior on this character
+ // Delete state: emit only to read - product of equilibrium prob and affinity for character
else if ( state_type[hmm_state] == "delete" )
for (symbolIter = read_profile[readIndex].begin();
symbolIter != read_profile[readIndex].end(); symbolIter++)
{
// cerr<<toks[*symbolIter] << " has weight: " << equilibrium_dist[alphIdx] <<endl;
- emissionWeight += equilibrium_dist[symbolIter->first]*symbolIter->second;
+ emissionWeight += equilibrium_dist[symbolIter->first]*// prior on character
+ symbolIter->second; // read's affinity for given character
}
-
else
emissionWeight = 1.0;
- //cerr<<"\nThe emission weight was: " << result << endl;
+
// cerr<<"Done\t";
return emissionWeight;
}
@@ -509,11 +520,7 @@ void ReadProfileModel::set_substitution_model(Alphabet& alphabet_in, Irrev_EM_ma
equilibrium_dist = rate_matrix_in.create_prior();
}
-
-
-
-
-// HMMoC Adapter - an option to the ReadProfileScore class
+// HMMoC Adapter - an option to the ReadProfileScore class. Not really functional just yet...
void ReadProfileScore::HMMoC_adapter(const char* filename, bool precompute)
{
start_clique.clear(); mainClique.clear(); end_clique.clear();
View
74 src/protpal/protpal.cc
@@ -31,6 +31,8 @@
int main(int argc, char* argv[])
{
+ try{
+
time_t start,end;
time (&start);
// A few utility variables
@@ -38,14 +40,13 @@ int main(int argc, char* argv[])
vector<Node> children;
vector<double> branchLengths;
vector<string> node_names;
- double verySmall = 0.001; //proxy for zero-length branches
+ double verySmall = 1e-5; //proxy for zero-length branches
double branch_length;
string alignString;
// create main reconstruction object
Reconstruction reconstruction(argc, argv);
-
// yeccch - I've mostly moved over to using weight_profiles, but some
// hackiness still persists! -OW
// It seems like this is used only in reading transducers in from a file...def. fixable
@@ -67,7 +68,7 @@ int main(int argc, char* argv[])
if (fn->second == "None")
continue;
else
- cerr<< reconstruction.tree.node_name[fn->first] << "\t" << fn->second << endl;
+ cerr<< "\t" << reconstruction.tree.node_name[fn->first] << "\t" << fn->second << endl;
cerr<<"Parsing reads...";
map<string, string> reads =
@@ -96,15 +97,22 @@ int main(int argc, char* argv[])
}
}
-
// Display stuff that got stored in scores
+ cerr<<"\n\n";
+ cout<<"#Read\tNode\tPosteriorProbability\n";
+ bfloat totalScore;
for (ScoreMap::iterator readIter = scores.begin(); readIter != scores.end();
++readIter)
{
- cout << "Read: " << readIter->first << "\n";
+ totalScore = 0.0;
+ for (map<string, bfloat>::iterator nodeIter = (readIter->second).begin();
+ nodeIter != (readIter->second).end(); ++nodeIter)
+ totalScore += nodeIter->second;
+
for (map<string, bfloat>::iterator nodeIter = (readIter->second).begin();
nodeIter != (readIter->second).end(); ++nodeIter)
- cout << "\t" << nodeIter->first << "\t" << nodeIter->second<<endl;
+ cout << readIter->first<< "\t" << nodeIter->first << "\t" << nodeIter->second/totalScore <<endl;
+ cout<< "\n\n";
}
@@ -135,37 +143,27 @@ int main(int argc, char* argv[])
reconstruction.root_profile_filename = reconstruction.profile_to_make + ".sexpr";
}
- if ( reconstruction.tree.is_leaf(new_root) )
+ if (reconstruction.tree.is_leaf(new_root) )
{
- //Just write the thing, and exit. Implement an exact-match profile writer
- ExactMatch leaf(
- reconstruction.sequences[reconstruction.tree.node_name[new_root]], // sequence
- new_root, //tree index
- reconstruction.alphabet, // sequence alphabet
- reconstruction.codon_model // codon model?
- );
- // Then, make an absorbing transducer from this, and write the profile from there
- AbsorbingTransducer leafAbsorb(&leaf);
- ofstream saved_profile;
- saved_profile.open(reconstruction.root_profile_filename.c_str());
- state_path dummy; // dummy viterbi path
- leafAbsorb.write_profile(saved_profile, dummy);
- saved_profile.close();
- exit(0);
+ Phylogeny::Node n = reconstruction.tree.add_named_node("", new_root, 1e-5);
+ reconstruction.tree.node_name[new_root] = "tmpRoot";
+ reconstruction.tree.node_name[n] = reconstruction.profile_to_make; ;
+ }
+ // Re-root the tree at the desired node, and then proceed with reconstruction
+ if ( new_root != reconstruction.tree.root )
+ {
+ string tmpFileName = "tree_tmp.newick";
+ ofstream tmpTreeFile(tmpFileName.c_str());
+ reconstruction.tree.write(tmpTreeFile, -1, new_root);
+ tmpTreeFile.close();
+ ifstream tree_file(tmpFileName.c_str());
+ reconstruction.tree.read(tree_file); //tmpFileName.c_str());
+ reconstruction.tree.force_binary();
+ reconstruction.set_node_names();
+ system("rm -f tree_tmp.newick");
}
- string tmpFileName = "tree_tmp.newick";
- ofstream tmpTreeFile(tmpFileName.c_str());
- reconstruction.tree.write(tmpTreeFile, -1, new_root);
- tmpTreeFile.close();
- ifstream tree_file(tmpFileName.c_str());
- reconstruction.tree.read(tree_file); //tmpFileName.c_str());
- reconstruction.tree.force_binary();
- system("rm -f tree_tmp.newick");
}
- cout<<"The new tree:\n ";
- reconstruction.tree.write(cout, -1, -1);
-
// These transducers remain the same throughout the traversal, so we can initialize them
// once and for all and leave them.
SingletTrans R(reconstruction.alphabet, reconstruction.rate_matrix, reconstruction.root_insert_prob);
@@ -389,9 +387,9 @@ int main(int argc, char* argv[])
if (reconstruction.loggingLevel>=1)
{
- cerr<<"\tLeft profile has: "<<profile.left_profile.num_delete_states;
+ cerr<<"\tLeft profile " << profile.left_profile.name << " has: "<<profile.left_profile.num_delete_states;
cerr<<" absorbing states.\n";
- cerr<<"\tRight profile has: "<<profile.right_profile.num_delete_states;
+ cerr<<"\tRight profile " << profile.right_profile.name << " has: "<<profile.right_profile.num_delete_states;
cerr<< " absorbing states.\n";
}
@@ -759,6 +757,12 @@ int main(int argc, char* argv[])
{
cerr<<"\nProtPal reconstruction completed without errors. \n";
}
+ }
+ catch(const Dart_exception& e)
+ {
+ cerr<<e.what();
+ exit(1);
+ }
return(0);
View
7 src/protpal/reconstruction.cc
@@ -147,10 +147,10 @@ Reconstruction::Reconstruction(int argc, char* argv[])
viterbi = !stoch_trace;
codon_model = (codon_matrix_filename != "None");
-
// Make sure we have the essential data - sequences and a tree
// First, make sure we have sequence data from somewhere
- if(stkFileName == "None" && fastaFileName =="None" && !simulate && phylocomposer_filename == "None")
+ if (stkFileName == "None" && fastaFileName =="None" && !simulate && phylocomposer_filename == "None" &&
+ reads_to_place_filename == "None")
{
error += "\tNo sequence file could be imported. Use -stk or -fa to specify a sequence file\n";
all_reqd_args = false;
@@ -221,7 +221,8 @@ Reconstruction::Reconstruction(int argc, char* argv[])
// Irrev_EM_matrix rate_matrix(1,1);
// Alphabet alphabet ("uninitialized", 1);
ECFG_builder::init_chain_and_alphabet (alphabet, rate_matrix, ecfg_sexpr);
- parse_sequences(alphabet);
+ if (stkFileName || fastaFileName)
+ parse_sequences(alphabet);
if(estimate_root_insert)
{

0 comments on commit e3e6cd1

Please sign in to comment.
Something went wrong with that request. Please try again.