Added pseudocounts option to phyloExpCM_FreqsFromAlignment.py

jbloom · Mar 31, 2014 · 478d8d5 · 478d8d5
1 parent ebd9311
commit 478d8d5
Show file tree

Hide file tree

Showing 2 changed files with 5 additions and 1 deletion.
diff --git a/docs/phyloExpCM_FreqsFromAlignment.rst b/docs/phyloExpCM_FreqsFromAlignment.rst
@@ -32,6 +32,8 @@ The input file should contain the following keys:
 
 * *includestop* : specifies whether we tally the frequencies of stop codons as a possible amino acid. If *True* then we include stop codons in the tally; if *False* then we do not.
 
+* *pseudocounts* : integer number of pseudocounts added to the counts for each amino acid at each site. If you set this to zero, the actual fractions will be computed. But you might want to use a pseudocount (such as one) if you don't want to estimate zero frequencies.
+
 * *outputfile* is the created file in form the of the ``*_equilibriumpreferences.txt`` file created by the `mapmuts`_ script ``mapmuts_inferpreferences.py``. Specifically, for each site all the occurrences of each amino acid is counted. Gaps are disregarded, and stop codons are either included or disregarded depending on the value of *includestop*. For each site *r* the script counts the fraction of sequences (among those being counted at this site) that have *a* as the amino acid. The results are written to the create file as in::
 
     #SITE   WT_AA   SITE_ENTROPY    PI_A    PI_C    PI_D    PI_E    PI_F    PI_G    PI_H    PI_I    PI_K    PI_L    PI_M    PI_N    PI_P    PI_Q    PI_R    PI_S    PI_T    PI_V    PI_W    PI_Y    PI_*
@@ -53,6 +55,7 @@ Here is an example input file::
     alignmentfile cds_sequences.fasta
     translateseqs True
     includestop False
+    pseudocounts 1
     outputfile aminoacid_frequencies.txt
 
 

diff --git a/scripts/phyloExpCM_FreqsFromAlignment.py b/scripts/phyloExpCM_FreqsFromAlignment.py
@@ -35,6 +35,7 @@ def main():
         raise IOError("Failed to find alignmentfile of %s" % alignmentfile)
     translateseqs = phyloExpCM.io.ParseBoolValue(d, 'translateseqs')
     includestop = phyloExpCM.io.ParseBoolValue(d, 'includestop')
+    pseudocounts = phyloExpCM.io.ParseIntValue(d, 'pseudocounts')
     outputfile = phyloExpCM.io.ParseStringValue(d, 'outputfile')
 
     # read sequences, make sure all of same length
@@ -54,7 +55,7 @@ def main():
     count_d = {}
     aminoacids = mapmuts.sequtils.AminoAcids(includestop)
     for r in range(1, seqlength + 1):
-        count_d[r] = dict([(aa, 0) for aa in aminoacids])
+        count_d[r] = dict([(aa, pseudocounts) for aa in aminoacids])
     for (head, seq) in seqs:
         for r in range(1, seqlength + 1):
             aa = seq[r - 1]