Added phyloExpCM_FreqsFromAlignment.py script

jbloom · Mar 31, 2014 · ebd9311 · ebd9311
1 parent 08b7604
commit ebd9311
Show file tree

Hide file tree

Showing 4 changed files with 103 additions and 4 deletions.
diff --git a/docs/phyloExpCM_FreqsFromAlignment.rst b/docs/phyloExpCM_FreqsFromAlignment.rst
@@ -4,7 +4,7 @@
 phyloExpCM_FreqsFromAlignment.py
 ========================================
 
-This simple script gets the frequencies of different amino acids from a sequence
+This simple script gets the frequencies of amino acids at each site from a sequence
 alignment. It then writes them to a format equivalent to that of the ``*_equilibriumpreferences.txt`` files produced by the `mapmuts`_ script ``mapmuts_inferpreferences.py``.
 
 Typically you would want to run this script if you wanted to create a file giving the amino-acid frequencies in an alignment of sequences that could be analyzed with the `mapmuts`_ scripts ``mapmuts_siteprofileplots.py`` or ``mapmuts_preferencescorrelate.py``.
@@ -35,12 +35,16 @@ The input file should contain the following keys:
 * *outputfile* is the created file in form the of the ``*_equilibriumpreferences.txt`` file created by the `mapmuts`_ script ``mapmuts_inferpreferences.py``. Specifically, for each site all the occurrences of each amino acid is counted. Gaps are disregarded, and stop codons are either included or disregarded depending on the value of *includestop*. For each site *r* the script counts the fraction of sequences (among those being counted at this site) that have *a* as the amino acid. The results are written to the create file as in::
 
     #SITE   WT_AA   SITE_ENTROPY    PI_A    PI_C    PI_D    PI_E    PI_F    PI_G    PI_H    PI_I    PI_K    PI_L    PI_M    PI_N    PI_P    PI_Q    PI_R    PI_S    PI_T    PI_V    PI_W    PI_Y    PI_*
-    1   M   2.85229 0.0113803   0.0402777   0.0153121   0.0320438   0.013312    0.00936795  0.026916    0.0192017   0.0224047   0.018926    0.554093    0.0445008   0.0138125   0.0212192   0.0131771   0.0152268   0.0237621   0.0102606   0.0369008   0.0367991   0.0211056
-    2   A   0.968359    0.878703    0.00384431  0.00390501  0.00815666  0.0048199   0.00244426  0.00333017  0.00258064  0.00391181  0.00094265  0.0248087   0.00238593  0.00064321  0.00288323  0.00610788  0.0012339   0.001455    0.0290568   0.0107545   0.00492073  0.00311186
-    3   S   2.93851 0.0295947   0.00304848  0.00227603  0.00668501  0.131168    0.000710502 0.0199653   0.00804841  0.000619715 0.366698    0.0132841   0.0223199   0.0048327   0.0170484   0.00875982  0.090712    0.0171888   0.0102176   0.177257    0.069395    0.000170873
+    1   na   2.85229 0.0113803   0.0402777   0.0153121   0.0320438   0.013312    0.00936795  0.026916    0.0192017   0.0224047   0.018926    0.554093    0.0445008   0.0138125   0.0212192   0.0131771   0.0152268   0.0237621   0.0102606   0.0369008   0.0367991   0.0211056
+    2   na   0.968359    0.878703    0.00384431  0.00390501  0.00815666  0.0048199   0.00244426  0.00333017  0.00258064  0.00391181  0.00094265  0.0248087   0.00238593  0.00064321  0.00288323  0.00610788  0.0012339   0.001455    0.0290568   0.0107545   0.00492073  0.00311186
+    3   na   2.93851 0.0295947   0.00304848  0.00227603  0.00668501  0.131168    0.000710502 0.0199653   0.00804841  0.000619715 0.366698    0.0132841   0.0223199   0.0048327   0.0170484   0.00875982  0.090712    0.0171888   0.0102176   0.177257    0.069395    0.000170873
 
   In this file, the ``SITE_ENTROPY`` column gives the site entropy taken to the base 2.
 
+  In this file, all of the wildtype identities (``WT_AA``) are set to ``na`` because there is no wildtype in the sequence alignment.
+
+  In this file, residues are numbered sequentially starting with one according to the aligned sequences in *alignmentfile*.
+
 Example input file
 ---------------------------
 Here is an example input file::

diff --git a/docs/scripts.rst b/docs/scripts.rst
@@ -13,6 +13,7 @@ Here are the scripts:
    phyloExpCM_buildHyphyExpCM
    phyloExpCM_optimizeHyphyTree
    phyloExpCM_ExpModelOptimizeHyphyTree
+   phyloExpCM_FreqsFromAlignment
    phyloExpCM_multiHyphyRuns
 
 

diff --git a/scripts/phyloExpCM_FreqsFromAlignment.py b/scripts/phyloExpCM_FreqsFromAlignment.py
@@ -0,0 +1,93 @@
+#!python 
+
+"""Script to get frequencies from sequence alignment
+
+This script is part of the ``phyloExpCM`` package.
+
+Written by Jesse Bloom."""
+
+
+import sys
+import os
+import re
+import time
+import phyloExpCM.io
+import phyloExpCM.submatrix
+import mapmuts.bayesian
+import mapmuts.sequtils
+
+
+def main():
+    """Main body of script."""
+    print "\nRunning phyloExpCM_FreqsFromAlignment.py..."
+
+    # parse arguments
+    args = sys.argv[1 : ]
+    if len(args) != 1:
+        raise IOError("Script must be called with exactly one argument specifying the name of the input file.")
+    infilename = sys.argv[1]
+    if not os.path.isfile(infilename):
+        raise IOError("Failed to find infile of %s" % infilename)
+    d = phyloExpCM.io.ParseInfile(open(infilename))
+    print "\nRead the following key / value pairs from infile %s:\n%s" % (infilename, '\n'.join(["%s %s" % tup for tup in d.iteritems()]))
+    alignmentfile = phyloExpCM.io.ParseStringValue(d, 'alignmentfile')
+    if not os.path.isfile(alignmentfile):
+        raise IOError("Failed to find alignmentfile of %s" % alignmentfile)
+    translateseqs = phyloExpCM.io.ParseBoolValue(d, 'translateseqs')
+    includestop = phyloExpCM.io.ParseBoolValue(d, 'includestop')
+    outputfile = phyloExpCM.io.ParseStringValue(d, 'outputfile')
+
+    # read sequences, make sure all of same length
+    print "Reading alignment from %s..." % alignmentfile
+    seqs = mapmuts.sequtils.ReadFASTA(alignmentfile)
+    if not seqs:
+        raise ValueError("Failed to find any sequences in alignmentfile")
+    seqlength = len(seqs[0][1])
+    for (head, seq) in seqs:
+        if len(seq) != seqlength:
+            raise ValueError("All sequences are not the same length in alignmentfile. Are you sure they are aligned?")
+    if translateseqs:
+        if seqlength % 3:
+            raise ValueError("Sequences are supposed to be translated, but the lengths are not multiples of 3")
+        seqs = mapmuts.sequtils.Translate(seqs, readthrough_stop=True, translate_gaps=True)
+        seqlength = len(seqs[0][1])
+    count_d = {}
+    aminoacids = mapmuts.sequtils.AminoAcids(includestop)
+    for r in range(1, seqlength + 1):
+        count_d[r] = dict([(aa, 0) for aa in aminoacids])
+    for (head, seq) in seqs:
+        for r in range(1, seqlength + 1):
+            aa = seq[r - 1]
+            if aa == '-':
+                continue
+            elif aa == '*' and not includestop:
+                continue
+            else:
+                assert aa in aminoacids, "Invalid amino acid %s" % aa
+                count_d[r][aa] += 1
+
+    # write outputfile
+    print "\nNow writing the frequencies to %s..." % outputfile
+    f = open(outputfile, 'w')
+    f.write('#SITE\tWT_AA\tSITE_ENTROPY')
+    for aa in aminoacids:
+        f.write('\tPI_%s' % aa)
+    f.write('\n')
+    for r in range(1, seqlength + 1):
+        pi_d = {}
+        n = float(sum(count_d[r].values()))
+        if not n:
+            raise ValueError("No counts for site %d" % r)
+        pi_d = dict([(aa, count_d[r][aa] / n) for aa in aminoacids])
+        assert abs(1.0 - sum(pi_d.values())) < 1.0e-7, "Sum of frequencies not close to one for site %d" % r
+        f.write('%d\tna\t%f' % (r, mapmuts.bayesian.SiteEntropy(pi_d)))
+        for aa in aminoacids:
+            f.write('\t%f' % pi_d[aa])
+        f.write('\n')
+    f.close()
+
+    # script done
+    print "\nScript complete."
+
+
+main() # run the script
diff --git a/setup.py b/setup.py
@@ -54,5 +54,6 @@
             'scripts/phyloExpCM_optimizeHyphyTree.py',
             'scripts/phyloExpCM_ExpModelOptimizeHyphyTree.py',
             'scripts/phyloExpCM_multiHyphyRuns.py',
+            'scripts/phyloExpCM_FreqsFromAlignment.py',
             ],
 )