Removed custom '<clipN>' cs notation.

Instead of writing a script to process the '<clipN>' notation, I decided to stick with just using tuples to designate features that have clipping. This makes counting mutations and clipping easier.
jbloomlab · Aug 22, 2019 · dcdb543 · dcdb543 · khdcrawford · Aug 22, 2019
1 parent 74760e2
commit dcdb543
Show file tree

Hide file tree

Showing 3 changed files with 1,184 additions and 124 deletions.
diff --git a/alignparse/cs_tag.py b/alignparse/cs_tag.py
@@ -18,8 +18,7 @@
     'identity': ':[0-9]+',
     'substitution': r'\*[acgtn][acgtn]',
     'insertion': r'\+[acgtn]+',
-    'deletion': r'\-[acgtn]+',
-    'clip': '<clip[0-9]+>'
+    'deletion': r'\-[acgtn]+'
     }
 """dict: Short ``cs`` tag operation regular expression matches."""
 
@@ -53,9 +52,6 @@ def split_cs(cs_string, *, invalid='raise'):
     >>> split_cs(':32*nt*na:10-gga:5+aaa:10')
     [':32', '*nt', '*na', ':10', '-gga', ':5', '+aaa', ':10']
 
-    >>> split_cs('<clip8>:32*nt*na:10-gga:5')
-    ['<clip8>', ':32', '*nt', '*na', ':10', '-gga', ':5']
-
     >>> split_cs('bad:32*nt*na:10-gga:5', invalid='ignore') is None
     True
 
@@ -100,8 +96,6 @@ def cs_op_type(cs_op, *, invalid='raise'):
     'substitution'
     >>> cs_op_type(':45')
     'identity'
-    >>> cs_op_type('<clip8>')
-    'clip'
     >>> cs_op_type('*nt:45')
     Traceback (most recent call last):
     ...
@@ -151,10 +145,6 @@ def cs_op_len_target(cs_op, *, invalid='raise'):
     2
     >>> cs_op_len_target('+gc')
     0
-    >>> cs_op_len_target('<clip8>')
-    8
-    >>> cs_op_len_target('<clip46>')
-    46
     >>> cs_op_len_target('*nt:45')
     Traceback (most recent call last):
     ...
@@ -377,60 +367,56 @@ def extract_cs(self, start, end):
 
         return (feature_cs, clip5, clip3)
 
-def cs_to_sequence(cs, feat_name, feat_seq, *, custom_cs=False):
+
+def cs_to_sequence(cs, seq):
     """Convert `cs` string for a feature in `feat_seq` to nt sequence.
 
-    Paramters
-    ---------
-    cs : str
-        `cs` string for feature
-    feat_name : str
-        Name of feature for which the cs string is being converted to seq.
-    custom_cs : bool
-        If `True`, will process custom `cs` strings that include clip amounts
-        on the 5' and/or 3' ends. Default is False to require users to
-        to consider if they want to process `cs` strings with clipping.
+    Parameters
+    ----------
+    cs : str or tuple
+        `cs` string or tuple for feature. If there was clipping, this is a
+        tuple of (cs string, clip5, clip3).
+    seq : str
+        Sequence of target for region corresponding to `cs` string.
 
     Returns
     -------
     sequence : str
         Nucleotide sequence for specified feature in the query.
+
     """
     raise RuntimeError('not yet implemented')
 
-def cs_to_mutation_str(cs, feat_name, *, custom_cs=False):
+
+def cs_to_mutation_str(cs, seq):
     """Convert `cs` string for a feature in `feat_seq` to mutation string.
-    
-    Paramters
-    ---------
-    cs : str
-        `cs` string for feature
-    feat_name : str
-        Name of feature for which the cs string is being converted to a
-        mutation string.
-    custom_cs : bool
-        If `True`, will process custom `cs` strings that include clip amounts
-        on the 5' and/or 3' ends. Default is False to require users to
-        to consider if they want to process `cs` strings with clipping.
+
+    Parameters
+    ----------
+    cs : str or tuple
+        `cs` string or tuple for feature. If there was clipping, this is a
+        tuple of (cs string, clip5, clip3).
+    seq : str
+        Sequence of target for region corresponding to `cs` string.
 
     Returns
     -------
     mut_str : str
         Mutation string of form 'A56T G86A' for all mutations in the feature
         in the query compared to the target sequence.
+
     """
     raise RuntimeError('not yet implemented')
 
+
 def cs_to_mutation_count(cs):
     """Count the number of nucleotide mutations in `cs` string.
-    
-    Paramters
-    ---------
-    cs : str
-        `cs` string for feature
 
-    There is no `custom_cs` parameter becuase this function does ignores
-    clipping.
+    Parameters
+    ----------
+    cs : str or tuple
+        `cs` string or tuple for feature. If there was clipping, this is a
+        tuple of (cs string, clip5, clip3).
 
     Returns
     -------
@@ -443,10 +429,15 @@ def cs_to_mutation_count(cs):
     -------
     >>> cs_to_mutation_count(':4*nt-tc:2+g')
     4
-
-    >>> cs_to_mutation_count('<clip4>:4*nt-tc:2+g')
+    >>> cs_to_mutation_count((':4*nt-tc:2+g', 4, 8))
     4
+
     """
+    if type(cs) is tuple:
+        if len(cs) != 3:
+            raise ValueError(f"Invalid `cs` tuple of {cs}")
+        cs = cs[0]
+
     mut_count = 0
     cs_list = split_cs(cs)
     for cs_op in cs_list:
@@ -461,26 +452,38 @@ def cs_to_mutation_count(cs):
     return mut_count
 
 
-def cs_to_clip_count(cs, *, custom_cs=True):
+def cs_to_clip_count(cs):
     """Count the number of clipped nucleotides in `cs` string.
-    
-    Paramters
-    ---------
-    cs : str
-        `cs` string for feature
-    custom_cs : bool
-        If `True`, will process custom `cs` strings that include clip amounts
-        on the 5' and/or 3' ends. Default is `True` as this function uses the
-        information specified in the custom `cs` strings that specify 5' and/
-        or 3' clipping.
+
+    Parameters
+    ----------
+    cs : str or tuple
+        `cs` string or tuple for feature. If there was clipping, this is a
+        tuple of (cs string, clip5, clip3).
 
     Returns
     -------
     clip_count : int
         Number of nucleotides that are clipped from either or both ends of the
         feature in the query sequence.
+
+    Example
+    -------
+    >>> cs_to_clip_count((':4*nt-tc:2+g', 4, 8))
+    12
+    >>> cs_to_clip_count(':4*nt-tc:2+g')
+    0
+
     """
-    raise RuntimeError('not yet implemented')
+    if type(cs) is tuple:
+        if len(cs) != 3:
+            raise ValueError(f"Invalid `cs` tuple of {cs}")
+        clip_count = cs[1] + cs[2]
+    else:
+        clip_count = 0
+
+    return clip_count
+
 
 if __name__ == '__main__':
     import doctest

diff --git a/alignparse/targets.py b/alignparse/targets.py
@@ -558,9 +558,9 @@ def parse_alignment_cs(self, samfile, *, multi_align='primary'):
 
                 - a column with the name of each feature in the target giving
                   the ``cs`` string for that feature's alignment. If feature's
-                  alignment is clipped (incomplete), this is indicated by
-                  adding '<clipN>' (where 'N' is the amount of clipping) to the
-                  ``cs`` string. For instance: '<clip7>:5*cg:3<clip2>'
+                  alignment is clipped (incomplete), this column gives a tuple
+                  of (cs, clip5, clip3). This is the same format as the
+                  output from `extract_cs`. For instance: (':5*cg:3', 7, 2)
                   indicates 7 nucleotides clipped at 5' end, 2 nucleotides at
                   3' end, and a ``cs`` string of ':5*cg:3' for aligned portion.
 
@@ -601,10 +601,10 @@ def parse_alignment_cs(self, samfile, *, multi_align='primary'):
                     else:
                         feat_cs, clip5, clip3 = feat_info
                         if clip5 != 0:
-                            feat_cs = f"<clip{clip5}>{feat_cs}"
+                            feat_cs = feat_info
 
                         if clip3 != 0:
-                            feat_cs = f"{feat_cs}<clip{clip3}>"
+                            feat_cs = feat_info
 
                         d[tname][feature.name].append(feat_cs)