jbloomlab · jbloom · Aug 26, 2019 · Aug 12, 2019 · Aug 13, 2019 · Aug 13, 2019
diff --git a/alignparse/cs_tag.py b/alignparse/cs_tag.py
@@ -18,7 +18,7 @@
     'identity': ':[0-9]+',
     'substitution': r'\*[acgtn][acgtn]',
     'insertion': r'\+[acgtn]+',
-    'deletion': r'\-[acgtn]+',
+    'deletion': r'\-[acgtn]+'
     }
 """dict: Short ``cs`` tag operation regular expression matches."""
 
@@ -366,6 +366,185 @@ def extract_cs(self, start, end):
         return (feature_cs, clip5, clip3)
 
 
+def cs_to_sequence(cs, seq):
+    """Convert ``cs`` tag to a sequence.
+
+    Parameters
+    ----------
+    cs : str
+        The ``cs`` tag to apply.
+    seq : str
+        Sequence of target for region corresponding to `cs` string.
+
+    Returns
+    -------
+    sequence : str
+        Upper-case nucleotide sequence generated by applying `cs` to `seq`.
+
+    Example
+    -------
+    >>> cs_to_sequence(':4*nt-tc:2+g:2', 'CGGANTCCAAT')
+    'CGGATCAGAT'
+
+    """
+    cs_list = split_cs(cs)
+    seq_loc = 0  # 0-based index into `seq`
+    seq_list = []
+    for cs_op in cs_list:
+        op_type = cs_op_type(cs_op)
+        if op_type == 'identity':
+            op_len = cs_op_len_target(cs_op)
+            seq_list.append(seq[seq_loc: seq_loc + op_len])
+            seq_loc += op_len
+        elif op_type == 'substitution':
+            seq_list.append(cs_op[2])  # second letter: the new nucleotide
+            seq_loc += 1
+        elif op_type == 'insertion':
+            seq_list.append(cs_op[1:])  # inserted bases; `seq_loc` stays put
+        elif op_type == 'deletion':
+            seq_loc += len(cs_op) - 1  # skip the deleted target bases
+        else:
+            raise ValueError(f"Invalid cs `op_type` of {op_type}")
+
+    return ''.join(seq_list).upper()
+
+
+def cs_to_mutation_str(cs):
+    """Convert ``cs`` tag to a descriptive string of mutations.
+
+    Parameters
+    ----------
+    cs : str
+        A ``cs`` tag.
+
+    Returns
+    -------
+    mut_str : str
+        Space-delimited string of form 'A5T G86A ins7ACG del19to24'
+        for all mutations specified in `cs`.
+
+    Example
+    -------
+    >>> cs_to_mutation_str(':4*nt-tc:2+ga:6')
+    'del6to7 ins10GA'
+    >>> cs_to_mutation_str(':4*at-tc:2+ga:6')
+    'A5T del6to7 ins10GA'
+    >>> cs_to_mutation_str(':45')
+    ''
+
+    Note
+    ----
+    Mutation strings use "human readable" indexing, so the first nucleotide of
+    the sequence is 1 and deletions are inclusive of the last number.
+
+    Changes from ambiguous nucleotides to any other identity are **not**
+    considered mutations in the returned strings.
+
+    """
+    cs_list = split_cs(cs)
+    seq_loc = 1  # 1-based position in the target
+    mut_strs_list = []
+    for cs_op in cs_list:
+        op_type = cs_op_type(cs_op)
+        if op_type == 'identity':
+            seq_loc += cs_op_len_target(cs_op)
+        elif op_type == 'substitution':
+            if cs_op[1] != 'n':  # changes from ambiguous 'n' not reported
+                sub = ''.join([cs_op[1], str(seq_loc), cs_op[2]]).upper()
+                mut_strs_list.append(sub)
+            seq_loc += 1
+        elif op_type == 'insertion':
+            ins = ''.join(['ins', str(seq_loc), cs_op[1:].upper()])
+            mut_strs_list.append(ins)  # `seq_loc` unchanged by insertions
+        elif op_type == 'deletion':
+            deletion = ''.join(['del', str(seq_loc), 'to',
+                                str(seq_loc+len(cs_op)-2)])  # inclusive end
+            mut_strs_list.append(deletion)
+            seq_loc += len(cs_op) - 1  # advance past the deleted bases
+        else:
+            raise ValueError(f"Invalid cs `op_type` of {op_type}")
+
+    return ' '.join(mut_strs_list)
+
+
+def cs_to_nt_mutation_count(cs):
+    """Count the number of nucleotide mutations in ``cs`` tag.
+
+    Parameters
+    ----------
+    cs : str
+        The ``cs`` tag.
+
+    Returns
+    -------
+    nt_mut_count : int
+        Number of nucleotides that are mutated. Insertions / deletions
+        are counted as the number of nucleotides in the indel.
+        Changes from an ambiguous nucleotide to another nucleotide are
+        **not** considered mutations.
+
+    Example
+    -------
+    >>> cs_to_nt_mutation_count(':4*nt-tc:2+g')
+    3
+    >>> cs_to_nt_mutation_count(':4*gt-tc:2+g')
+    4
+
+    """
+    nt_mut_count = 0
+    cs_list = split_cs(cs)
+    for cs_op in cs_list:
+        op_type = cs_op_type(cs_op)
+        if op_type == 'substitution':
+            if cs_op[1] != 'n':  # changes from ambiguous 'n' not counted
+                nt_mut_count += 1
+        elif op_type == 'insertion' or op_type == 'deletion':
+            nt_mut_count += len(cs_op) - 1  # exclude leading '+' or '-'
+        elif op_type != 'identity':
+            raise ValueError(f'Invalid cs `op_type` of {op_type}.')
+
+    return nt_mut_count
+
+
+def cs_to_op_mutation_count(cs):
+    """Count the number of mutation operations in ``cs`` tag.
+
+    Parameters
+    ----------
+    cs : str
+        The ``cs`` tag.
+
+    Returns
+    -------
+    op_mut_count : int
+        Number of mutation operations in the query sequence. Each indel or
+        substitution is counted as a single mutation operation regardless
+        of how many mutations it contains. Changes from ambiguous nucleotides
+        to another nucleotide are **not** counted.
+
+    Example
+    -------
+    >>> cs_to_op_mutation_count(':4*nt-tc:2+g')
+    2
+    >>> cs_to_op_mutation_count(':4*gt-tc:2+g')
+    3
+
+    """
+    op_mut_count = 0
+    cs_list = split_cs(cs)
+    for cs_op in cs_list:
+        op_type = cs_op_type(cs_op)
+        if op_type == 'substitution':
+            if cs_op[1] != 'n':  # changes from ambiguous 'n' not counted
+                op_mut_count += 1
+        elif op_type == 'insertion' or op_type == 'deletion':
+            op_mut_count += 1  # one per indel, whatever its length
+        elif op_type != 'identity':
+            raise ValueError(f'Invalid cs `op_type` of {op_type}.')
+
+    return op_mut_count
+
+
 if __name__ == '__main__':
     import doctest
     doctest.testmod()