Skip to content

Commit

Permalink
Removed custom '<clipN>' cs notation.
Browse files Browse the repository at this point in the history
Instead of writing a script to process the '<clipN>' notation, I
decided to stick with just using tuples to designate features
that have clipping. This makes counting mutations and clipping
easier.
  • Loading branch information
khdcrawford committed Aug 22, 2019
1 parent 74760e2 commit dcdb543
Show file tree
Hide file tree
Showing 3 changed files with 1,184 additions and 124 deletions.
115 changes: 59 additions & 56 deletions alignparse/cs_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
'identity': ':[0-9]+',
'substitution': r'\*[acgtn][acgtn]',
'insertion': r'\+[acgtn]+',
'deletion': r'\-[acgtn]+',
'clip': '<clip[0-9]+>'
'deletion': r'\-[acgtn]+'
}
"""dict: Short ``cs`` tag operation regular expression matches."""

Expand Down Expand Up @@ -53,9 +52,6 @@ def split_cs(cs_string, *, invalid='raise'):
>>> split_cs(':32*nt*na:10-gga:5+aaa:10')
[':32', '*nt', '*na', ':10', '-gga', ':5', '+aaa', ':10']
>>> split_cs('<clip8>:32*nt*na:10-gga:5')
['<clip8>', ':32', '*nt', '*na', ':10', '-gga', ':5']
>>> split_cs('bad:32*nt*na:10-gga:5', invalid='ignore') is None
True
Expand Down Expand Up @@ -100,8 +96,6 @@ def cs_op_type(cs_op, *, invalid='raise'):
'substitution'
>>> cs_op_type(':45')
'identity'
>>> cs_op_type('<clip8>')
'clip'
>>> cs_op_type('*nt:45')
Traceback (most recent call last):
...
Expand Down Expand Up @@ -151,10 +145,6 @@ def cs_op_len_target(cs_op, *, invalid='raise'):
2
>>> cs_op_len_target('+gc')
0
>>> cs_op_len_target('<clip8>')
8
>>> cs_op_len_target('<clip46>')
46
>>> cs_op_len_target('*nt:45')
Traceback (most recent call last):
...
Expand Down Expand Up @@ -377,60 +367,56 @@ def extract_cs(self, start, end):

return (feature_cs, clip5, clip3)

def cs_to_sequence(cs, feat_name, feat_seq, *, custom_cs=False):

def cs_to_sequence(cs, seq):
"""Convert `cs` string for a feature in `feat_seq` to nt sequence.
Paramters
---------
cs : str
`cs` string for feature
feat_name : str
Name of feature for which the cs string is being converted to seq.
custom_cs : bool
If `True`, will process custom `cs` strings that include clip amounts
on the 5' and/or 3' ends. Default is False to require users to
to consider if they want to process `cs` strings with clipping.
Parameters
----------
cs : str or tuple
`cs` string or tuple for feature. If there was clipping, this is a
tuple of (cs string, clip5, clip3).
seq : str
Sequence of target for region corresponding to `cs` string.
Returns
-------
sequence : str
Nucleotide sequence for specified feature in the query.
"""
raise RuntimeError('not yet implemented')

def cs_to_mutation_str(cs, feat_name, *, custom_cs=False):

def cs_to_mutation_str(cs, seq):
"""Convert `cs` string for a feature in `feat_seq` to mutation string.
Paramters
---------
cs : str
`cs` string for feature
feat_name : str
Name of feature for which the cs string is being converted to a
mutation string.
custom_cs : bool
If `True`, will process custom `cs` strings that include clip amounts
on the 5' and/or 3' ends. Default is False to require users to
to consider if they want to process `cs` strings with clipping.
Parameters
----------
cs : str or tuple
`cs` string or tuple for feature. If there was clipping, this is a
tuple of (cs string, clip5, clip3).
seq : str
Sequence of target for region corresponding to `cs` string.
Returns
-------
mut_str : str
Mutation string of form 'A56T G86A' for all mutations in the feature
in the query compared to the target sequence.
"""
raise RuntimeError('not yet implemented')


def cs_to_mutation_count(cs):
"""Count the number of nucleotide mutations in `cs` string.
Paramters
---------
cs : str
`cs` string for feature
There is no `custom_cs` parameter becuase this function does ignores
clipping.
Parameters
----------
cs : str or tuple
`cs` string or tuple for feature. If there was clipping, this is a
tuple of (cs string, clip5, clip3).
Returns
-------
Expand All @@ -443,10 +429,15 @@ def cs_to_mutation_count(cs):
-------
>>> cs_to_mutation_count(':4*nt-tc:2+g')
4
>>> cs_to_mutation_count('<clip4>:4*nt-tc:2+g')
>>> cs_to_mutation_count((':4*nt-tc:2+g', 4, 8))
4
"""
if type(cs) is tuple:
if len(cs) != 3:
raise ValueError(f"Invalid `cs` tuple of {cs}")
cs = cs[0]

mut_count = 0
cs_list = split_cs(cs)
for cs_op in cs_list:
Expand All @@ -461,26 +452,38 @@ def cs_to_mutation_count(cs):
return mut_count


def cs_to_clip_count(cs, *, custom_cs=True):
def cs_to_clip_count(cs):
"""Count the number of clipped nucleotides in `cs` string.
Paramters
---------
cs : str
`cs` string for feature
custom_cs : bool
If `True`, will process custom `cs` strings that include clip amounts
on the 5' and/or 3' ends. Default is `True` as this function uses the
information specified in the custom `cs` strings that specify 5' and/
or 3' clipping.
Parameters
----------
cs : str or tuple
`cs` string or tuple for feature. If there was clipping, this is a
tuple of (cs string, clip5, clip3).
Returns
-------
clip_count : int
Number of nucleotides that are clipped from either or both ends of the
feature in the query sequence.
Example
-------
>>> cs_to_clip_count((':4*nt-tc:2+g', 4, 8))
12
>>> cs_to_clip_count(':4*nt-tc:2+g')
0
"""
raise RuntimeError('not yet implemented')
if type(cs) is tuple:
if len(cs) != 3:
raise ValueError(f"Invalid `cs` tuple of {cs}")
clip_count = cs[1] + cs[2]
else:
clip_count = 0

return clip_count


if __name__ == '__main__':
import doctest
Expand Down
10 changes: 5 additions & 5 deletions alignparse/targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,9 +558,9 @@ def parse_alignment_cs(self, samfile, *, multi_align='primary'):
- a column with the name of each feature in the target giving
the ``cs`` string for that feature's alignment. If feature's
alignment is clipped (incomplete), this is indicated by
adding '<clipN>' (where 'N' is the amount of clipping) to the
``cs`` string. For instance: '<clip7>:5*cg:3<clip2>'
alignment is clipped (incomplete), this column gives a tuple
of (cs, clip5, clip3). This is the same format as the
output from `extract_cs`. For instance: (':5*cg:3', 7, 2)
indicates 7 nucleotides clipped at 5' end, 2 nucleotides at
3' end, and a ``cs`` string of ':5*cg:3' for aligned portion.
Expand Down Expand Up @@ -601,10 +601,10 @@ def parse_alignment_cs(self, samfile, *, multi_align='primary'):
else:
feat_cs, clip5, clip3 = feat_info
if clip5 != 0:
feat_cs = f"<clip{clip5}>{feat_cs}"
feat_cs = feat_info

if clip3 != 0:
feat_cs = f"{feat_cs}<clip{clip3}>"
feat_cs = feat_info

d[tname][feature.name].append(feat_cs)

Expand Down
Loading

1 comment on commit dcdb543

@khdcrawford
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can chat about this this afternoon, but I think simply keeping cs strings with clipping as tuples in parse_alignment_cs makes more sense than adding in clipping, which we'll just process away in downstream functions anyway. It makes the parse_alignment_cs dataframe a bit less easy to look at, but most users won't use that dataframe anyway.

Please sign in to comment.