Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implements Targets.parse_alignment #20

Merged
merged 30 commits into from
Aug 26, 2019
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
0c368b8
Set up `Targets` to have `feature_parse_specs`
jbloom Aug 12, 2019
783438d
update `feature_parse_specs`, add to RecA example
jbloom Aug 13, 2019
f686f57
`Targets` `feature_parse_specs` as YAML or dict
jbloom Aug 13, 2019
3262f1b
Refactored VEP example notebook to work with `feature_parse_specs`
khdcrawford Aug 22, 2019
74760e2
Initial docs and `cs_to_mutation_count` function
khdcrawford Aug 22, 2019
dcdb543
Removed custom '<clipN>' `cs` notation.
khdcrawford Aug 22, 2019
e994bb2
`Targets.parse_alignment_cs` cs, clip separate col
jbloom Aug 22, 2019
af99af1
continued progress on `cs_to___` functions
khdcrawford Aug 22, 2019
7c35ceb
Merge branch 'parse_alignment' of https://github.com/jbloomlab/alignp…
khdcrawford Aug 22, 2019
3c3caf9
`parse_alignment_cs` only gets features in specs
jbloom Aug 22, 2019
c480df0
`parse_alignment_cs` does not return target clip
jbloom Aug 22, 2019
5c54039
Finished `cs_to____` functions for parsing cs str.
khdcrawford Aug 22, 2019
786614f
fixed errors with VEP pilot notebooks
khdcrawford Aug 22, 2019
6c7c2fe
Merge branch 'parse_alignment' of https://github.com/jbloomlab/alignp…
khdcrawford Aug 22, 2019
44ab13c
update to new `pandas` and `plotnine`
jbloom Aug 23, 2019
4ac5e49
tweak functions to get mutations from `cs` tags
jbloom Aug 23, 2019
434c164
update format for `feature_parse_specs`
jbloom Aug 23, 2019
fbf7fd8
updated `VEP_target_feature_parse_specs.yaml`
khdcrawford Aug 23, 2019
aa03b38
partial progress on `parse_alignment`
jbloom Aug 23, 2019
8a3797d
Initial filtering on feature clipping.
khdcrawford Aug 23, 2019
6ed3cf1
tweaks to pass tests
jbloom Aug 24, 2019
a6a5790
refactor `Targets.__init__`
jbloom Aug 24, 2019
70fa968
Update `parse_alignments` docs.
jbloom Aug 25, 2019
fc49dde
implement `multi_align` in alignment parsing
jbloom Aug 25, 2019
f41b670
new `parse_alignment` skeleton
jbloom Aug 25, 2019
837fcc5
fully implemented `Targets.parse_alignment`
jbloom Aug 25, 2019
682f837
minor code / doc cleanup and update test
jbloom Aug 26, 2019
1d2bd92
`recA_DMS.ipynb` now illustrates `parse_alignments`
jbloom Aug 26, 2019
b6a4f87
minor docs proofreading
khdcrawford Aug 26, 2019
cee13bf
fix doc caught by @khdusenbury
jbloom Aug 26, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 180 additions & 1 deletion alignparse/cs_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
'identity': ':[0-9]+',
'substitution': r'\*[acgtn][acgtn]',
'insertion': r'\+[acgtn]+',
'deletion': r'\-[acgtn]+',
'deletion': r'\-[acgtn]+'
}
"""dict: Short ``cs`` tag operation regular expression matches."""

Expand Down Expand Up @@ -366,6 +366,185 @@ def extract_cs(self, start, end):
return (feature_cs, clip5, clip3)


def cs_to_sequence(cs, seq):
"""Convert ``cs`` tag to a sequence.

Parameters
----------
cs : str
`cs` string
seq : str
Sequence of target for region corresponding to `cs` string.

Returns
-------
sequence : str
Nucleotide sequence generated by applying `cs` to `seq`.

Example
-------
>>> cs_to_sequence(':4*nt-tc:2+g:2', 'CGGANTCCAAT')
'CGGATCAGAT'

"""
cs_list = split_cs(cs)
seq_loc = 0
seq_list = []
for cs_op in cs_list:
op_type = cs_op_type(cs_op)
if op_type == 'identity':
op_len = cs_op_len_target(cs_op)
seq_list.append(seq[seq_loc: seq_loc + op_len])
seq_loc += op_len
elif op_type == 'substitution':
seq_list.append(cs_op[2])
seq_loc += 1
elif op_type == 'insertion':
seq_list.append(cs_op[1:])
elif op_type == 'deletion':
seq_loc += len(cs_op) - 1
else:
raise ValueError(f"Invalid cs `op_type` of {op_type}")

return ''.join(seq_list).upper()


def cs_to_mutation_str(cs):
"""Convert ``cs`` tag to a descriptive string of mutations.

Parameters
----------
cs : str
A ``cs`` tag.

Returns
-------
mut_str : str
Space-delimited string of form 'A5T G86A ins7ACG del19to24'
for all mutations specified in `cs`.

Example
-------
>>> cs_to_mutation_str(':4*nt-tc:2+ga:6')
'del6to7 ins10GA'
>>> cs_to_mutation_str(':4*at-tc:2+ga:6')
'A5T del6to7 ins10GA'
>>> cs_to_mutation_str(':45')
''

Note
----
Mutation strings use "human readable" indexing, so the first nucleotide of
the sequence is 1 and deletions are inclusive of the last number.

Changes from ambiguous nucleotides to any other identity are **not**
considered mutations in the returned strings.

"""
cs_list = split_cs(cs)
seq_loc = 1
mut_strs_list = []
for cs_op in cs_list:
op_type = cs_op_type(cs_op)
if op_type == 'identity':
seq_loc += cs_op_len_target(cs_op)
elif op_type == 'substitution':
if cs_op[1] != 'n':
sub = ''.join([cs_op[1], str(seq_loc), cs_op[2]]).upper()
mut_strs_list.append(sub)
seq_loc += 1
elif op_type == 'insertion':
ins = ''.join(['ins', str(seq_loc), cs_op[1:].upper()])
mut_strs_list.append(ins)
elif op_type == 'deletion':
deletion = ''.join(['del', str(seq_loc), 'to',
str(seq_loc+len(cs_op)-2)])
mut_strs_list.append(deletion)
seq_loc += len(cs_op) - 1
else:
raise ValueError(f"Invalid cs `op_type` of {op_type}")

return ' '.join(mut_strs_list)


def cs_to_nt_mutation_count(cs):
"""Count the number of nucleotide mutations in ``cs`` tag.

Parameters
----------
cs : str
`cs` string

Returns
-------
nt_mut_count : int
Number of nucleotides that are mutated. Insertions / deletions
are counted as the number of nucleotides in the indel.
Changes from an ambiguous nucleotide to are **not** considered
mutations.

Example
-------
>>> cs_to_nt_mutation_count(':4*nt-tc:2+g')
3
>>> cs_to_nt_mutation_count(':4*gt-tc:2+g')
4

"""
nt_mut_count = 0
cs_list = split_cs(cs)
for cs_op in cs_list:
op_type = cs_op_type(cs_op)
if op_type == 'substitution':
if cs_op[1] != 'n':
nt_mut_count += 1
elif op_type == 'insertion' or op_type == 'deletion':
nt_mut_count += len(cs_op) - 1
elif op_type != 'identity':
raise ValueError(f'Invalid cs `op_type` of {op_type}.')

return nt_mut_count


def cs_to_op_mutation_count(cs):
"""Count the number of mutation operations in ``cs`` tag.

Parameters
----------
cs : str
The ``cs`` tag.

Returns
-------
op_mut_count : int
Number of mutation operations in the query sequence. Each indel or
substitution is counted as a single mutation operation regardless
of how many mutations it contains. Changes from ambiguous nucleotides
to another nucleotide are **not** counted.

Example
-------
>>> cs_to_op_mutation_count(':4*nt-tc:2+g')
2
>>> cs_to_op_mutation_count(':4*gt-tc:2+g')
3

"""
op_mut_count = 0
cs_list = split_cs(cs)
for cs_op in cs_list:
op_type = cs_op_type(cs_op)
if op_type == 'substitution':
if cs_op[1] != 'n':
op_mut_count += 1
elif op_type == 'insertion' or op_type == 'deletion':
op_mut_count += 1
elif op_type != 'identity':
raise ValueError(f'Invalid cs `op_type` of {op_type}.')

return op_mut_count


if __name__ == '__main__':
import doctest
doctest.testmod()
Loading