Skip to content

Commit

Permalink
pulled function for constructing VariantRead, trim sequence to get ri…
Browse files Browse the repository at this point in the history
…d of N characters
  • Loading branch information
iskandr committed Apr 24, 2016
1 parent 6ab307a commit 049a4af
Showing 1 changed file with 102 additions and 69 deletions.
171 changes: 102 additions & 69 deletions isovar/variant_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,104 @@
VariantRead = namedtuple(
"VariantRead", "prefix alt suffix name")

def variant_read_from_single_read_at_locus(read, ref, alt):
"""
Given a single ReadAtLocus object, return either a VariantRead or None
(if the read's sequence didn't contain the variant nucleotides).
Parameters
----------
read: ReadAtLocus
ref : str
alt : str
"""
sequence = read.sequence
reference_positions = read.reference_positions

# positions of the nucleotides before and after the variant within
# the read sequence
read_pos_before = read.base0_read_position_before_variant
read_pos_after = read.base0_read_position_after_variant

# positions of the nucleotides before and after the variant on the
# reference genome
ref_pos_before = reference_positions[read_pos_before]

if ref_pos_before is None:
logging.warn(
"Missing reference pos for nucleotide before variant on read: %s" % (
read,))
return None

ref_pos_after = reference_positions[read_pos_after]

if ref_pos_after is None:
logging.warn(
"Missing reference pos for nucleotide after variant on read: %s" % (
read,))
return None

if len(ref) == 0:
if ref_pos_after - ref_pos_before != 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logging.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
ref_pos_before,
ref_pos_after,
read))
return None

# insertions require a sequence of non-aligned bases
# followed by the subsequence reference position
ref_positions_for_inserted = reference_positions[
read_pos_before + 1:read_pos_after]
if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
# all these inserted nucleotides should *not* align to the
# reference
logging.debug(
"Skipping read, inserted nucleotides shouldn't map to reference")
return None
else:
# substitutions and deletions
if ref_pos_after - ref_pos_before != len(ref) + 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logging.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
ref_pos_before,
ref_pos_after,
read))
return None

prefix = sequence[:read_pos_before + 1]
suffix = sequence[read_pos_after:]

if isinstance(prefix, bytes):
prefix = prefix.decode('ascii')

if isinstance(suffix, bytes):
suffix = suffix.decode('ascii')

if 'N' in prefix:
# trim prefix to exclude all occurrences of N
rightmost_index = prefix.rfind('N')
prefix = prefix[rightmost_index + 1:]

if 'N' in suffix:
leftmost_index = suffix.find('N')
suffix = suffix[:leftmost_index]

return VariantRead(prefix, alt, suffix, name=read.name)

def variant_reads_from_reads_at_locus(reads, ref, alt):
"""
Given a collection of pysam.AlignedSegment objects, generates a
sequence of VariantRead objects (which are split into prefix/variant/suffix
Given a collection of ReadAtLocus objects, returns a
list of VariantRead objects (which are split into prefix/variant/suffix
nucleotides).
Parameters
Expand All @@ -50,75 +144,14 @@ def variant_reads_from_reads_at_locus(reads, ref, alt):
alt : str
Alternate sequence of the variant (empty for deletions)
Returns a sequence of VariantRead objects.
Returns a list of VariantRead objects.
"""
variant_reads = []
for read in reads:
sequence = read.sequence
reference_positions = read.reference_positions

# positions of the nucleotides before and after the variant within
# the read sequence
read_pos_before = read.base0_read_position_before_variant
read_pos_after = read.base0_read_position_after_variant

# positions of the nucleotides before and after the variant on the
# reference genome
ref_pos_before = reference_positions[read_pos_before]
if ref_pos_before is None:
logging.warn(
"Missing reference pos for nucleotide before variant on read: %s" % (
read,))
continue

ref_pos_after = reference_positions[read_pos_after]
if ref_pos_after is None:
logging.warn(
"Missing reference pos for nucleotide after variant on read: %s" % (
read,))
continue

if len(ref) == 0:
if ref_pos_after - ref_pos_before != 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logging.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
ref_pos_before,
ref_pos_after,
read))
continue
# insertions require a sequence of non-aligned bases
# followed by the subsequence reference position
ref_positions_for_inserted = reference_positions[
read_pos_before + 1:read_pos_after]
if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
# all these inserted nucleotides should *not* align to the
# reference
logging.debug(
"Skipping read, inserted nucleotides shouldn't map to reference")
else:
# substitutions and deletions
if ref_pos_after - ref_pos_before != len(ref) + 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logging.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
ref_pos_before,
ref_pos_after,
read))
continue
prefix = sequence[:read_pos_before + 1]
suffix = sequence[read_pos_after:]

if isinstance(prefix, bytes):
prefix = prefix.decode('ascii')
if isinstance(suffix, bytes):
suffix = suffix.decode('ascii')

yield VariantRead(prefix, alt, suffix, name=read.name)

variant_read = variant_read_from_single_read_at_locus(read, ref, alt)
if variant_read is not None:
variant_reads.append(variant_read)
return variant_reads

def gather_reads_for_single_variant(
samfile,
Expand Down

0 comments on commit 049a4af

Please sign in to comment.