Skip to content

Commit

Permalink
working match_variant_sequence test and changing namedtuple base clas…
Browse files Browse the repository at this point in the history
…ses to use slots instead
  • Loading branch information
iskandr committed Nov 15, 2016
1 parent 48e28c4 commit b13fde4
Show file tree
Hide file tree
Showing 8 changed files with 238 additions and 81 deletions.
49 changes: 33 additions & 16 deletions isovar/allele_reads.py
Expand Up @@ -18,7 +18,7 @@
"""

from __future__ import print_function, division, absolute_import
from collections import namedtuple, defaultdict
from collections import defaultdict
import logging

from .locus_reads import locus_read_generator
Expand All @@ -34,23 +34,40 @@

logger = logging.getLogger(__name__)

# subclassing from namedtuple to get a lightweight object with built-in
# hashing and comparison while also being able to add methods
AlleleReadBase = namedtuple(
"AlleleRead",
"prefix allele suffix name sequence")

class AlleleRead(AlleleReadBase):
def __new__(cls, prefix, allele, suffix, name):
return AlleleReadBase(
prefix=prefix,
allele=allele,
suffix=suffix,
name=name,
sequence=prefix + allele + suffix)

class AlleleRead(object):
__slots__ = ["prefix", "allele", "suffix", "name", "sequence"]

def __init__(self, prefix, allele, suffix, name):
self.prefix = prefix
self.allele = allele
self.suffix = suffix
self.name = name
self.sequence = prefix + allele + suffix

def __len__(self):
return len(self.prefix) + len(self.allele) + len(self.suffix)
return len(self.sequence)

def __str__(self):
return "AlleleRead(prefix='%s', allele='%s', suffix='%s', name='%s')" % (
self.prefix,
self.allele,
self.suffix,
self.name)

def __repr__(self):
return str(self)

def __hash__(self):
return hash(self.sequence)

def __eq__(self, other):
return (
other.__class__ is AlleleRead and
self.prefix == other.prefix and
self.allele == other.allele and
self.suffix == other.suffix and
self.name == other.name)

@classmethod
def from_locus_read(cls, locus_read, n_ref):
Expand Down
9 changes: 5 additions & 4 deletions isovar/assembly.py
Expand Up @@ -19,11 +19,10 @@
# I'd rather just use list.copy but Python 2.7 doesn't support it
from copy import copy

from .default_parameters import MIN_VARIANT_CDNA_SEQUENCE_ASSEMBLY_OVERLAP_SIZE

logger = logging.getLogger(__name__)

MIN_OVERLAP_SIZE = 30

def sort_by_decreasing_prefix_length(seq):
"""
Key function for sorting from longest to shortest prefix length.
Expand Down Expand Up @@ -64,7 +63,9 @@ def sort_by_decreasing_total_length(seq):
"""
return -len(seq.sequence)

def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
def greedy_merge(
variant_sequences,
min_overlap_size=MIN_VARIANT_CDNA_SEQUENCE_ASSEMBLY_OVERLAP_SIZE):
"""
Greedily merge overlapping sequences into longer sequences.
Expand Down Expand Up @@ -153,7 +154,7 @@ def sort_by_decreasing_read_count_and_sequence_lenth(variant_sequence):

def iterative_overlap_assembly(
variant_sequences,
min_overlap_size=30,
min_overlap_size=MIN_VARIANT_CDNA_SEQUENCE_ASSEMBLY_OVERLAP_SIZE,
n_merge_iters=2):
"""
Assembles longer sequences from reads centered on a variant by alternating
Expand Down
6 changes: 6 additions & 0 deletions isovar/default_parameters.py
Expand Up @@ -77,3 +77,9 @@
# sequences from multiple reads which only partially
# overlap (rather than fully spanning a coding sequence)
VARIANT_CDNA_SEQUENCE_ASSEMBLY = False

# Only merge variant cDNA sequences which at least share
# this number of nucleotides. Should be sufficiently high
# to minimize false assembly of isoforms which don't
# actually exist.
MIN_VARIANT_CDNA_SEQUENCE_ASSEMBLY_OVERLAP_SIZE = 30
141 changes: 107 additions & 34 deletions isovar/reference_sequence_key.py
Expand Up @@ -14,24 +14,87 @@

from __future__ import print_function, division, absolute_import
import logging
from collections import namedtuple

from .common import reverse_complement_dna
from .variant_helpers import interbase_range_affected_by_variant_on_transcript

logger = logging.getLogger(__name__)


class ReferenceSequenceKey(namedtuple("ReferenceSequenceKey", [
"strand",
"sequence_before_variant_locus",
"sequence_at_variant_locus",
"sequence_after_variant_locus"])):
class ReferenceSequenceKey(object):
"""
Used to identify and group the distinct sequences occurring on a set of
transcripts overlapping a variant locus
transcripts overlapping a variant locus.
"""

__slots__ = [
"strand",
"reference_cdna_sequence",
"base0_variant_start_offset",
"base0_variant_end_offset"
]

def __init__(
self,
strand,
reference_cdna_sequence,
base0_variant_start_offset,
base0_variant_end_offset):

if strand not in {'+', '-'}:
raise ValueError("Invalid strand: '%s'" % strand)
self.strand = strand

if len(reference_cdna_sequence) == 0:
raise ValueError("Empty reference cDNA sequence")

self.reference_cdna_sequence = reference_cdna_sequence

if base0_variant_start_offset > base0_variant_end_offset:
raise ValueError(
"Invalid variant start offset %d cannot be after end %d" % (
base0_variant_start_offset,
base0_variant_end_offset))

self.base0_variant_start_offset = base0_variant_start_offset
self.base0_variant_end_offset = base0_variant_end_offset

def __str__(self):
return (
"ReferenceSequenceKey("
"strand='%s', reference_cdna_sequence='%s', "
"base0_variant_start_offset=%d, base0_variant_end_offset=%d") % (
self.strand,
self.reference_cdna_sequence,
self.base0_variant_start_offset,
self.base0_variant_end_offset)

def __repr__(self):
return str(self)

def __hash__(self):
return hash((
self.strand,
self.reference_cdna_sequence,
self.base0_variant_start_offset,
self.base0_variant_end_offset))

@property
def sequence_before_variant_locus(self):
return self.reference_cdna_sequence[:self.base0_variant_start_offset]

@property
def sequence_at_variant_locus(self):
return self.reference_cdna_sequence[
self.base0_variant_start_offset:
self.base0_variant_end_offset]

@property
def sequence_after_variant_locus(self):
return self.reference_cdna_sequence[self.base0_variant_end_offset:]

def __len__(self):
return len(self.reference_cdna_sequence)

@classmethod
def from_variant_and_transcript(cls, variant, transcript, context_size):
"""
Expand All @@ -54,6 +117,7 @@ def from_variant_and_transcript(cls, variant, transcript, context_size):
Can also return None if Transcript lacks sufficiently long sequence
"""

full_transcript_sequence = transcript.sequence

if full_transcript_sequence is None:
Expand All @@ -63,6 +127,29 @@ def from_variant_and_transcript(cls, variant, transcript, context_size):
variant)
return None

# get the interbase range of offsets which capture all reference
# bases modified by the variant
variant_start_offset, variant_end_offset = \
interbase_range_affected_by_variant_on_transcript(
variant=variant,
transcript=transcript)

reference_cdna_at_variant = full_transcript_sequence[
variant_start_offset:variant_end_offset]

if not variant_matches_reference_sequence(
variant=variant,
strand=transcript.strand,
ref_seq_on_transcript=reference_cdna_at_variant):
# TODO: once we're more confident about other logic in isovar,
# change this to a warning and return None to allow for modest
# differences between reference sequence patches, since
# GRCh38.p1 may differ at some positions from GRCh38.p5
raise ValueError(
"Wrong reference sequence for variant %s on transcript %s" % (
variant,
transcript))

if len(full_transcript_sequence) < 6:
# need at least 6 nucleotides for a start and stop codon
logger.warn(
Expand All @@ -72,47 +159,33 @@ def from_variant_and_transcript(cls, variant, transcript, context_size):
len(full_transcript_sequence))
return None

# get the interbase range of offsets which capture all reference
# bases modified by the variant
variant_start_offset, variant_end_offset = \
interbase_range_affected_by_variant_on_transcript(
variant=variant,
transcript=transcript)

logger.info(
"Interbase offset range on %s for variant %s = %d:%d",
transcript,
variant,
variant_start_offset,
variant_end_offset)

prefix = full_transcript_sequence[
reference_cdna_before_variant = full_transcript_sequence[
max(0, variant_start_offset - context_size):
variant_start_offset]

suffix = full_transcript_sequence[
reference_cdna_after_variant = full_transcript_sequence[
variant_end_offset:
variant_end_offset + context_size]

ref_nucleotides_at_variant = full_transcript_sequence[
variant_start_offset:variant_end_offset]
if not variant_matches_reference_sequence(
variant=variant,
strand=transcript.strand,
ref_seq_on_transcript=ref_nucleotides_at_variant):
# TODO: once we're more confident about other logic in isovar,
# change this to a warning and return None to allow for modest
# differences between reference sequence patches, since
# GRCh38.p1 may differ at some positions from GRCh38.p5
raise ValueError(
"Wrong reference sequence for variant %s on transcript %s" % (
variant,
transcript))
reference_cdna_sequence = (
reference_cdna_before_variant +
reference_cdna_at_variant +
reference_cdna_after_variant)

n_before = len(reference_cdna_before_variant)

return cls(
strand=transcript.strand,
sequence_before_variant_locus=prefix,
sequence_at_variant_locus=ref_nucleotides_at_variant,
sequence_after_variant_locus=suffix)
reference_cdna_sequence=reference_cdna_sequence,
base0_variant_start_offset=n_before,
base0_variant_end_offset=n_before + len(reference_cdna_at_variant))


def variant_matches_reference_sequence(variant, ref_seq_on_transcript, strand):
Expand Down
28 changes: 20 additions & 8 deletions isovar/translation.py
Expand Up @@ -138,6 +138,7 @@ def from_variant_sequence_and_reference_context(
cls,
variant_sequence,
reference_context,
min_transcript_prefix_length,
max_transcript_mismatches,
protein_sequence_length=None):
"""
Expand All @@ -150,6 +151,11 @@ def from_variant_sequence_and_reference_context(
reference_context : ReferenceContext
min_transcript_prefix_length : int
Minimum number of nucleotides before the variant to test whether
our variant sequence can use the reading frame from a reference
transcript.
max_transcript_mismatches : int
Don't use the reading frame from a context where the cDNA variant
sequences disagrees at more than this number of positions before the
Expand All @@ -166,7 +172,8 @@ def from_variant_sequence_and_reference_context(
variant_sequence_in_reading_frame = match_variant_sequence_to_reference_context(
variant_sequence,
reference_context,
max_transcript_mismatches)
min_transcript_prefix_length=min_transcript_prefix_length,
max_transcript_mismatches=max_transcript_mismatches)

cdna_sequence = variant_sequence_in_reading_frame.cdna_sequence
cdna_codon_offset = variant_sequence_in_reading_frame.offset_to_first_complete_codon
Expand Down Expand Up @@ -313,6 +320,7 @@ def find_mutant_amino_acid_interval(
def translation_generator(
variant_sequences,
reference_contexts,
min_transcript_prefix_length,
max_transcript_mismatches,
protein_sequence_length=None):
"""
Expand All @@ -329,10 +337,18 @@ def translation_generator(
reference_contexts : list of ReferenceContext objects
Reference sequence contexts from the same variant as the variant_sequences
min_transcript_prefix_length : int
Minimum number of nucleotides before the variant to test whether
our variant sequence can use the reading frame from a reference
transcript.
max_transcript_mismatches : int
Maximum number of mismatches between coding sequence before variant
and reference transcript we're considering for determing the reading
frame.
protein_sequence_length : int, optional
Truncate protein to be at most this long
Truncate protein to be at most this long.
Yields a sequence of Translation objects.
"""
Expand All @@ -341,6 +357,7 @@ def translation_generator(
translation = Translation.from_variant_sequence_and_reference_context(
variant_sequence=variant_sequence,
reference_context=reference_context,
min_transcript_prefix_length=min_transcript_prefix_length,
max_transcript_mismatches=max_transcript_mismatches,
protein_sequence_length=protein_sequence_length)
if translation is not None:
Expand Down Expand Up @@ -383,12 +400,6 @@ def translate_variant_reads(
len(variant_sequence.prefix)
for variant_sequence in variant_sequences)

if context_size < min_transcript_prefix_length:
logger.info(
"Skipping variant %s, none of the cDNA sequences have sufficient context",
variant)
return []

reference_contexts = reference_contexts_for_variant(
variant,
context_size=context_size,
Expand All @@ -397,6 +408,7 @@ def translate_variant_reads(
return list(translation_generator(
variant_sequences=variant_sequences,
reference_contexts=reference_contexts,
min_transcript_prefix_length=min_transcript_prefix_length,
max_transcript_mismatches=max_transcript_mismatches,
protein_sequence_length=protein_sequence_length))

Expand Down

0 comments on commit b13fde4

Please sign in to comment.