Skip to content

Commit

Permalink
Merge pull request #112 from Hoeze/use_strand
Browse files Browse the repository at this point in the history
Allow to directly specify whether or not to use strand information
  • Loading branch information
MuhammedHasan committed Jun 20, 2022
2 parents f57493e + 8b4dbfc commit 643a1ee
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 19 deletions.
9 changes: 7 additions & 2 deletions kipoiseq/extractors/fasta.py
Expand Up @@ -26,20 +26,25 @@ def __init__(self, fasta_file, use_strand=False, force_upper=False):
self.fasta = Fasta(self.fasta_file)
self.force_upper = force_upper

def extract(self, interval: Interval, **kwargs) -> str:
def extract(self, interval: Interval, use_strand=None, **kwargs) -> str:
"""
Returns the FASTA sequence in some given interval as string
Args:
interval: the interval to query
use_strand (bool, optional): if True, the extracted sequence
is reverse complemented in case interval.strand == "-".
Overrides `self.use_strand`
**kwargs:
Returns:
sequence of requested interval
"""
# reverse-complement seq the negative strand
rc = self.use_strand and interval.strand == "-"
if use_strand is None:
use_strand = self.use_strand
rc = use_strand and interval.strand == "-"

# pyfaidx wants a 1-based interval
seq = str(self.fasta.get_seq(
Expand Down
45 changes: 28 additions & 17 deletions kipoiseq/extractors/vcf_seq.py
Expand Up @@ -58,21 +58,25 @@ def concat(self):

class VariantSeqExtractor(BaseExtractor):

def __init__(self, fasta_file: str = None, reference_sequence: BaseExtractor = None):
def __init__(self, fasta_file: str = None, reference_sequence: BaseExtractor = None, use_strand=True):
"""
Sequence extractor which allows to obtain the alternative sequence,
given some interval and variants inside this interval.
Args:
fasta_file: path to the fasta file (can be gzipped)
reference_sequence: extractor returning the reference sequence given some interval
fasta_file: path to the fasta file (can be gzipped)
reference_sequence: extractor returning the reference sequence given some interval
use_strand (bool): if True, the extracted sequence
is reverse complemented in case interval.strand == "-"
"""
self._use_strand = use_strand

if fasta_file is not None:
if reference_sequence is not None:
raise ValueError(
"either fasta_file or ref_seq_extractor have to be specified")
self._ref_seq_extractor = FastaStringExtractor(
fasta_file, use_strand=True)
fasta_file, use_strand=False)
else:
if reference_sequence is None:
raise ValueError(
Expand All @@ -96,22 +100,25 @@ def ref_seq_extractor(self) -> BaseExtractor:
"""
return self._ref_seq_extractor

def extract(self, interval, variants, anchor, fixed_len=True, **kwargs):
def extract(self, interval, variants, anchor, fixed_len=True, use_strand=None, **kwargs):
"""
Args:
interval: pybedtools.Interval Region of interest from
which to query the sequence. 0-based
variants: List[cyvcf2.Variant]: variants overlapping the `interval`.
can also be indels. 1-based
anchor: absolution position w.r.t. the interval start. (0-based).
E.g. for an interval of `chr1:10-20` the anchor of 10 denotes
the point chr1:10 in the 0-based coordinate system.
fixed_len: if True, the return sequence will have the same length
as the `interval` (e.g. `interval.end - interval.start`)
interval: pybedtools.Interval Region of interest from
which to query the sequence. 0-based
variants: List[cyvcf2.Variant]: variants overlapping the `interval`.
can also be indels. 1-based
anchor: absolution position w.r.t. the interval start. (0-based).
E.g. for an interval of `chr1:10-20` the anchor of 10 denotes
the point chr1:10 in the 0-based coordinate system.
fixed_len: if True, the return sequence will have the same length
as the `interval` (e.g. `interval.end - interval.start`)
use_strand (bool, optional): if True, the extracted sequence
is reverse complemented in case interval.strand == "-".
Overrides `self.use_strand`
Returns:
A single sequence (`str`) with all the variants applied.
A single sequence (`str`) with all the variants applied.
"""
# Preprocessing
anchor = max(min(anchor, interval.end), interval.start)
Expand Down Expand Up @@ -174,7 +181,10 @@ def extract(self, interval, variants, anchor, fixed_len=True, **kwargs):

seq = down_str + up_str

if interval.strand == '-':
if use_strand is None:
use_strand = self.use_strand
if use_strand and interval.strand == '-':
# reverse-complement
seq = complement(seq)[::-1]

return seq
Expand Down Expand Up @@ -256,7 +266,8 @@ def _upstream_builder(up_variants, interval, anchor, iend):
return up_sb

def _fetch(self, interval, istart, iend):
seq = self._ref_seq_extractor.extract(
# fetch interval, ignore strand
seq = self.ref_seq_extractor.extract(
Interval(interval.chrom, istart, iend))
seq = Sequence(name=interval.chrom, seq=seq, start=istart, end=iend)
return seq
Expand Down

0 comments on commit 643a1ee

Please sign in to comment.