Skip to content
Find file
Fetching contributors…
Cannot retrieve contributors at this time
411 lines (324 sloc) 13.3 KB
"""
A simple interval class for DNA sequences from FASTA files that provides
fast access to sequences and methods for interval logic on those sequences.
Usually you will create a `Genome` and then use that object to create
intervals. The intervals have a sequence property that will look up the
actual sequence::
>>> from fastinterval import Genome, Interval
>>> test_genome = Genome('test/example.fa')
>>> int1 = test_genome.interval(100, 150, chrom='1')
>>> print int1
1:100-150:
>>> print int1.sequence
GATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA
fastinterval uses pyfasta to retrieve the sequence, so the access is mmapped
(i.e. fast). It supports strandedness, which will be respected when accessing
the sequence::
>>> int2 = test_genome.interval(100, 150, chrom='1', strand=-1)
>>> print int2.sequence
TCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATC
The Interval class supports many interval operations::
>>> int1 = test_genome.interval(100, 150, chrom='1')
>>> int2 = test_genome.interval(125, 175, chrom='1')
>>> int1.distance(int2)
0
>>> int1.span(int2)
Interval(100, 175)
>>> int1.overlaps(int2)
True
>>> int1.is_contiguous(int2)
True
>>> int1 in int2
False
>>> int1.intersection(int2)
Interval(125, 150)
>>> int1.union(int2)
Interval(100, 175)
>>> Interval.merge([int1, int2, test_genome.interval(200,250, chrom='1')])
[Interval(100, 175), Interval(200, 250)]
The Interval class is also based on bx python intervals. So you can pass in
a value attribute to point to an external object, and create interval trees and
so on.
>>> from bx.intervals.intersection import IntervalTree
>>> int3 = test_genome.interval(150, 200, chrom='1', value='foo')
>>> tree = IntervalTree()
>>> _ = map(tree.insert_interval, (int1, int2, int3))
>>> tree.find(190, 195)
[Interval(150, 200, value=foo)]
"""
VERSION = '0.1.1'
from pyfasta import Fasta
from bx.intervals import Interval as BaseInterval
def _convert_strand(strand):
""" convert UCSC +/- to +1/-1"""
if strand == '-': return -1
if strand == '+': return 1
return strand
class Interval(BaseInterval):
    """A genomic interval with interval-logic helpers and sequence lookup.

    Extends the bx-python ``Interval`` (start, end, value, chrom, strand)
    with an optional ``genome`` object used to fetch the underlying
    sequence. The length of an interval is ``end - start``.
    """

    def __init__(self, start, stop, genome=None, **kws):
        """Create an interval.

        start, stop -- interval coordinates, forwarded to the base class.
        genome      -- optional object with a ``sequence()`` method, used by
                       the ``sequence`` property.
        **kws       -- forwarded to the base ``Interval`` constructor; a
                       ``strand`` of '+'/'-' is converted to 1/-1 first.
        """
        self.genome = genome
        if 'strand' in kws:
            kws['strand'] = _convert_strand(kws['strand'])
        BaseInterval.__init__(self, start, stop, **kws)

    @property
    def sequence(self):
        """Return the DNA from this interval as an upper-case string.

        Raises Exception when the interval has no genome attached.
        """
        if not self.genome:
            raise Exception('Cannot retrieve sequence without a genome')
        # The genome object is queried with a feature-style dict; the strand
        # is forwarded so the genome can take it into account.
        location = dict(
            start=self.start,
            stop=self.end,
            chr=self.chrom,
            strand=self.strand,
        )
        return self.genome.sequence(location, one_based=False).upper()

    def __str__(self):
        """Return a chr:start-stop:strand representation of the interval.

        A falsy strand (None or 0) is rendered as an empty string.
        """
        strand = self.strand if self.strand else ''
        return "%s:%s-%s:%s" % (self.chrom, self.start, self.end, strand)

    def __len__(self):
        """Return the interval length (end - start)."""
        return self.end - self.start

    @classmethod
    def from_string(cls, loc, **kws):
        """Create an interval from a ``chrom:start-end[:strand]`` string.

        Example: ``chr1:10858-10967:1``. The optional strand field may be an
        integer, '+', '-', or empty (no strand). Extra keyword arguments
        are forwarded to the constructor.

        Raises Exception for an unrecognised strand token.
        """
        toks = loc.split(':')
        chrom = toks[0]
        start, end = map(int, toks[1].split('-'))
        strand = None
        if len(toks) == 3:
            token = toks[2]
            try:
                strand = int(token)
            except ValueError:
                if token == '':
                    strand = None
                elif token in ('+', '-'):
                    strand = _convert_strand(token)
                else:
                    raise Exception('Unknown strand %s' % token)
        return cls(start, end, chrom=chrom, strand=strand, **kws)

    def distance(self, other):
        """Return the gap between two intervals.

        Returns 0 when they overlap or touch and infinity when they are on
        different chromosomes.
        """
        if self.chrom != other.chrom:
            return float('Inf')
        if self.start > other.end:
            return self.start - other.end
        if self.end < other.start:
            return other.start - self.end
        return 0

    def span(self, other, **kws):
        """Return a copy of this interval stretched to cover both intervals.

        Extra keyword arguments override attributes on the copy.
        Raises Exception when the intervals are on different chromosomes.
        """
        if self.chrom != other.chrom:
            raise Exception('cannot get span over two chromosomes')
        return self.copy(
            start=min([self.start, other.start]),
            end=max([self.end, other.end]),
            **kws
        )

    def span_between(self, other, **kws):
        """Return an Interval spanning the gap between two intervals.

        Returns None when the intervals overlap. Raises Exception when they
        are on different chromosomes.
        """
        if self.chrom != other.chrom:
            raise Exception('cannot get span over two chromosomes')
        if self.overlaps(other):
            return None
        return self.copy(
            start=min([self.end, other.end]),
            end=max([self.start, other.start]),
            **kws
        )

    def overlaps(self, other):
        """Return True if the intervals share at least one base."""
        # Intervals that merely touch end-to-start have distance 0 but no
        # shared base, so they are excluded explicitly.
        touching = self.start == other.end or self.end == other.start
        return self.distance(other) == 0 and not touching

    def is_contiguous(self, other):
        """Return True if the intervals are overlapping or contiguous."""
        return self.distance(other) == 0

    def __contains__(self, other):
        """Return True if this interval fully contains ``other``.

        Intervals on different chromosomes never contain each other, which
        keeps this consistent with distance() and overlaps().
        """
        return (
            self.chrom == other.chrom
            and self.start <= other.start
            and self.end >= other.end
        )

    def intersection(self, other):
        """Return the interval shared by both intervals, or None.

        The result carries only the chromosome of this interval.
        NOTE(review): genome, strand and value are not propagated, so
        ``.sequence`` is unavailable on the result -- confirm this is
        intended.
        """
        if not self.overlaps(other):
            return None
        return Interval(
            max(self.start, other.start),
            min(self.end, other.end),
            chrom=self.chrom
        )

    def union(self, other, merge_contiguous=False):
        """Return an interval covering two overlapping intervals.

        With ``merge_contiguous=True`` intervals that only touch are
        accepted as well. Raises Exception when the intervals do not
        overlap (or touch, in contiguous mode).
        """
        test = Interval.is_contiguous if merge_contiguous else Interval.overlaps
        if not test(self, other):
            raise Exception('cannot get union of non overlapping intervals')
        return self.span(other)

    def copy(self, **kws):
        """Copy this interval, optionally overriding attributes.

        ``start`` and ``end`` replace the coordinates; any other keyword
        overrides the copied chrom/strand/value/genome or is forwarded to
        the constructor.
        """
        start = kws.pop('start', self.start)
        end = kws.pop('end', self.end)
        template = dict(chrom=self.chrom, strand=self.strand,
                        value=self.value, genome=self.genome)
        template.update(kws)
        return Interval(start, end, **template)

    def __sub__(self, other):
        """Subtract an interval and return a list of the remaining pieces.

        i.e. (10,50) - (20,30) returns [(10,20), (30,50)]. When the
        intervals do not overlap the result is ``[self]``; when ``other``
        covers this interval completely the result is ``[]``.
        """
        if not self.overlaps(other):
            return [self]
        if self in other:
            return []
        shares_edge = self.start == other.start or self.end == other.end
        if other in self and not shares_edge:
            # other sits strictly inside: keep the flank on each side
            return [self.copy(end=other.start), self.copy(start=other.end)]
        if self.start >= other.start:
            # other covers the left part: keep the right-hand remainder
            return [self.copy(start=other.end)]
        if self.end <= other.end:
            # other covers the right part: keep the left-hand remainder
            return [self.copy(end=other.start)]
        raise Exception('unhandled subtraction')

    @classmethod
    def merge(cls, intervals, merge_contiguous=False, **kwargs):
        """Merge a list of intervals and return a list of new intervals.

        By default, the intervals must be overlapping to be merged. If you
        want to merge contiguous intervals, set merge_contiguous to True.
        Extra keyword arguments are applied to every interval via copy().
        Every returned interval is a copy, including when only one interval
        is supplied; an empty input yields an empty list.
        """
        is_overlapping = cls.is_contiguous if merge_contiguous else cls.overlaps
        done = []
        for interval in sorted(intervals, key=lambda x: (x.chrom, x.end)):
            item = interval.copy(**kwargs)
            # Input is ordered by end, so a late interval with an early
            # start may swallow several finished intervals: keep folding
            # until the top of the stack no longer overlaps.
            while done and is_overlapping(item, done[-1]):
                item = done.pop().union(item, merge_contiguous=merge_contiguous)
            done.append(item)
        return done

    @classmethod
    def coverage(cls, intervals):
        """Split intervals at every boundary and count the covering depth.

        Returns one interval per pair of consecutive break points (every
        distinct start or end), whose ``value`` is the number of input
        intervals covering that segment; gaps get a value of 0. All
        segments take the chromosome of the first input interval.
        """
        if not intervals:
            return []
        chrom = intervals[0].chrom
        starts = [x.start for x in intervals]
        ends = [x.end for x in intervals]
        breaks = sorted(set(starts + ends))
        segments = []
        for start, end in zip(breaks, breaks[1:]):
            # An interval covers the segment when it has begun at or before
            # the segment start and has not yet ended there.
            opened = sum(1 for x in starts if x <= start)
            closed = sum(1 for x in ends if x <= start)
            segments.append(cls(start, end, chrom=chrom, value=opened - closed))
        return segments

    def add_border(self, size=0, upstream=0, downstream=0):
        """Return a copy with bases added to the ends.

        ``size`` pads both ends equally. ``upstream``/``downstream`` pad
        relative to the strand and therefore need a stranded interval.
        Raises Exception when both styles are mixed or when the interval
        has no strand.
        """
        if not (size or upstream or downstream):
            return self.copy()
        if size and (upstream or downstream):
            raise Exception('please either size or upstream/downstream')
        if size:
            return self.copy(start=self.start - size, end=self.end + size)
        if not self.strand:
            raise Exception('Cannot add upstrea/downstream to strandless interval')
        if self.strand == 1:
            return self.copy(start=self.start - upstream,
                             end=self.end + downstream)
        # any other strand: upstream lies at the high-coordinate end
        return self.copy(start=self.start - downstream,
                         end=self.end + upstream)

    def truncate(self, size):
        """Truncate this interval to ``size``, respecting the orientation.

        A positive strand keeps the first ``size`` bases from ``start``; a
        negative strand keeps the last ``size`` bases before ``end``.
        Raises Exception for an interval whose strand is None. A strand of
        0 falls through both branches and yields None.
        """
        if self.strand is None:
            raise Exception('cannot truncate unstranded interval')
        elif self.strand > 0:
            return self.copy(end=min(self.start + size, self.end))
        elif self.strand < 0:
            return self.copy(start=max(self.end - size, self.start))
class Genome(object):
    """A convenience for creating intervals on the same genome.

    Wraps a pyfasta ``Fasta`` object and hands it to every Interval it
    creates, so those intervals can look up their sequence.
    """

    def __init__(self, fname, *args, **kws):
        """Create a genome using a Fasta file. Other args passed to pyfasta."""
        self.fasta = Fasta(fname, *args, **kws)

    def interval(self, start, end, **kws):
        """Return an Interval on this genome.

        Extra keyword arguments (e.g. chrom, strand, value) are forwarded
        to the Interval constructor.
        """
        return Interval(start, end, genome=self.fasta, **kws)

    def from_string(self, data, **kws):
        """Return an Interval on this genome parsed from a location string.

        ``data`` is handed to ``Interval.from_string``. Extra keyword
        arguments (e.g. value) are forwarded as well, mirroring interval().
        """
        return Interval.from_string(data, genome=self.fasta, **kws)
class MinimalSpanningSet(object):
    """Create a minimal spanning set for target intervals from candidates.

    Candidates are picked greedily: each round takes the candidate with the
    highest score against the still-uncovered parts of the targets, until no
    candidate scores above zero or nothing is left to cover. Chosen
    candidates whose removal does not reduce the covered bases are then
    dropped. The result is available as ``chosen``.
    """

    def __init__(self, targets, candidates, score_function=None, sort_key=None):
        """Build the spanning set immediately.

        targets        -- intervals that should be covered.
        candidates     -- intervals to choose from; the caller's collection
                          is copied and never modified.
        score_function -- optional callable invoked as
                          ``score_function(spanning_set, candidate)``, the
                          same shape as the default ``score_candidate``.
        sort_key       -- optional key function used to break ties between
                          equally scored candidates (the highest key wins).
        """
        self.targets = targets
        self.remaining_targets = list(targets)
        # Work on a private copy: _find_set() removes each chosen candidate
        # and must not empty the list the caller handed in.
        self.candidates = list(candidates)
        self.chosen = []
        self.sort_key = sort_key
        self._default_scoring = score_function is None
        if self._default_scoring:
            self.score_function = MinimalSpanningSet.score_candidate
        else:
            self.score_function = score_function
        self._find_set()

    def score_candidate(self, candidate):
        """Return the number of remaining target bases the candidate covers."""
        return sum([
            len(candidate.intersection(x))
            for x in self.remaining_targets
            if candidate.overlaps(x)
        ])

    def _score(self, candidate):
        """Score a candidate with the configured scoring function."""
        if self._default_scoring:
            # go through the bound method so subclass overrides still apply
            return self.score_candidate(candidate)
        return self.score_function(self, candidate)

    def coverage(self, choices):
        """Work out the number of target bases covered by a set of choices."""
        covered = Interval.merge(
            [
                choice.intersection(x)
                for x in self.targets
                for choice in choices
                if choice.overlaps(x)
            ]
        )
        return sum(map(len, covered))

    def _find_set(self):
        """Main loop: greedily choose candidates, then prune redundant ones."""
        while self.candidates:
            # Scores are kept in a list parallel to the candidates so the
            # intervals do not need to be hashable.
            scores = [self._score(candidate) for candidate in self.candidates]
            best_score = max(scores)
            # break if no improvement is possible
            if best_score == 0:
                break
            tied = [
                candidate
                for candidate, score in zip(self.candidates, scores)
                if score == best_score
            ]
            if self.sort_key:
                # among equally good candidates prefer the highest sort key
                best = sorted(tied, key=self.sort_key)[-1]
            else:
                best = tied[0]
            # update the data
            self.chosen.append(best)
            self.candidates.remove(best)
            self._update_targets(best)
            # stop early once every target base has been covered
            if not self.remaining_targets:
                break
        self._remove_redundant()

    def _update_targets(self, choice):
        """Remove the chosen interval from the remaining targets."""
        new_targets = []
        for target in self.remaining_targets:
            # subtraction yields zero, one or two leftover pieces
            new_targets.extend(target - choice)
        self.remaining_targets = new_targets

    def _remove_redundant(self):
        """Drop any chosen intervals that are completely covered by the rest."""
        total_coverage = self.coverage(self.chosen)
        for choice in list(self.chosen):
            # if the coverage without a choice is the same, remove it
            others = list(self.chosen)
            others.remove(choice)
            if self.coverage(others) == total_coverage:
                self.chosen.remove(choice)
Jump to Line
Something went wrong with that request. Please try again.