Skip to content
This repository
Fetching contributors…

Octocat-spinner-32-eaf2f5

Cannot retrieve contributors at this time

file 74 lines (60 sloc) 2.398 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
"""
Utilities for VCF files.
"""


def _next_or_none(reader):
    """Return the next record from ``reader``, or None once it is exhausted."""
    try:
        # The builtin works with both the Python 2 (``.next()``) and the
        # Python 3 (``.__next__()``) iterator protocols.
        return next(reader)
    except StopIteration:
        return None


def walk_together(*readers):
    """Simultaneously iterate two or more VCF readers.

    Yields lists of concurrent records, one slot per reader (in the order
    the readers were given). A slot holds the reader's current record when
    that record compares equal to the smallest pending record, and None
    otherwise (including when the reader is exhausted).

    The caller must check that the inputs are sorted in the same way and
    use the same reference, otherwise behaviour is undefined.

    If every reader is empty, or no readers are given, nothing is yielded.
    """
    # A reader with no records raises StopIteration on the first read, so
    # the initial fetch goes through the same None-on-exhaustion helper.
    nexts = [_next_or_none(reader) for reader in readers]

    # Checking for remaining records *before* taking the minimum avoids
    # calling min() on an empty sequence when all inputs are exhausted.
    while any(x is not None for x in nexts):
        min_next = min(x for x in nexts if x is not None)

        # This uses equality on the records (per the original author, Record
        # equality checks the ALTs); records with overlapping but different
        # variation are therefore not grouped together.
        yield [x if x is not None and x == min_next else None for x in nexts]

        # Advance only the readers whose record was just yielded.
        for i, n in enumerate(nexts):
            if n is not None and n == min_next:
                nexts[i] = _next_or_none(readers[i])


def trim_common_suffix(*sequences):
    """
    Trim a list of sequences by removing the longest common suffix while
    leaving all of them at least one character in length.

    Standard convention with VCF is to place an indel at the left-most
    position, but some tools add additional context to the right of the
    sequences (e.g. samtools). These common suffixes are undesirable when
    comparing variants, for example in variant databases.

    Always returns a new list (empty when called with no sequences), even
    when nothing can be trimmed.

    >>> trim_common_suffix('TATATATA', 'TATATA')
    ['TAT', 'T']

    >>> trim_common_suffix('ACCCCC', 'ACCCCCCCC', 'ACCCCCCC', 'ACCCCCCCCC')
    ['A', 'ACCC', 'ACC', 'ACCCC']

    >>> trim_common_suffix('A', 'T')
    ['A', 'T']

    """
    if not sequences:
        return []

    # A common suffix is a common prefix of the reversed sequences, and the
    # common prefix of a whole set equals the common prefix of its
    # lexicographic minimum and maximum.
    reverses = [seq[::-1] for seq in sequences]
    rev_min = min(reverses)
    rev_max = max(reverses)

    # Stop one character short of rev_min so that no sequence is trimmed
    # down to nothing.
    trim = 0
    for low, high in zip(rev_min[:-1], rev_max):
        if low != high:
            break
        trim += 1

    if trim == 0:
        # seq[:-0] would yield an empty slice, so return copies untouched.
        return list(sequences)
    return [seq[:-trim] for seq in sequences]
Something went wrong with that request. Please try again.