# Compare VCF data

In [47]:
!samtools depth WES_ICC/HCC1239_1_3.dedupped.realigned.recal.bam > reads.sort.coverage

^C


In [17]:
!venv/bin/pip install intervaltree

Collecting intervaltree
Installing collected packages: intervaltree
Successfully installed intervaltree-3.0.2
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [9]:
!source echo $PWD

/d0/home/adamk/pysccnv


In [42]:
from intervaltree import Interval, IntervalTree

In [43]:
# Small two-chromosome fixture for coverage_by_chromosome: each line holds three
# whitespace-separated fields (chromosome, position, depth), which is the layout
# that function splits and parses.
test_coverage = """chr1    10009   1
chr1    10010   2
chr1    10011   5
chr1    10012   4
chr1    10013   5
chr1    10014   5
chr1    10015   5
chr2    10016   5
chr2    10017   5
chr2    10018   5
chr2    10019   2
chr2    10020   5"""

In [98]:
def coverage_by_chromosome(iterable, threshold):
    """Collapse per-position depth lines into intervals of sufficient coverage.

    Parameters
    ----------
    iterable : iterable of str
        Lines with three whitespace-separated fields: chromosome, position and
        depth. Any iterable works (open file, list, generator). Blank lines are
        skipped.
    threshold : int
        Minimum depth (inclusive) for a position to count as covered.

    Returns
    -------
    dict
        Maps each chromosome seen in the input to a list of ``[start, end]``
        pairs (both ends inclusive), in input order. A chromosome with no
        covered position maps to an empty list. Empty input yields ``{}``.

    Notes
    -----
    An interval only ever contains positions that were actually listed in the
    input with depth >= ``threshold``. A run is closed when the depth drops
    below the threshold, when the chromosome changes, or when the position is
    not exactly one past the previous line's position (positions missing from
    the input are not assumed to be covered).
    """
    total_coverage = {}
    current_chromosome = None
    interval_start = None   # first position of the open run, None when no run is open
    previous_coord = None   # position on the previously parsed line

    for line in iterable:
        fields = line.split()
        if not fields:
            continue
        chromosome, coord, coverage = fields
        coord = int(coord)
        covered = int(coverage) >= threshold

        if interval_start is not None:
            # While a run is open the previous line was covered, so the run
            # ends at previous_coord -- never at a coordinate derived from the
            # current line, which may sit on another chromosome or past a gap.
            still_running = (
                covered
                and chromosome == current_chromosome
                and coord == previous_coord + 1
            )
            if not still_running:
                total_coverage[current_chromosome].append(
                    [interval_start, previous_coord]
                )
                interval_start = None

        if chromosome != current_chromosome:
            current_chromosome = chromosome
            # setdefault keeps earlier intervals if a chromosome reappears.
            total_coverage.setdefault(chromosome, [])

        if covered and interval_start is None:
            interval_start = coord
        previous_coord = coord

    # Flush a run that is still open when the input ends.
    if interval_start is not None:
        total_coverage[current_chromosome].append([interval_start, previous_coord])
    return total_coverage

In [99]:
# Regression check: runs of depth >= 5 in the two-chromosome fixture.
threshold = 5
iterable = iter(test_coverage.split("\n"))
result = coverage_by_chromosome(iterable, threshold)
assert result == {
    'chr1': [[10011, 10011], [10013, 10015]],
    'chr2': [[10016, 10018], [10020, 10020]],
}

In [100]:
def make_intervaltrees(chromomsome_intervals):
    """Build one IntervalTree per chromosome from inclusive [start, end] pairs.

    Parameters
    ----------
    chromomsome_intervals : dict
        Maps a chromosome name to a list of intervals; element 0 of each
        interval is the start and element 1 the inclusive end.

    Returns
    -------
    dict
        Maps each chromosome name to an IntervalTree. Every interval is stored
        as the slice ``start : end + 1`` with ``None`` as its data, so the
        inclusive end position falls inside the stored range.
    """
    trees = {}
    for chromosome in chromomsome_intervals:
        tree = IntervalTree()
        for bounds in chromomsome_intervals[chromosome]:
            first, last = bounds[0], bounds[1]
            # +1 turns the inclusive end into the slice's exclusive upper bound.
            tree[first:last + 1] = None
        trees[chromosome] = tree
    return trees

In [101]:
# Single-chromosome fixture in the same three-column layout (chromosome,
# position, depth); used below to build the interval trees in test_intervaltrees2.
test_coverage2 = """chr1    110703   5
chr1    110704   2
chr1    110705   5
chr1    110706   4
chr1    110707   5
chr1    110708   5
chr1    110709   5
chr1    110710   5
chr1    110711   5
chr1    110712   5
chr1    110713   2
chr1    110714   5"""

In [102]:
coverage_by_chromosome((i for i in test_coverage2.split("\n")), 5)

{'chr1': [[110703, 110703],
  [110705, 110705],
  [110707, 110712],
  [110714, 110714]]}

In [103]:
test_intervaltrees2 = make_intervaltrees(coverage_by_chromosome((i for i in test_coverage2.split("\n")), 5))

In [104]:
test_intervaltrees2

{'chr1': IntervalTree([Interval(110703, 110704), Interval(110705, 110706), Interval(110707, 110713), Interval(110714, 110715)])}

In [105]:
# Stream the per-position depth file written by the `samtools depth` cell above
# and keep the intervals whose depth is at least 5.
with open("reads.sort.coverage") as f:
    coverage_intervals = coverage_by_chromosome(f, 5)

KeyboardInterrupt: 

In [106]:
test_intervaltrees2

{'chr1': IntervalTree([Interval(110703, 110704), Interval(110705, 110706), Interval(110707, 110713), Interval(110714, 110715)])}

In [113]:
help(vcf)

Help on package vcf:

NAME
    vcf - A VCFv4.0 and 4.1 parser for Python.

DESCRIPTION
    Online version of PyVCF documentation is available at http://pyvcf.rtfd.org/

PACKAGE CONTENTS
    filters
    model
    parser
    sample_filter
    test (package)
    utils

DATA
    RESERVED_FORMAT = {'AHAP': 'Integer', 'CN': 'Integer', 'CNL': 'Float',...
    RESERVED_INFO = {'1000G': 'Flag', 'AA': 'String', 'AC': 'Integer', 'AF...
    VERSION = '0.6.8'

FILE
    /d0/home/adamk/pysccnv/venv/lib/python3.7/site-packages/vcf/__init__.py




In [114]:
help(vcf.Writer)

Help on class Writer in module vcf.parser:

class Writer(builtins.object)
 |  Writer(stream, template, lineterminator='\n')
 |  
 |  VCF Writer. On Windows Python 2, open stream with 'wb'.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, stream, template, lineterminator='\n')
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  close(self)
 |      Close the writer
 |  
 |  flush(self)
 |      Flush the writer
 |  
 |  write_record(self, record)
 |      write a record to the file
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  counts = {None: '.', -1: 'A', -2: 'G', -3: 'R'}



In [121]:
import vcf
def filter_vcf(filename, limit=-1, chrom_limit="", phased=False, min_qual=50,
               intervaltrees=None):
    """Write the records of a VCF that pass quality and coverage filters.

    A record is written when the first sample's GQ is at least ``min_qual``
    and the record's position falls inside an interval of the tree registered
    for its chromosome.

    Parameters
    ----------
    filename : str
        Path of the VCF to read. The output path is this name with its last
        dot-separated component replaced by ``filtered.vcf``.
    limit : int
        Stop once the 0-based record index equals this value; skipped records
        count towards it. The default -1 never matches, so nothing is cut off.
    chrom_limit : str
        Stop at the first record whose CHROM equals this value.
    phased : bool
        Currently unused; kept so existing calls keep working.
    min_qual : int
        Minimum GQ (inclusive) of the first sample.
    intervaltrees : dict, optional
        Maps chromosome name to an interval tree supporting ``tree[position]``.
        When omitted, the notebook-level ``test_intervaltrees2`` is used, which
        is what this function was previously hard-wired to.

    Returns
    -------
    str
        Path of the filtered VCF that was written.
    """
    if intervaltrees is None:
        # Backward-compatible default: the global fixture used before the
        # parameter existed.
        intervaltrees = test_intervaltrees2

    write_to = ".".join(filename.split(".")[:-1]) + ".filtered.vcf"
    reader = vcf.Reader(filename=filename)
    with open(write_to, "w") as f:
        writer = vcf.Writer(f, reader)
        for i, record in enumerate(reader):
            if record.CHROM == chrom_limit or i == limit:
                break
            # A missing GQ (absent field or None value) cannot be compared with
            # min_qual; such records are treated as failing the quality filter.
            quality = getattr(record.samples[0].data, "GQ", None)
            if quality is None or quality < min_qual:
                continue
            # Chromosomes without any tree have no covered interval: skip them
            # instead of raising KeyError.
            tree = intervaltrees.get(record.CHROM)
            if tree is not None and tree[record.POS]:
                writer.write_record(record)
    return write_to

In [122]:
wes_vcf = filter_vcf("reverse_jewish_son.vcf.gz", limit=100)