
Add script to set genotypes for small deletions to missing #253

Merged · 2 commits · Mar 27, 2018
100 changes: 100 additions & 0 deletions scripts/del_pe_resolution.py
@@ -0,0 +1,100 @@
#!/usr/bin/env python

from __future__ import division, print_function
import argparse
import json
from collections import Counter


def calc_insert_density(hist):
    '''Transform a histogram of counts into a density'''
    dens = Counter()
    total = sum(hist.values())
    for i in list(hist):
        dens[i] = float(hist[i]) / total
    return dens


def overlap(dens, shift):
    '''Shift a density over by "shift" bp and calculate the overlap'''
    total = 0.0
    for x in range(shift, max(dens) + 1):
        total += min(dens[x], dens[x - shift])
    return total


def find_overlap(dens, target):
    '''Find the amount to shift the density to achieve the target overlap value'''
    shift = max(dens) - 1
    current = overlap(dens, shift)
    last = current
    while shift >= 0 and current <= target:
        last = current
        shift -= 1
        current = overlap(dens, shift)
    return (shift + 1, last)


def load_svtyper_json(json_file):
    '''Load an svtyper json'''
    with open(json_file) as f:
        doc = json.load(f)
    return doc


def create_hist(lib):
    '''Create a histogram from svtyper json information'''
    return Counter({
        int(k): int(v) for k, v in lib['histogram'].items()
    })


def calculate_overlaps(doc, target):
    '''Calculate the minimum variant size with target discriminating power'''
    for sample in doc:
        for lib in doc[sample]['libraryArray']:
            hist = create_hist(lib)
            dens = calc_insert_density(hist)
            (size, overlap_prob) = find_overlap(dens, target)
        # NOTE only the first sample (and its last library) is reported
        return (sample, size, overlap_prob)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Calculate variant size resolution based on cutoff',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('json_file', nargs='+',
                        help='svtyper json files to evaluate'
                        )
    parser.add_argument('--overlap', type=float, metavar='FLOAT',
                        default=0.05,
                        help='maximum density of overlap between discordant '
                             'and concordant insert size distributions'
                        )
    args = parser.parse_args()
    print('\t'.join(('Sample', 'MinimumSize', 'Overlap')))
    for f in args.json_file:
        doc = load_svtyper_json(f)
        # use the requested cutoff rather than a hard-coded 0.05
        results = calculate_overlaps(doc, args.overlap)
        print('\t'.join(str(x) for x in results))


# Here thar be tests
def test_calc_insert_density():
    t = Counter({1: 1, 2: 2, 3: 1})
    expected = Counter({1: 0.25, 2: 0.5, 3: 0.25})
    assert calc_insert_density(t) == expected


def test_overlap():
    t = Counter({1: 0.25, 2: 0.5, 3: 0.25})
    assert overlap(t, 0) == 1.0
    assert overlap(t, 3) == 0.0
    assert overlap(t, 1) == 0.5


def test_find_overlap():
    t = Counter({1: 0.2, 2: 0.5, 3: 0.3})
    assert find_overlap(t, 1.0) == (0, 1.0)
    assert find_overlap(t, 0.5) == (1, 0.5)
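
For context, here is a minimal sketch of the svtyper-style JSON this script consumes, inferred from the fields the code above accesses (sample → 'libraryArray' → 'histogram'). The sample name, file name, and insert-size counts are hypothetical:

#!/usr/bin/env python
# sketch, not part of this PR: build a toy input and run the calculation
import json
from del_pe_resolution import load_svtyper_json, calculate_overlaps

example = {
    'SAMPLE1': {                      # one entry per sample
        'libraryArray': [             # one entry per sequencing library
            {'histogram': {'298': 120, '299': 180, '300': 200,
                           '301': 170, '302': 110}}
        ]
    }
}
with open('example.json', 'w') as fh:
    json.dump(example, fh)

doc = load_svtyper_json('example.json')
# prints something like ('SAMPLE1', 5, 0.0): at a 0.05 overlap cutoff,
# deletions shorter than 5 bp cannot be separated from insert-size noise
print(calculate_overlaps(doc, 0.05))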
101 changes: 101 additions & 0 deletions scripts/filter_del.py
@@ -0,0 +1,101 @@
#!/usr/bin/env python

from __future__ import division
import argparse
import sys
from svtools.vcf.file import Vcf
from svtools.vcf.variant import Variant
import svtools.utils as su

class VCFReader(object):
    def __init__(self, stream):
        self.vcf_obj = Vcf()
        self.stream = stream
        header = list()
        for line in stream:
            if line[0] != '#':
                raise RuntimeError('Error parsing VCF header. Line is not a header line. {}'.format(line))
            header.append(line)
            if line.startswith('#CHROM\t'):
                # end of header
                break
        self.vcf_obj.add_header(header)

    def __iter__(self):
        for line in self.stream:
            yield Variant(line.rstrip().split('\t'), self.vcf_obj)


def load_deletion_sizes(stream):
    minimum_del_size = dict()
    for line in stream:
        if line.startswith('Sample\t'):
            continue
        sample, size, overlap = line.rstrip().split('\t')
        if sample not in minimum_del_size:
            minimum_del_size[sample] = abs(int(size))
        else:
            raise RuntimeError('Size for {0} already set. Does your file of sizes include multiple lines with the same sample name?'.format(sample))
    return minimum_del_size, max(minimum_del_size.values())

def set_missing(input_stream, deletion_sizes, output_stream, max_del_size, sr_cutoff):
    valid_types = set(('DEL', 'MEI'))
    for variant in input_stream:
        if variant.get_info('SVTYPE') in valid_types:
            # NOTE this will raise an exception if SVLEN is null
            length = abs(int(variant.get_info('SVLEN')))
            if length < max_del_size:
                split_read_support = 0
                total_depth = 0
                for s in variant.sample_list:
                    g = variant.genotype(s)
                    if g.get_format('GT') not in ('./.', '0/0'):
                        split_read_support += int(g.get_format('AS'))
                        total_depth += int(g.get_format('DP'))
                if total_depth > 0 and (split_read_support / total_depth) < sr_cutoff:
                    # Only set to null if PE support is our only source of
                    # information. Not counting soft-clips here.
                    # This may be a bad idea, as even with SR support the
                    # lack of power to detect PE reads could skew us away from
                    # the correct genotype.
                    # A better method might be to regenotype using only
                    # split-read support if the SV is too small.
                    logged = False
                    for sample in variant.sample_list:
                        if sample in deletion_sizes and length < deletion_sizes[sample]:
                            gt = variant.genotype(sample)
                            gt.set_format('GT', './.')
                            if not logged:
                                sys.stderr.write('Applying small deletion filter to {0}\n'.format(variant.var_id))
                                logged = True

        output_stream.write(variant.get_var_string())
        output_stream.write('\n')

def description():
    return 'set genotypes of deletions smaller than a per-sample cutoff to missing if the sample has no split-read support'

def add_arguments_to_parser(parser):
    parser.add_argument("-i", "--input", required=True, dest="input", metavar='<VCF>', help="VCF file containing variants to be output")
    parser.add_argument("-t", "--thresholds", required=True, dest="threshold_file", metavar='<TXT>', type=argparse.FileType('r'), help="Tab-separated file of sample name and minimum deletion size used to determine if site should be output")
    parser.add_argument("-s", "--split-read-fraction", required=True, dest="sr_cutoff", metavar='<FLOAT>', type=float, help="Minimum fraction of split-read support for the site to be excluded from filtering")
    parser.add_argument('-o', '--output', type=argparse.FileType('w'), metavar='<VCF>', default=sys.stdout, help='output VCF to write (default: stdout)')
    parser.set_defaults(entry_point=run_from_args)

def command_parser():
    parser = argparse.ArgumentParser(description=description())
    add_arguments_to_parser(parser)
    return parser

def run_from_args(args):
    deletion_size_map, max_size = load_deletion_sizes(args.threshold_file)
    with su.InputStream(args.input) as stream:
        variant_stream = VCFReader(stream)
        args.output.write(variant_stream.vcf_obj.get_header())
        args.output.write('\n')
        return set_missing(variant_stream, deletion_size_map, args.output, max_size, args.sr_cutoff)


if __name__ == '__main__':
    parser = command_parser()
    args = parser.parse_args()
    sys.exit(args.entry_point(args))
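
Taken together, the two scripts form a small pipeline: the first computes a per-sample minimum resolvable deletion size from svtyper JSONs and writes the Sample/MinimumSize/Overlap table, and the second uses that table as its --thresholds file to null out under-powered genotypes. A hypothetical invocation (file names and the split-read fraction are placeholders, not from this PR):

python scripts/del_pe_resolution.py sample1.json sample2.json > thresholds.txt
python scripts/filter_del.py -i variants.vcf -t thresholds.txt -s 0.25 -o filtered.vcf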