-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #253 from ernfrid/del_size_filter
Add script to set genotypes for small deletions to missing
- Loading branch information
Showing
2 changed files
with
201 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import division | ||
import json | ||
from collections import Counter | ||
import argparse | ||
|
||
|
||
def calc_insert_density(hist):
    '''Convert a histogram of insert-size counts into a probability density.

    hist is a Counter mapping insert size -> observation count; the returned
    Counter maps each size to its fraction of all observations.
    '''
    n_observations = sum(hist.values())
    density = Counter()
    for size, count in hist.items():
        density[size] = float(count) / n_observations
    return density
|
||
|
||
def overlap(dens, shift):
    '''Shift a density over by "shift" bp and calculate the overlap.

    dens must be a Counter (or other defaultdict-like mapping) so that
    absent sizes read as 0.  Returns the summed pointwise minimum of the
    density and its shifted copy, i.e. the overlapping probability mass.
    '''
    total = 0.0
    # was xrange (Python 2 only); range behaves identically here on 2 and 3
    for x in range(shift, max(dens) + 1):
        total += min(dens[x], dens[x - shift])
    return total
|
||
|
||
def find_overlap(dens, target):
    '''Find amount to shift the density to achieve the target overlap value.

    Starting from the largest useful shift, walk the shift down while the
    overlap stays at or below target.  Returns (shift, overlap) for the
    smallest shift whose overlap did not exceed target.
    '''
    shift = max(dens) - 1
    last = current = overlap(dens, shift)
    while shift >= 0 and current <= target:
        last, shift = current, shift - 1
        current = overlap(dens, shift)
    # shift has moved one step past the answer; report the previous step
    return (shift + 1, last)
|
||
|
||
def load_svtyper_json(json_file):
    '''Read an svtyper JSON file from the given path and return the parsed document.'''
    with open(json_file) as fh:
        return json.load(fh)
|
||
|
||
def create_hist(lib):
    '''Build an integer-keyed Counter from a library's insert-size histogram.

    lib is one entry of an svtyper json libraryArray; its 'histogram' maps
    string sizes to string counts, which are converted to ints here.
    '''
    hist = Counter()
    for size, count in lib['histogram'].items():
        hist[int(size)] = int(count)
    return hist
|
||
|
||
def calculate_overlaps(doc, target):
    '''Calculate the minimum variant size with target discriminating power.

    doc    - parsed svtyper json (sample -> {'libraryArray': [...]})
    target - maximum acceptable overlap between shifted and unshifted densities
    Returns (sample, size, overlap) for the evaluated library.
    '''
    for sample in doc:
        for lib in doc[sample]['libraryArray']:
            hist = create_hist(lib)
            dens = calc_insert_density(hist)
            (size, overlap_prob) = find_overlap(dens, target)
            # NOTE(review): this return executes on the first library of the
            # first sample, so the nested loops never advance past the first
            # entry.  Confirm whether every sample/library was meant to be
            # reported, or only the first.
            return (sample, size, overlap_prob)
|
||
|
||
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Calculate variant size resolution based on cutoff',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('json_file', nargs='+',
        help='svtyper json files to evaluate'
    )
    parser.add_argument('--overlap', type=float, metavar='FLOAT',
        default=0.05,
        help='maximum density of overlap between discordant '
             'and concordant insert size distributions'
    )
    args = parser.parse_args()
    # parenthesized single-argument print works identically on Python 2 and 3
    print('\t'.join(('Sample', 'MinimumSize', 'Overlap')))
    for f in args.json_file:
        doc = load_svtyper_json(f)
        # BUG FIX: previously hard-coded 0.05, silently ignoring --overlap
        results = calculate_overlaps(doc, args.overlap)
        print('\t'.join([str(x) for x in results]))
|
||
|
||
# Here thar be tests | ||
def test_calc_insert_density():
    '''A 4-observation histogram normalizes to fractions summing to 1.'''
    counts = Counter({1: 1, 2: 2, 3: 1})
    assert calc_insert_density(counts) == Counter({1: 0.25, 2: 0.5, 3: 0.25})
|
||
|
||
def test_overlap():
    '''Overlap is the full mass at shift 0 and falls to 0 past the support.'''
    dens = Counter({1: 0.25, 2: 0.5, 3: 0.25})
    for shift, expected in ((0, 1.0), (3, 0.0), (1, 0.5)):
        assert overlap(dens, shift) == expected
|
||
|
||
def test_find_overlap():
    '''find_overlap reports the smallest shift whose overlap met the target.'''
    dens = Counter({1: 0.2, 2: 0.5, 3: 0.3})
    assert find_overlap(dens, 1.0) == (0, 1.0)
    assert find_overlap(dens, 0.5) == (1, 0.5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import division | ||
import argparse | ||
import sys | ||
from svtools.vcf.file import Vcf | ||
from svtools.vcf.variant import Variant | ||
import svtools.utils as su | ||
|
||
class VCFReader(object):
    '''Consume a VCF header from a text stream, then iterate its variants.

    __init__ reads lines up to and including the #CHROM column line and
    feeds them to a Vcf object; iteration then yields Variant records from
    the remainder of the same stream.
    '''
    def __init__(self, stream):
        self.vcf_obj = Vcf()
        self.stream = stream
        header_lines = []
        for raw in stream:
            if raw[0] != '#':
                raise RuntimeError('Error parsing VCF header. Line is not a header line. {}'.format(raw))
            header_lines.append(raw)
            if raw.startswith('#CHROM\t'):
                # the column-header line marks the end of the VCF header
                break
        self.vcf_obj.add_header(header_lines)

    def __iter__(self):
        # continues from wherever __init__ left the underlying stream
        for raw in self.stream:
            yield Variant(raw.rstrip().split('\t'), self.vcf_obj)
|
||
|
||
def load_deletion_sizes(stream):
    '''Parse a tab-separated stream of per-sample minimum deletion sizes.

    Expects lines of "sample<TAB>size<TAB>overlap"; a leading "Sample..."
    header line is skipped.  Returns (dict of sample -> abs(size), largest
    size seen).  Raises RuntimeError on a duplicated sample name.
    '''
    minimum_del_size = dict()
    for raw in stream:
        if raw.startswith('Sample\t'):
            continue  # column-header line from the sizes report
        sample, size, _overlap = raw.rstrip().split('\t')
        if sample in minimum_del_size:
            raise RuntimeError('Size for {0} already set. Does your file of sizes include multiple lines with the same sample name?'.format(sample))
        minimum_del_size[sample] = abs(int(size))
    return minimum_del_size, max(minimum_del_size.values())
|
||
def set_missing(input_stream, deletion_sizes, output_stream, max_del_size, sr_cutoff):
    '''Write every variant to output_stream, nulling genotypes of small deletions.

    For DEL/MEI variants shorter than max_del_size whose aggregate split-read
    fraction is below sr_cutoff, set GT to ./. for each sample whose
    per-sample minimum size exceeds the variant length.

    input_stream   - iterable of Variant objects
    deletion_sizes - dict of sample name -> minimum deletion size
    output_stream  - file-like object receiving the (possibly modified) records
    max_del_size   - largest per-sample minimum; variants at or above it skip filtering
    sr_cutoff      - minimum split-read fraction exempting a site from filtering
    '''
    valid_types = set(('DEL', 'MEI'))
    for variant in input_stream:
        if variant.get_info('SVTYPE') in valid_types:
            # NOTE this will raise an exception if SVLEN is null
            length = abs(int(variant.get_info('SVLEN')))
            if length < max_del_size:
                # Tally AS (split-read support) and DP (depth) over every
                # called, non-hom-ref genotype at this site.
                split_read_support = 0
                total_depth = 0
                for s in variant.sample_list:
                    g = variant.genotype(s)
                    if g.get_format('GT') not in ('./.', '0/0'):
                        split_read_support += int(g.get_format('AS'))
                        total_depth += int(g.get_format('DP'))
                # true division via __future__ import at top of file
                if total_depth > 0 and (split_read_support / total_depth) < sr_cutoff:
                    # Only set to null if PE support is our only source of
                    # information. Not counting soft-clips here.
                    # This may be a bad idea as even with SR support the
                    # lack of power to detect PE reads could skew us away from
                    # the correct genotype.
                    # A better method might be to regenotype using only
                    # split-read support if the SV is too small.
                    logged = False
                    for sample in variant.sample_list:
                        if sample in deletion_sizes and length < deletion_sizes[sample]:
                            gt = variant.genotype(sample)
                            gt.set_format('GT', './.')
                            if not logged:
                                # log once per variant, not once per sample
                                sys.stderr.write('Applying small deletion filter to {0}\n'.format(variant.var_id))
                                logged=True

        # every variant is written out, filtered or not
        output_stream.write(variant.get_var_string())
        output_stream.write('\n')
|
||
def description():
    '''Return the one-line tool description used by the argument parser.'''
    return ('set genotypes of deletions smaller than a per-sample cutoff '
            'to missing if no splitread support in the sample')
|
||
def add_arguments_to_parser(parser): | ||
parser.add_argument("-i", "--input", required=True, dest="input", metavar='<VCF>', help="VCF file containing variants to be output") | ||
parser.add_argument("-t", "--thresholds", required=True, dest="threshold_file", metavar='<TXT>', type=argparse.FileType('r'), help="Tab-separated file of sample name and minimum deletion size used to determine if site should be output") | ||
parser.add_argument("-s", "--split-read-fraction", required=True, dest="sr_cutoff", metavar='<FLOAT>', type=float, help="Minimum fraction of split read support for the site to be excluded from filtering.") | ||
parser.add_argument('-o', '--output', type=argparse.FileType('w'), metavar='<VCF>', default=sys.stdout, help='output VCF to write (default: stdout)') | ||
parser.set_defaults(entry_point=run_from_args) | ||
|
||
def command_parser():
    '''Build a stand-alone ArgumentParser for running this tool directly.'''
    standalone = argparse.ArgumentParser(description=description())
    add_arguments_to_parser(standalone)
    return standalone
|
||
def run_from_args(args):
    '''Entry point: wire parsed arguments into the filtering pipeline.'''
    deletion_size_map, max_size = load_deletion_sizes(args.threshold_file)
    with su.InputStream(args.input) as input_handle:
        reader = VCFReader(input_handle)
        # emit the (unmodified) header before streaming variant records
        args.output.write(reader.vcf_obj.get_header())
        args.output.write('\n')
        return set_missing(reader, deletion_size_map, args.output, max_size, args.sr_cutoff)
|
||
if __name__ == '__main__':
    cli_args = command_parser().parse_args()
    # entry_point is set by add_arguments_to_parser (run_from_args)
    sys.exit(cli_args.entry_point(cli_args))