diff --git a/scripts/del_pe_resolution.py b/scripts/del_pe_resolution.py
new file mode 100644
index 00000000..14913037
--- /dev/null
+++ b/scripts/del_pe_resolution.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python

+
+from __future__ import division
+import json
+from collections import Counter
+import argparse
+
+
+def calc_insert_density(hist):
+    '''Transform a histogram of counts to a density'''
+    dens = Counter()
+    total = sum(hist.values())
+    for i in list(hist):
+        dens[i] = float(hist[i]) / total
+    return dens
+
+
+def overlap(dens, shift):
+    '''Shift a density over by "shift" bp and calculate the overlap'''
+    total = 0.0
+    for x in xrange(shift, max(dens) + 1):
+        total += min(dens[x], dens[x - shift])
+    return total
+
+
+def find_overlap(dens, target):
+    '''Find the smallest shift (bp) at which the density overlap does not exceed the target'''
+    shift = max(dens) - 1
+    current = overlap(dens, shift)
+    last = current
+    while shift >= 0 and current <= target:
+        last = current
+        shift -= 1
+        current = overlap(dens, shift)
+    return (shift + 1, last)
+
+
+def load_svtyper_json(json_file):
+    '''Load an svtyper json'''
+    with open(json_file) as f:
+        doc = json.load(f)
+    return doc
+
+
+def create_hist(lib):
+    '''Create a histogram from svtyper json information'''
+    return Counter({
+        int(k): int(v) for k, v in lib['histogram'].items()
+    })
+
+
+def calculate_overlaps(doc, target):
+    '''Calculate the minimum variant size with target discriminating power'''
+    for sample in doc:
+        for lib in doc[sample]['libraryArray']:
+            hist = create_hist(lib)
+            dens = calc_insert_density(hist)
+            (size, overlap_prob) = find_overlap(dens, target)
+            return (sample, size, overlap_prob)  # NOTE only the first library of the first sample is evaluated
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Calculate variant size resolution based on cutoff',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument('json_file', nargs='+',
+                        help='svtyper json files to evaluate'
+                        )
+    parser.add_argument('--overlap', type=float, metavar='FLOAT',
+                        default=0.05,
+                        help='maximum density of overlap between discordant '
+                             'and concordant insert size distributions'
+                        )
+    args = parser.parse_args()
+    print '\t'.join(('Sample', 'MinimumSize', 'Overlap'))
+    for f in args.json_file:
+        doc = load_svtyper_json(f)
+        results = calculate_overlaps(doc, args.overlap)
+        print '\t'.join([str(x) for x in results])
+
+
+# Here thar be tests
+def test_calc_insert_density():
+    t = Counter({1: 1, 2: 2, 3: 1})
+    expected = Counter({1: 0.25, 2: 0.5, 3: 0.25})
+    assert(calc_insert_density(t) == expected)
+
+
+def test_overlap():
+    t = Counter({1: 0.25, 2: 0.5, 3: 0.25})
+    assert(overlap(t, 0) == 1.0)
+    assert(overlap(t, 3) == 0.0)
+    assert(overlap(t, 1) == 0.5)
+
+
+def test_find_overlap():
+    t = Counter({1: 0.2, 2: 0.5, 3: 0.3})
+    assert(find_overlap(t, 1.0) == (0, 1.0))
+    assert(find_overlap(t, 0.5) == (1, 0.5))
diff --git a/scripts/filter_del.py b/scripts/filter_del.py
new file mode 100644
index 00000000..27977a8d
--- /dev/null
+++ b/scripts/filter_del.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+from __future__ import division
+import argparse
+import sys
+from svtools.vcf.file import Vcf
+from svtools.vcf.variant import Variant
+import svtools.utils as su
+
+class VCFReader(object):
+    def __init__(self, stream):
+        self.vcf_obj = Vcf()
+        self.stream = stream
+        header = list()
+        for line in stream:
+            if line[0] != '#':
+                raise RuntimeError('Error parsing VCF header. Line is not a header line. {}'.format(line))
+            header.append(line)
+            if line.startswith('#CHROM\t'):
+                # end of header
+                break
+        self.vcf_obj.add_header(header)
+
+    def __iter__(self):
+        for line in self.stream:
+            yield Variant(line.rstrip().split('\t'), self.vcf_obj)
+
+
+def load_deletion_sizes(stream):
+    minimum_del_size = dict()
+    for line in stream:
+        if line.startswith('Sample\t'):
+            continue
+        sample, size, overlap = line.rstrip().split('\t')
+        if sample not in minimum_del_size:
+            minimum_del_size[sample] = abs(int(size))
+        else:
+            raise RuntimeError('Size for {0} already set. Does your file of sizes include multiple lines with the same sample name?'.format(sample))
+    return minimum_del_size, max(minimum_del_size.values())
+
+def set_missing(input_stream, deletion_sizes, output_stream, max_del_size, sr_cutoff):
+    valid_types = set(('DEL', 'MEI'))
+    for variant in input_stream:
+        if variant.get_info('SVTYPE') in valid_types:
+            # NOTE this will raise an exception if SVLEN is null
+            length = abs(int(variant.get_info('SVLEN')))
+            if length < max_del_size:
+                split_read_support = 0
+                total_depth = 0
+                for s in variant.sample_list:
+                    g = variant.genotype(s)
+                    if g.get_format('GT') not in ('./.', '0/0'):
+                        split_read_support += int(g.get_format('AS'))  # svtyper alt-allele split-read count
+                        total_depth += int(g.get_format('DP'))  # svtyper read depth
+                if total_depth > 0 and (split_read_support / total_depth) < sr_cutoff:
+                    # Only set to null if PE support is our only source of
+                    # information. Not counting soft-clips here.
+                    # This may be a bad idea as even with SR support the
+                    # lack of power to detect PE reads could skew us away from
+                    # the correct genotype.
+                    # A better method might be to regenotype using only
+                    # split-read support if the SV is too small.
+                    logged = False
+                    for sample in variant.sample_list:
+                        if sample in deletion_sizes and length < deletion_sizes[sample]:
+                            gt = variant.genotype(sample)
+                            gt.set_format('GT', './.')
+                            if not logged:
+                                sys.stderr.write('Applying small deletion filter to {0}\n'.format(variant.var_id))
+                                logged = True
+
+        output_stream.write(variant.get_var_string())
+        output_stream.write('\n')
+
+def description():
+    return 'set genotypes of deletions smaller than a per-sample cutoff to missing when split-read support is insufficient'
+
+def add_arguments_to_parser(parser):
+    parser.add_argument("-i", "--input", required=True, dest="input", metavar='<VCF>', help="VCF file containing variants to be output")
+    parser.add_argument("-t", "--thresholds", required=True, dest="threshold_file", metavar='<FILE>', type=argparse.FileType('r'), help="Tab-separated file of sample name and minimum deletion size, used to decide whether a sample's genotype should be set to missing")
+    parser.add_argument("-s", "--split-read-fraction", required=True, dest="sr_cutoff", metavar='<FLOAT>', type=float, help="Minimum fraction of split-read support for the site to be excluded from filtering")
+    parser.add_argument('-o', '--output', type=argparse.FileType('w'), metavar='<VCF>', default=sys.stdout, help='output VCF to write (default: stdout)')
+    parser.set_defaults(entry_point=run_from_args)
+
+def command_parser():
+    parser = argparse.ArgumentParser(description=description())
+    add_arguments_to_parser(parser)
+    return parser
+
+def run_from_args(args):
+    deletion_size_map, max_size = load_deletion_sizes(args.threshold_file)
+    with su.InputStream(args.input) as stream:
+        variant_stream = VCFReader(stream)
+        args.output.write(variant_stream.vcf_obj.get_header())
+        args.output.write('\n')
+        return set_missing(variant_stream, deletion_size_map, args.output, max_size, args.sr_cutoff)
+
+if __name__ == '__main__':
+    parser = command_parser()
+    args = parser.parse_args()
+    sys.exit(args.entry_point(args))
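
Usage sketch (file names and the -s value are illustrative, not part of the diff; del_pe_resolution.py prints the Sample/MinimumSize/Overlap table that filter_del.py consumes via -t):

    python scripts/del_pe_resolution.py NA12878.json > thresholds.txt
    python scripts/filter_del.py -i calls.vcf -t thresholds.txt -s 0.1 -o filtered.vcf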