-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #253 from ernfrid/del_size_filter
Add script to set genotypes for small deletions to missing
- Loading branch information
Showing
2 changed files
with
201 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import division | ||
import json | ||
from collections import Counter | ||
import argparse | ||
|
||
|
||
def calc_insert_density(hist):
    '''Convert a histogram of insert-size counts into a probability density.

    hist is a Counter mapping insert size -> observation count; the returned
    Counter maps each size to its fraction of all observations.
    '''
    n_observations = sum(hist.values())
    density = Counter()
    for size, count in hist.items():
        density[size] = float(count) / n_observations
    return density
|
||
|
||
def overlap(dens, shift):
    '''Shift a density over by "shift" bp and calculate the overlap.

    dens must be a Counter (or other defaultdict-like mapping) so that
    absent sizes read as 0.  Returns the summed pointwise minimum of the
    density and its shifted copy, i.e. the overlapping probability mass.
    '''
    total = 0.0
    # was xrange (Python 2 only); range behaves identically here on 2 and 3
    for x in range(shift, max(dens) + 1):
        total += min(dens[x], dens[x - shift])
    return total
|
||
|
||
def find_overlap(dens, target):
    '''Find amount to shift the density to achieve the target overlap value.

    Starting from the largest useful shift, walk the shift down while the
    overlap stays at or below target.  Returns (shift, overlap) for the
    smallest shift whose overlap did not exceed target.
    '''
    shift = max(dens) - 1
    last = current = overlap(dens, shift)
    while shift >= 0 and current <= target:
        last, shift = current, shift - 1
        current = overlap(dens, shift)
    # shift has moved one step past the answer; report the previous step
    return (shift + 1, last)
|
||
|
||
def load_svtyper_json(json_file):
    '''Read an svtyper JSON file from the given path and return the parsed document.'''
    with open(json_file) as fh:
        return json.load(fh)
|
||
|
||
def create_hist(lib):
    '''Build an integer-keyed Counter from a library's insert-size histogram.

    lib is one entry of an svtyper json libraryArray; its 'histogram' maps
    string sizes to string counts, which are converted to ints here.
    '''
    hist = Counter()
    for size, count in lib['histogram'].items():
        hist[int(size)] = int(count)
    return hist
|
||
|
||
def calculate_overlaps(doc, target):
    '''Calculate the minimum variant size with target discriminating power.

    doc    - parsed svtyper json (sample -> {'libraryArray': [...]})
    target - maximum acceptable overlap between shifted and unshifted densities
    Returns (sample, size, overlap) for the evaluated library.
    '''
    for sample in doc:
        for lib in doc[sample]['libraryArray']:
            hist = create_hist(lib)
            dens = calc_insert_density(hist)
            (size, overlap_prob) = find_overlap(dens, target)
            # NOTE(review): this return executes on the first library of the
            # first sample, so the nested loops never advance past the first
            # entry.  Confirm whether every sample/library was meant to be
            # reported, or only the first.
            return (sample, size, overlap_prob)
|
||
|
||
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Calculate variant size resolution based on cutoff',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('json_file', nargs='+',
        help='svtyper json files to evaluate'
    )
    parser.add_argument('--overlap', type=float, metavar='FLOAT',
        default=0.05,
        help='maximum density of overlap between discordant '
             'and concordant insert size distributions'
    )
    args = parser.parse_args()
    # parenthesized single-argument print works identically on Python 2 and 3
    print('\t'.join(('Sample', 'MinimumSize', 'Overlap')))
    for f in args.json_file:
        doc = load_svtyper_json(f)
        # BUG FIX: previously hard-coded 0.05, silently ignoring --overlap
        results = calculate_overlaps(doc, args.overlap)
        print('\t'.join([str(x) for x in results]))
|
||
|
||
# Here thar be tests | ||
def test_calc_insert_density():
    '''A 4-observation histogram normalizes to fractions summing to 1.'''
    counts = Counter({1: 1, 2: 2, 3: 1})
    assert calc_insert_density(counts) == Counter({1: 0.25, 2: 0.5, 3: 0.25})
|
||
|
||
def test_overlap():
    '''Overlap is the full mass at shift 0 and falls to 0 past the support.'''
    dens = Counter({1: 0.25, 2: 0.5, 3: 0.25})
    for shift, expected in ((0, 1.0), (3, 0.0), (1, 0.5)):
        assert overlap(dens, shift) == expected
|
||
|
||
def test_find_overlap():
    '''find_overlap reports the smallest shift whose overlap met the target.'''
    dens = Counter({1: 0.2, 2: 0.5, 3: 0.3})
    assert find_overlap(dens, 1.0) == (0, 1.0)
    assert find_overlap(dens, 0.5) == (1, 0.5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import division | ||
import argparse | ||
import sys | ||
from svtools.vcf.file import Vcf | ||
from svtools.vcf.variant import Variant | ||
import svtools.utils as su | ||
|
||
class VCFReader(object):
    '''Consume a VCF header from a text stream, then iterate its variants.

    __init__ reads lines up to and including the #CHROM column line and
    feeds them to a Vcf object; iteration then yields Variant records from
    the remainder of the same stream.
    '''
    def __init__(self, stream):
        self.vcf_obj = Vcf()
        self.stream = stream
        header_lines = []
        for raw in stream:
            if raw[0] != '#':
                raise RuntimeError('Error parsing VCF header. Line is not a header line. {}'.format(raw))
            header_lines.append(raw)
            if raw.startswith('#CHROM\t'):
                # the column-header line marks the end of the VCF header
                break
        self.vcf_obj.add_header(header_lines)

    def __iter__(self):
        # continues from wherever __init__ left the underlying stream
        for raw in self.stream:
            yield Variant(raw.rstrip().split('\t'), self.vcf_obj)
|
||
|
||
def load_deletion_sizes(stream):
    '''Parse a tab-separated stream of per-sample minimum deletion sizes.

    Expects lines of "sample<TAB>size<TAB>overlap"; a leading "Sample..."
    header line is skipped.  Returns (dict of sample -> abs(size), largest
    size seen).  Raises RuntimeError on a duplicated sample name.
    '''
    minimum_del_size = dict()
    for raw in stream:
        if raw.startswith('Sample\t'):
            continue  # column-header line from the sizes report
        sample, size, _overlap = raw.rstrip().split('\t')
        if sample in minimum_del_size:
            raise RuntimeError('Size for {0} already set. Does your file of sizes include multiple lines with the same sample name?'.format(sample))
        minimum_del_size[sample] = abs(int(size))
    return minimum_del_size, max(minimum_del_size.values())
|
||
def set_missing(input_stream, deletion_sizes, output_stream, max_del_size, sr_cutoff):
    '''Write every variant to output_stream, nulling genotypes of small deletions.

    For DEL/MEI variants shorter than max_del_size whose aggregate split-read
    fraction is below sr_cutoff, set GT to ./. for each sample whose
    per-sample minimum size exceeds the variant length.

    input_stream   - iterable of Variant objects
    deletion_sizes - dict of sample name -> minimum deletion size
    output_stream  - file-like object receiving the (possibly modified) records
    max_del_size   - largest per-sample minimum; variants at or above it skip filtering
    sr_cutoff      - minimum split-read fraction exempting a site from filtering
    '''
    valid_types = set(('DEL', 'MEI'))
    for variant in input_stream:
        if variant.get_info('SVTYPE') in valid_types:
            # NOTE this will raise an exception if SVLEN is null
            length = abs(int(variant.get_info('SVLEN')))
            if length < max_del_size:
                # Tally AS (split-read support) and DP (depth) over every
                # called, non-hom-ref genotype at this site.
                split_read_support = 0
                total_depth = 0
                for s in variant.sample_list:
                    g = variant.genotype(s)
                    if g.get_format('GT') not in ('./.', '0/0'):
                        split_read_support += int(g.get_format('AS'))
                        total_depth += int(g.get_format('DP'))
                # true division via __future__ import at top of file
                if total_depth > 0 and (split_read_support / total_depth) < sr_cutoff:
                    # Only set to null if PE support is our only source of
                    # information. Not counting soft-clips here.
                    # This may be a bad idea as even with SR support the
                    # lack of power to detect PE reads could skew us away from
                    # the correct genotype.
                    # A better method might be to regenotype using only
                    # split-read support if the SV is too small.
                    logged = False
                    for sample in variant.sample_list:
                        if sample in deletion_sizes and length < deletion_sizes[sample]:
                            gt = variant.genotype(sample)
                            gt.set_format('GT', './.')
                            if not logged:
                                # log once per variant, not once per sample
                                sys.stderr.write('Applying small deletion filter to {0}\n'.format(variant.var_id))
                                logged=True

        # every variant is written out, filtered or not
        output_stream.write(variant.get_var_string())
        output_stream.write('\n')
|
||
def description():
    '''Return the one-line tool description used by the argument parser.'''
    return ('set genotypes of deletions smaller than a per-sample cutoff '
            'to missing if no splitread support in the sample')
|
||
def add_arguments_to_parser(parser): | ||
parser.add_argument("-i", "--input", required=True, dest="input", metavar='<VCF>', help="VCF file containing variants to be output") | ||
parser.add_argument("-t", "--thresholds", required=True, dest="threshold_file", metavar='<TXT>', type=argparse.FileType('r'), help="Tab-separated file of sample name and minimum deletion size used to determine if site should be output") | ||
parser.add_argument("-s", "--split-read-fraction", required=True, dest="sr_cutoff", metavar='<FLOAT>', type=float, help="Minimum fraction of split read support for the site to be excluded from filtering.") | ||
parser.add_argument('-o', '--output', type=argparse.FileType('w'), metavar='<VCF>', default=sys.stdout, help='output VCF to write (default: stdout)') | ||
parser.set_defaults(entry_point=run_from_args) | ||
|
||
def command_parser():
    '''Build a stand-alone ArgumentParser for running this tool directly.'''
    standalone = argparse.ArgumentParser(description=description())
    add_arguments_to_parser(standalone)
    return standalone
|
||
def run_from_args(args):
    '''Entry point: wire parsed arguments into the filtering pipeline.'''
    deletion_size_map, max_size = load_deletion_sizes(args.threshold_file)
    with su.InputStream(args.input) as input_handle:
        reader = VCFReader(input_handle)
        # emit the (unmodified) header before streaming variant records
        args.output.write(reader.vcf_obj.get_header())
        args.output.write('\n')
        return set_missing(reader, deletion_size_map, args.output, max_size, args.sr_cutoff)
|
||
if __name__ == '__main__':
    cli_args = command_parser().parse_args()
    # entry_point is set by add_arguments_to_parser (run_from_args)
    sys.exit(cli_args.entry_point(cli_args))