From da0d7c07ca9a7da8f550754ef938cfec8395ba71 Mon Sep 17 00:00:00 2001 From: datagram Date: Thu, 6 Feb 2014 01:08:21 -0800 Subject: [PATCH 1/7] Fix for issue #140, add vcf_record_sort_key arg - Added 'vcf_record_sort_key' to allow user to specify arbitrary chromosome ordering. - Fixed issue #140 by making sure to emit all records from the current chromosome before moving on to the next one. This takes care of the problem in most typical cases (eg. when all files have records for all contigs), but not in some edge cases, in which case the 'vcf_record_sort_key' arg can be used to fully solve the problem by explicitly defining the chromosome order. --- vcf/utils.py | 85 ++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/vcf/utils.py b/vcf/utils.py index 0ab04ca..c4dea1a 100644 --- a/vcf/utils.py +++ b/vcf/utils.py @@ -2,51 +2,50 @@ Utilities for VCF files. """ -import operator +def walk_together(*readers, **kwargs): + """ Simultaneously iteratate two or more VCF readers and return + lists of concurrent records from each + reader, with None if no record present. Caller must check the + inputs are sorted in the same way and use the same reference + otherwise behaviour is undefined. + + Args: + vcf_record_sort_key: function that takes a VCF record and returns a tuple that can be used as the key for comparing and sorting VCF records across all given VCFReaders. The tuple's 1st element should be the contig name. + """ + if 'vcf_record_sort_key' in kwargs: + get_key = kwargs['vcf_record_sort_key'] + else: + get_key = lambda r: (r.CHROM, r.POS) + + nexts = [] + for reader in readers: + try: + nexts.append(reader.next()) + except StopIteration: + nexts.append(None) + min_k = (None,) # keep track of the previous min key's contig + while True: + kdict = {i: get_key(x) for i,x in enumerate(nexts) if x is not None} + keys_with_prev_contig = [k for k in kdict.values() if k[0] == min_k[0]] + if any(keys_with_prev_contig): + # finish all records from previous contig + min_k = min(keys_with_prev_contig) + else: + # move on to the next contig + min_k = min(kdict.values()) + + min_k_idxs = set([i for i, k in kdict.items() if k == min_k]) + yield [nexts[i] if i in min_k_idxs else None for i in range(len(nexts))] -def walk_together(*readers, **kwargs): - """ Simultaneously iteratate two or more VCF readers and return - lists of concurrent records from each - reader, with None if no record present. Caller must check the - inputs are sorted in the same way and use the same reference - otherwise behaviour is undefined. - """ - # if defined, custom equality functions must take the same arguments - # as operator.eq - if 'eq_func' in kwargs: - eq_func = kwargs['eq_func'] - # by default, we use the equality operator (==), which compares - # equality in CHROM, POS, REF, and ALT - else: - eq_func = operator.eq - - # if one of the VCFs has no records, StopIteration is - # raised immediately, so we need to check for that and - # deal appropriately - nexts = [] - for reader in readers: - try: - nexts.append(reader.next()) - except StopIteration: - nexts.append(None) - - while True: - min_next = min([x for x in nexts if x is not None]) - - yield [x if x is None or eq_func(x, min_next) else None for x in nexts] - - # update nexts that we just yielded - for i, n in enumerate(nexts): - - if n is not None and eq_func(n, min_next): - try: - nexts[i] = readers[i].next() - except StopIteration: - nexts[i] = None - - if all([x is None for x in nexts]): - break + for i in min_k_idxs: + try: + nexts[i] = readers[i].next() + except StopIteration: + nexts[i] = None + + if all([x is None for x in nexts]): + break def trim_common_suffix(*sequences): From a7097fe99195b56d37dcb431d699ebff0a5d8a1d Mon Sep 17 00:00:00 2001 From: Ben Weisburd Date: Thu, 6 Feb 2014 13:35:25 -0800 Subject: [PATCH 2/7] Fixed spacing and wrapping in utils.py, removed test for old walk_together arg (eq function), fixed edge case in _AltRecord --- vcf/model.py | 2 +- vcf/test/test_vcf.py | 63 +++++++++++++++--------------- vcf/utils.py | 93 ++++++++++++++++++++++++-------------------- 3 files changed, 84 insertions(+), 74 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index 09013dd..b057a4f 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -447,7 +447,7 @@ def __str__(self): raise NotImplementedError def __eq__(self, other): - return self.type == other.type + return self.type == getattr(other, 'type', None) class _Substitution(_AltRecord): diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 8fcd2a8..849c2b8 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -20,7 +20,7 @@ class TestVcfSpecs(unittest.TestCase): def test_vcf_4_0(self): reader = vcf.Reader(fh('example-4.0.vcf')) - assert reader.metadata['fileformat'] == 'VCFv4.0' + self.assertEqual(reader.metadata['fileformat'], 'VCFv4.0') # test we can walk the file at least for r in reader: @@ -81,21 +81,21 @@ def test_vcf_4_1_bnd(self): for a in r.ALT: print(a) if r.ID == "bnd1": - assert len(r.ALT) == 1 - assert r.ALT[0].type == "BND" - assert r.ALT[0].chr == "2" - assert r.ALT[0].pos == 3 - assert r.ALT[0].orientation == False - assert r.ALT[0].remoteOrientation == True - assert r.ALT[0].connectingSequence == "T" + self.assertEqual(len(r.ALT), 1) + self.assertEqual(r.ALT[0].type, "BND") + self.assertEqual(r.ALT[0].chr, "2") + self.assertEqual(r.ALT[0].pos, 3) + self.assertEqual(r.ALT[0].orientation, False) + self.assertEqual(r.ALT[0].remoteOrientation, True) + self.assertEqual(r.ALT[0].connectingSequence, "T") if r.ID == "bnd4": - assert len(r.ALT) == 1 - assert r.ALT[0].type == "BND" - assert r.ALT[0].chr == "1" - assert r.ALT[0].pos == 2 - assert r.ALT[0].orientation == True - assert r.ALT[0].remoteOrientation == False - assert r.ALT[0].connectingSequence == "G" + self.assertEqual(len(r.ALT), 1) + self.assertEqual(r.ALT[0].type, "BND") + self.assertEqual(r.ALT[0].chr, "1") + self.assertEqual(r.ALT[0].pos, 2) + self.assertEqual(r.ALT[0].orientation, True) + self.assertEqual(r.ALT[0].remoteOrientation, False) + self.assertEqual(r.ALT[0].connectingSequence, "G") for c in r: print(c) assert c @@ -165,7 +165,7 @@ def testParse(self): n+=1 for x in r: assert x - assert n == self.n_calls + self.assertEqual(n, self.n_calls) class TestSamtoolsOutput(unittest.TestCase): @@ -728,7 +728,7 @@ def test_info_multiple_values(self): def test_pickle(self): reader = vcf.Reader(fh('example-4.0.vcf')) for var in reader: - assert cPickle.loads(cPickle.dumps(var)) == var + self.assertEqual(cPickle.loads(cPickle.dumps(var)), var) class TestCall(unittest.TestCase): @@ -809,7 +809,7 @@ def testFetchSite(self): if not self.run: return site = self.reader.fetch('20', 14370) - assert site.POS == 14370 + self.assertEqual(site.POS, 14370) site = self.reader.fetch('20', 14369) assert site is None @@ -851,7 +851,7 @@ def testApplyFilter(self): return s, out = commands.getstatusoutput('python scripts/vcf_filter.py --site-quality 30 test/example-4.0.vcf sq') #print(out) - assert s == 0 + self.assertEqual(s, 0) buf = StringIO() buf.write(out) buf.seek(0) @@ -873,7 +873,7 @@ def testApplyFilter(self): n += 1 else: assert 'sq30' not in r.FILTER - assert n == 2 + self.assertEqual(n, 2) def testApplyMultipleFilters(self): @@ -881,7 +881,7 @@ def testApplyMultipleFilters(self): return s, out = commands.getstatusoutput('python scripts/vcf_filter.py --site-quality 30 ' '--genotype-quality 50 test/example-4.0.vcf sq mgq') - assert s == 0 + self.assertEqual(s, 0) #print(out) buf = StringIO() buf.write(out) @@ -927,10 +927,11 @@ def test_walk(self): n = 0 for x in utils.walk_together(reader1, reader2, reader3): - assert len(x) == 3 - assert (x[0] == x[1]) and (x[1] == x[2]) + self.assertEqual(len(x), 3) + self.assertEqual(x[0], x[1]) + self.assertEqual(x[1], x[2]) n+= 1 - assert n == 5 + self.assertEqual(n, 5) # artificial case 2 from the left, 2 from the right, 2 together, 1 from the right, 1 from the left @@ -956,8 +957,8 @@ def test_walk(self): reader1 = vcf.Reader(fh('example-4.0.vcf')) reader2 = vcf.Reader(fh('walk_refcall.vcf')) - self.assertRaises(AttributeError, next, - utils.walk_together(reader1, reader2)) + #self.assertRaises(AttributeError, next, + # utils.walk_together(reader1, reader2)) # with custom function, iteration works @@ -973,7 +974,7 @@ def custom_eq(rec1, rec2): nrecs, ncomps = 0, 0 for x in utils.walk_together(reader1, reader2, eq_func=custom_eq): - assert len(x) == 2 + self.assertEqual(len(x), 2) # avoid assert() when one record is None if x[0] is not None and x[1] is not None: assert (custom_eq(x[0], x[1]) and custom_eq(x[1], x[0])) @@ -982,9 +983,9 @@ def custom_eq(rec1, rec2): # records nrecs += 1 # check number of records total - assert nrecs == 5 + self.assertEqual(nrecs, 5) # check how many records found in all files - assert ncomps == 4 + self.assertEqual(ncomps, 4) def test_trim(self): tests = [('TAA GAA', 'T G'), @@ -1005,8 +1006,8 @@ def test_meta(self): # expect no exceptions raised reader = vcf.Reader(fh('gatk_26_meta.vcf')) assert 'GATKCommandLine' in reader.metadata - assert reader.metadata['GATKCommandLine'][0]['CommandLineOptions'] == '"analysis_type=LeftAlignAndTrimVariants"' - assert reader.metadata['GATKCommandLine'][1]['CommandLineOptions'] == '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"' + self.assertEqual(reader.metadata['GATKCommandLine'][0]['CommandLineOptions'], '"analysis_type=LeftAlignAndTrimVariants"') + self.assertEqual(reader.metadata['GATKCommandLine'][1]['CommandLineOptions'], '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"') suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestVcfSpecs)) diff --git a/vcf/utils.py b/vcf/utils.py index c4dea1a..09a6668 100644 --- a/vcf/utils.py +++ b/vcf/utils.py @@ -2,50 +2,59 @@ Utilities for VCF files. """ -def walk_together(*readers, **kwargs): - """ Simultaneously iteratate two or more VCF readers and return - lists of concurrent records from each - reader, with None if no record present. Caller must check the - inputs are sorted in the same way and use the same reference - otherwise behaviour is undefined. - - Args: - vcf_record_sort_key: function that takes a VCF record and returns a tuple that can be used as the key for comparing and sorting VCF records across all given VCFReaders. The tuple's 1st element should be the contig name. - """ - if 'vcf_record_sort_key' in kwargs: - get_key = kwargs['vcf_record_sort_key'] - else: - get_key = lambda r: (r.CHROM, r.POS) - - nexts = [] - for reader in readers: - try: - nexts.append(reader.next()) - except StopIteration: - nexts.append(None) +def walk_together(*readers, **kwargs): + """ + Simultaneously iteratate over two or more VCF readers. For each + genomic position with a variant, return a list of size equal to the number + of VCF readers. This list contains the VCF record from readers that have + this variant, and None for readers that don't have it. + The caller must make sure that inputs are sorted in the same way and use the + same reference otherwise behaviour is undefined. + + Args: + vcf_record_sort_key: function that takes a VCF record and returns a + tuple that can be used as a key for comparing and sorting VCF + records across all readers. This tuple defines what it means for two + variants to be equal (eg. whether it's only their position or also + their allele values), and implicitly determines the chromosome + ordering since the tuple's 1st element is typically the chromosome + name (or calculated from it). + """ + if 'vcf_record_sort_key' in kwargs: + get_key = kwargs['vcf_record_sort_key'] + else: + get_key = lambda r: (r.CHROM, r.POS) #, r.REF, r.ALT) + + nexts = [] + for reader in readers: + try: + nexts.append(reader.next()) + except StopIteration: + nexts.append(None) + + min_k = (None,) # keep track of the previous min key's contig + while True: + next_idx_to_k = dict( + (i, get_key(r)) for i, r in enumerate(nexts) if r is not None) + keys_with_prev_contig = [ + k for k in next_idx_to_k.values() if k[0] == min_k[0]] + + if any(keys_with_prev_contig): + min_k = min(keys_with_prev_contig) # finish previous contig + else: + min_k = min(next_idx_to_k.values()) # move on to next contig - min_k = (None,) # keep track of the previous min key's contig - while True: - kdict = {i: get_key(x) for i,x in enumerate(nexts) if x is not None} - keys_with_prev_contig = [k for k in kdict.values() if k[0] == min_k[0]] - if any(keys_with_prev_contig): - # finish all records from previous contig - min_k = min(keys_with_prev_contig) - else: - # move on to the next contig - min_k = min(kdict.values()) - - min_k_idxs = set([i for i, k in kdict.items() if k == min_k]) - yield [nexts[i] if i in min_k_idxs else None for i in range(len(nexts))] + min_k_idxs = set([i for i, k in next_idx_to_k.items() if k == min_k]) + yield [nexts[i] if i in min_k_idxs else None for i in range(len(nexts))] - for i in min_k_idxs: - try: - nexts[i] = readers[i].next() - except StopIteration: - nexts[i] = None - - if all([x is None for x in nexts]): - break + for i in min_k_idxs: + try: + nexts[i] = readers[i].next() + except StopIteration: + nexts[i] = None + + if all([r is None for r in nexts]): + break def trim_common_suffix(*sequences): From be60cd22e978be4e7b6c6ca8b7e4709114de330e Mon Sep 17 00:00:00 2001 From: Ben Weisburd Date: Thu, 6 Feb 2014 14:16:19 -0800 Subject: [PATCH 3/7] Fixed edge case where all inputs are empty, simplified logic --- vcf/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vcf/utils.py b/vcf/utils.py index 09a6668..456e5fa 100644 --- a/vcf/utils.py +++ b/vcf/utils.py @@ -33,7 +33,7 @@ def walk_together(*readers, **kwargs): nexts.append(None) min_k = (None,) # keep track of the previous min key's contig - while True: + while any([r is not None for r in nexts]): next_idx_to_k = dict( (i, get_key(r)) for i, r in enumerate(nexts) if r is not None) keys_with_prev_contig = [ @@ -52,9 +52,6 @@ def walk_together(*readers, **kwargs): nexts[i] = readers[i].next() except StopIteration: nexts[i] = None - - if all([r is None for r in nexts]): - break def trim_common_suffix(*sequences): From 6a204d0dcb42d797f802d5df1e9500a6252e1fb4 Mon Sep 17 00:00:00 2001 From: datagram Date: Fri, 7 Feb 2014 03:20:20 -0800 Subject: [PATCH 4/7] finished fixing edge case where 'other' is None --- vcf/model.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index b057a4f..f23353c 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -36,9 +36,9 @@ def __eq__(self, other): """ Two _Calls are equal if their _Records are equal and the samples and ``gt_type``s are the same """ - return (self.site == other.site - and self.sample == other.sample - and self.gt_type == other.gt_type) + return (self.site == getattr(other, "site", None) + and self.sample == getattr(other, "sample", None) + and self.gt_type == getattr(other, "gt_type", None)) def __getstate__(self): return dict((attr, getattr(self, attr)) for attr in self.__slots__) @@ -150,19 +150,19 @@ def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, # For Python 2 def __cmp__(self, other): - return cmp((self.CHROM, self.POS), (other.CHROM, other.POS)) + return cmp((self.CHROM, self.POS), (getattr(other, "CHROM", None), getattr(other, "POS", None))) # For Python 3 def __eq__(self, other): """ _Records are equal if they describe the same variant (same position, alleles) """ - return (self.CHROM == other.CHROM and - self.POS == other.POS and - self.REF == other.REF and - self.ALT == other.ALT) + return (self.CHROM == getattr(other, "CHROM", None) and + self.POS == getattr(other, "POS", None) and + self.REF == getattr(other, "REF", None) and + self.ALT == getattr(other, "ALT", None)) # For Python 3 def __lt__(self, other): - return (self.CHROM, self.POS) < (other.CHROM, other.POS) + return (self.CHROM, self.POS) < (getattr(other, "CHROM", None), getattr(other, "POS", None)) def __iter__(self): return iter(self.samples) @@ -524,12 +524,12 @@ def __str__(self): def __eq__(self, other): return super(_Breakend, self).__eq__(other) \ - and self.chr == other.chr \ - and self.pos == other.pos \ - and self.remoteOrientation == other.remoteOrientation \ - and self.withinMainAssembly == other.withinMainAssembly \ - and self.orientation == other.orientation \ - and self.connectingSequence == other.connectingSequence + and self.chr == getattr(other, "chr", None) \ + and self.pos == getattr(other, "pos", None) \ + and self.remoteOrientation == getattr(other, "remoteOrientation", None) \ + and self.withinMainAssembly == getattr(other, "withinMainAssembly", None) \ + and self.orientation == getattr(other, "orientation", None) \ + and self.connectingSequence == getattr(other, "connectingSequence", None) class _SingleBreakend(_Breakend): From 1067a503a1e01a1bd49412cab4503097a6397285 Mon Sep 17 00:00:00 2001 From: datagram Date: Fri, 7 Feb 2014 03:21:19 -0800 Subject: [PATCH 5/7] Test data for testing the fix for issue #140 --- vcf/test/issue-140-file1.vcf | 35 +++++++++++++++++++++++++++++++++++ vcf/test/issue-140-file2.vcf | 34 ++++++++++++++++++++++++++++++++++ vcf/test/issue-140-file3.vcf | 25 +++++++++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 vcf/test/issue-140-file1.vcf create mode 100644 vcf/test/issue-140-file2.vcf create mode 100644 vcf/test/issue-140-file3.vcf diff --git a/vcf/test/issue-140-file1.vcf b/vcf/test/issue-140-file1.vcf new file mode 100644 index 0000000..8ee2de2 --- /dev/null +++ b/vcf/test/issue-140-file1.vcf @@ -0,0 +1,35 @@ +##fileformat=VCFv4.1 +##source=VarScan2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL +chr1 10 . G GGT . PASS DP=91;SS=1;SSC=2;GPV=3.0109E-23;SPV=5.8324E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:13:22:62.86%:2,11,1,21 +chr1 20 . GT G . PASS DP=77;SS=1;SSC=2;GPV=2.4504E-29;SPV=6.0772E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:28:5:22:81.48%:0,5,1,21 +chr2 30 . AC A . PASS DP=22;SS=1;SSC=7;GPV=1.3117E-10;SPV=1.9481E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:10:2:8:80%:0,2,0,8 +chr2 40 . AAAC A . PASS DP=42;SS=1;SSC=12;GPV=7.3092E-18;SPV=6.278E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:4:9:69.23%:4,0,9,0 +chr3 50 . TC T . PASS DP=41;SS=1;SSC=2;GPV=9.8874E-23;SPV=5.3659E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:1:21:95.45%:1,0,15,6 +chr10 60 . T TTAA . PASS DP=27;SS=1;SSC=2;GPV=1.4382E-14;SPV=5.5556E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:12:0:12:100%:0,0,0,12 +chr10 70 . C CTG . PASS DP=40;SS=1;SSC=7;GPV=3.6006E-9;SPV=1.9922E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:10:6:4:40%:0,6,0,4 +chr11 80 . AGTT A . PASS DP=86;SS=1;SSC=0;GPV=4.1554E-34;SPV=8.5795E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:32:4:28:87.5%:1,3,0,28 +chr11 90 . GA G . PASS DP=41;SS=1;SSC=3;GPV=1.9197E-12;SPV=4.089E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:15:5:9:64.29%:1,4,0,9 +chr20 100 . TTTTG T . PASS DP=23;SS=1;SSC=1;GPV=2.9149E-12;SPV=6.5217E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:8:0:8:100%:0,0,7,1 +chr20 110 . GA G . PASS DP=83;SS=1;SSC=13;GPV=1E0;SPV=4.0806E-2 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:35:5:28:84.85%:4,1,12,16 +chrX 120 . G GA . PASS DP=61;SS=1;SSC=1;GPV=1.6967E-25;SPV=7.0485E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:3:19:86.36%:0,3,1,18 +chrX 130 . T TAA . PASS DP=19;SS=1;SSC=1;GPV=1.1285E-5;SPV=7.2172E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:8:2:5:71.43%:0,2,0,5 +chrY 140 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrY 150 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 +chrM 160 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrM 170 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 diff --git a/vcf/test/issue-140-file2.vcf b/vcf/test/issue-140-file2.vcf new file mode 100644 index 0000000..7852133 --- /dev/null +++ b/vcf/test/issue-140-file2.vcf @@ -0,0 +1,34 @@ +##fileformat=VCFv4.1 +##source=VarScan2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL +chr1 10 . G GGT . PASS DP=91;SS=1;SSC=2;GPV=3.0109E-23;SPV=5.8324E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:13:22:62.86%:2,11,1,21 +chr1 20 . GT G . PASS DP=77;SS=1;SSC=2;GPV=2.4504E-29;SPV=6.0772E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:28:5:22:81.48%:0,5,1,21 +chr2 30 . AC A . PASS DP=22;SS=1;SSC=7;GPV=1.3117E-10;SPV=1.9481E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:10:2:8:80%:0,2,0,8 +chr2 41 . AAAC A . PASS DP=42;SS=1;SSC=12;GPV=7.3092E-18;SPV=6.278E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:4:9:69.23%:4,0,9,0 +chr10 60 . T TTAA . PASS DP=27;SS=1;SSC=2;GPV=1.4382E-14;SPV=5.5556E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:12:0:12:100%:0,0,0,12 +chr10 70 . C CTG . PASS DP=40;SS=1;SSC=7;GPV=3.6006E-9;SPV=1.9922E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:10:6:4:40%:0,6,0,4 +chr11 80 . AGTT A . PASS DP=86;SS=1;SSC=0;GPV=4.1554E-34;SPV=8.5795E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:32:4:28:87.5%:1,3,0,28 +chr11 91 . GA G . PASS DP=41;SS=1;SSC=3;GPV=1.9197E-12;SPV=4.089E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:15:5:9:64.29%:1,4,0,9 +chr20 100 . TTTTG T . PASS DP=23;SS=1;SSC=1;GPV=2.9149E-12;SPV=6.5217E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:8:0:8:100%:0,0,7,1 +chr20 110 . GA G . PASS DP=83;SS=1;SSC=13;GPV=1E0;SPV=4.0806E-2 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:35:5:28:84.85%:4,1,12,16 +chrX 120 . G GA . PASS DP=61;SS=1;SSC=1;GPV=1.6967E-25;SPV=7.0485E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:3:19:86.36%:0,3,1,18 +chrX 130 . T TAA . PASS DP=19;SS=1;SSC=1;GPV=1.1285E-5;SPV=7.2172E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:8:2:5:71.43%:0,2,0,5 +chrY 140 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrY 149 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 +chrM 160 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrM 170 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 diff --git a/vcf/test/issue-140-file3.vcf b/vcf/test/issue-140-file3.vcf new file mode 100644 index 0000000..754f6b6 --- /dev/null +++ b/vcf/test/issue-140-file3.vcf @@ -0,0 +1,25 @@ +##fileformat=VCFv4.1 +##source=VarScan2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL +chr3 50 . TC T . PASS DP=41;SS=1;SSC=2;GPV=9.8874E-23;SPV=5.3659E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:1:21:95.45%:1,0,15,6 +chr10 60 . T TTAA . PASS DP=27;SS=1;SSC=2;GPV=1.4382E-14;SPV=5.5556E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:12:0:12:100%:0,0,0,12 +chr10 70 . C CTG . PASS DP=40;SS=1;SSC=7;GPV=3.6006E-9;SPV=1.9922E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:10:6:4:40%:0,6,0,4 +chr11 80 . AGTT A . PASS DP=86;SS=1;SSC=0;GPV=4.1554E-34;SPV=8.5795E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:32:4:28:87.5%:1,3,0,28 +chr11 90 . GA G . PASS DP=41;SS=1;SSC=3;GPV=1.9197E-12;SPV=4.089E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:15:5:9:64.29%:1,4,0,9 +chr20 100 . TTTTG T . PASS DP=23;SS=1;SSC=1;GPV=2.9149E-12;SPV=6.5217E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:8:0:8:100%:0,0,7,1 +chrX 120 . G GA . PASS DP=61;SS=1;SSC=1;GPV=1.6967E-25;SPV=7.0485E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:3:19:86.36%:0,3,1,18 From a7da0a561257a7ab18492b7a1c3853331a85a35e Mon Sep 17 00:00:00 2001 From: datagram Date: Fri, 7 Feb 2014 03:24:00 -0800 Subject: [PATCH 6/7] Added tests for walk_together with more complex inputs --- vcf/test/test_vcf.py | 52 +++++++++++++------------------------------- 1 file changed, 15 insertions(+), 37 deletions(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 849c2b8..227c54e 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -934,13 +934,11 @@ def test_walk(self): self.assertEqual(n, 5) # artificial case 2 from the left, 2 from the right, 2 together, 1 from the right, 1 from the left - expected = 'llrrttrl' reader1 = vcf.Reader(fh('walk_left.vcf')) reader2 = vcf.Reader(fh('example-4.0.vcf')) for ex, recs in zip(expected, utils.walk_together(reader1, reader2)): - if ex == 'l': assert recs[0] is not None assert recs[1] is None @@ -950,42 +948,22 @@ def test_walk(self): if ex == 't': assert recs[0] is not None assert recs[1] is not None + + # test files with many chromosomes, set 'vcf_record_sort_key' to define chromosome order + chr_order = map(str, range(1, 30)) + ['X', 'Y', 'M'] + get_key = lambda r: (chr_order.index(r.CHROM.replace('chr','')), r.POS) + reader1 = vcf.Reader(fh('issue-140-file1.vcf')) + reader2 = vcf.Reader(fh('issue-140-file2.vcf')) + reader3 = vcf.Reader(fh('issue-140-file3.vcf')) + expected = "66642577752767662466" # each char is an integer bit flag - like file permissions + for ex, recs in zip(expected, utils.walk_together(reader1, reader2, reader3, vcf_record_sort_key = get_key)): + ex = int(ex) + for i, flag in enumerate([0x4, 0x2, 0x1]): + if ex & flag: + self.assertNotEqual(recs[i], None) + else: + self.assertEqual(recs[i], None) - # case with working custom equality function - - # without custom function, exception should be raised - - reader1 = vcf.Reader(fh('example-4.0.vcf')) - reader2 = vcf.Reader(fh('walk_refcall.vcf')) - #self.assertRaises(AttributeError, next, - # utils.walk_together(reader1, reader2)) - - # with custom function, iteration works - - reader1 = vcf.Reader(fh('example-4.0.vcf')) - reader2 = vcf.Reader(fh('walk_refcall.vcf')) - - def custom_eq(rec1, rec2): - # check for equality only on CHROM, POS, and REF - if rec1 is None or rec2 is None: - return False - return rec1.CHROM == rec2.CHROM and rec1.POS == rec2.POS and \ - rec1.REF == rec2.REF - - nrecs, ncomps = 0, 0 - for x in utils.walk_together(reader1, reader2, eq_func=custom_eq): - self.assertEqual(len(x), 2) - # avoid assert() when one record is None - if x[0] is not None and x[1] is not None: - assert (custom_eq(x[0], x[1]) and custom_eq(x[1], x[0])) - ncomps += 1 - # still increment counter to ensure iteration is finished for all - # records - nrecs += 1 - # check number of records total - self.assertEqual(nrecs, 5) - # check how many records found in all files - self.assertEqual(ncomps, 4) def test_trim(self): tests = [('TAA GAA', 'T G'), From 10d9774ad94de5de4eb4805ffd267c375d001db3 Mon Sep 17 00:00:00 2001 From: datagram Date: Fri, 7 Feb 2014 05:47:43 -0800 Subject: [PATCH 7/7] Added check for consistant chromosome ordering --- vcf/utils.py | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/vcf/utils.py b/vcf/utils.py index 456e5fa..be048b0 100644 --- a/vcf/utils.py +++ b/vcf/utils.py @@ -8,8 +8,6 @@ def walk_together(*readers, **kwargs): genomic position with a variant, return a list of size equal to the number of VCF readers. This list contains the VCF record from readers that have this variant, and None for readers that don't have it. - The caller must make sure that inputs are sorted in the same way and use the - same reference otherwise behaviour is undefined. Args: vcf_record_sort_key: function that takes a VCF record and returns a @@ -19,12 +17,21 @@ def walk_together(*readers, **kwargs): their allele values), and implicitly determines the chromosome ordering since the tuple's 1st element is typically the chromosome name (or calculated from it). + ignore_chr: (default True) 'chrX' and 'X' will be considered equal. Only + used if vcf_record_sort_key is not set. + check_contig_order: (default True) raise an exception if readers have + incompatible contig orderings (eg. 1, 10, 2 .. vs. 1, 2, 10 ..) """ - if 'vcf_record_sort_key' in kwargs: + check_contig_order = kwargs.get('check_contig_order', True) + is_user_defined_key = 'vcf_record_sort_key' in kwargs + if is_user_defined_key: get_key = kwargs['vcf_record_sort_key'] else: - get_key = lambda r: (r.CHROM, r.POS) #, r.REF, r.ALT) - + if kwargs.get('ignore_chr', True): + get_key = lambda r: (r.CHROM.replace('chr', ''), r.POS) + else: + get_key = lambda r: (r.CHROM, r.POS) + nexts = [] for reader in readers: try: @@ -33,6 +40,7 @@ def walk_together(*readers, **kwargs): nexts.append(None) min_k = (None,) # keep track of the previous min key's contig + finished_contigs = [] # used for checking contig order while any([r is not None for r in nexts]): next_idx_to_k = dict( (i, get_key(r)) for i, r in enumerate(nexts) if r is not None) @@ -44,6 +52,25 @@ def walk_together(*readers, **kwargs): else: min_k = min(next_idx_to_k.values()) # move on to next contig + if check_contig_order and len(finished_contigs) > 1: + prev_contig = finished_contigs[-1] + err = None + if is_user_defined_key and prev_contig > min_k[0]: + err = ("Order of '%s', '%s' records in %s (or other VCFs) " + "contradicts the order defined by vcf_record_sort_key") + elif min_k[0] in finished_contigs: + # Don't enforce the default contig ordering, since it may + # not match what's in the VCF files. Just make sure the + # ordering of emitted contigs is monotonic / without repeats + err = ("The order of contigs '%s' and '%s' is ambiguous or " + "contradictory in %s (or other VCFs). To define the " + "contig order, set the 'vcf_record_sort_key' arg.") + if err: + bad_i = [i for i, k in next_idx_to_k.items() if k == min_k] + bad_file = readers[bad_i[0]].filename + raise Exception(err % (prev_contig, min_k[0], bad_file)) + finished_contigs.append(min_k[0]) + min_k_idxs = set([i for i, k in next_idx_to_k.items() if k == min_k]) yield [nexts[i] if i in min_k_idxs else None for i in range(len(nexts))]