diff --git a/vcf/model.py b/vcf/model.py index 09013dd..f23353c 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -36,9 +36,9 @@ def __eq__(self, other): """ Two _Calls are equal if their _Records are equal and the samples and ``gt_type``s are the same """ - return (self.site == other.site - and self.sample == other.sample - and self.gt_type == other.gt_type) + return (self.site == getattr(other, "site", None) + and self.sample == getattr(other, "sample", None) + and self.gt_type == getattr(other, "gt_type", None)) def __getstate__(self): return dict((attr, getattr(self, attr)) for attr in self.__slots__) @@ -150,19 +150,19 @@ def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, # For Python 2 def __cmp__(self, other): - return cmp((self.CHROM, self.POS), (other.CHROM, other.POS)) + return cmp((self.CHROM, self.POS), (getattr(other, "CHROM", None), getattr(other, "POS", None))) # For Python 3 def __eq__(self, other): """ _Records are equal if they describe the same variant (same position, alleles) """ - return (self.CHROM == other.CHROM and - self.POS == other.POS and - self.REF == other.REF and - self.ALT == other.ALT) + return (self.CHROM == getattr(other, "CHROM", None) and + self.POS == getattr(other, "POS", None) and + self.REF == getattr(other, "REF", None) and + self.ALT == getattr(other, "ALT", None)) # For Python 3 def __lt__(self, other): - return (self.CHROM, self.POS) < (other.CHROM, other.POS) + return (self.CHROM, self.POS) < (getattr(other, "CHROM", None), getattr(other, "POS", None)) def __iter__(self): return iter(self.samples) @@ -447,7 +447,7 @@ def __str__(self): raise NotImplementedError def __eq__(self, other): - return self.type == other.type + return self.type == getattr(other, 'type', None) class _Substitution(_AltRecord): @@ -524,12 +524,12 @@ def __str__(self): def __eq__(self, other): return super(_Breakend, self).__eq__(other) \ - and self.chr == other.chr \ - and self.pos == other.pos \ - and self.remoteOrientation == other.remoteOrientation \ - and self.withinMainAssembly == other.withinMainAssembly \ - and self.orientation == other.orientation \ - and self.connectingSequence == other.connectingSequence + and self.chr == getattr(other, "chr", None) \ + and self.pos == getattr(other, "pos", None) \ + and self.remoteOrientation == getattr(other, "remoteOrientation", None) \ + and self.withinMainAssembly == getattr(other, "withinMainAssembly", None) \ + and self.orientation == getattr(other, "orientation", None) \ + and self.connectingSequence == getattr(other, "connectingSequence", None) class _SingleBreakend(_Breakend): diff --git a/vcf/test/issue-140-file1.vcf b/vcf/test/issue-140-file1.vcf new file mode 100644 index 0000000..8ee2de2 --- /dev/null +++ b/vcf/test/issue-140-file1.vcf @@ -0,0 +1,35 @@ +##fileformat=VCFv4.1 +##source=VarScan2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL +chr1 10 . G GGT . PASS DP=91;SS=1;SSC=2;GPV=3.0109E-23;SPV=5.8324E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:13:22:62.86%:2,11,1,21 +chr1 20 . GT G . PASS DP=77;SS=1;SSC=2;GPV=2.4504E-29;SPV=6.0772E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:28:5:22:81.48%:0,5,1,21 +chr2 30 . AC A . PASS DP=22;SS=1;SSC=7;GPV=1.3117E-10;SPV=1.9481E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:10:2:8:80%:0,2,0,8 +chr2 40 . AAAC A . PASS DP=42;SS=1;SSC=12;GPV=7.3092E-18;SPV=6.278E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:4:9:69.23%:4,0,9,0 +chr3 50 . TC T . PASS DP=41;SS=1;SSC=2;GPV=9.8874E-23;SPV=5.3659E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:1:21:95.45%:1,0,15,6 +chr10 60 . T TTAA . PASS DP=27;SS=1;SSC=2;GPV=1.4382E-14;SPV=5.5556E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:12:0:12:100%:0,0,0,12 +chr10 70 . C CTG . PASS DP=40;SS=1;SSC=7;GPV=3.6006E-9;SPV=1.9922E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:10:6:4:40%:0,6,0,4 +chr11 80 . AGTT A . PASS DP=86;SS=1;SSC=0;GPV=4.1554E-34;SPV=8.5795E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:32:4:28:87.5%:1,3,0,28 +chr11 90 . GA G . PASS DP=41;SS=1;SSC=3;GPV=1.9197E-12;SPV=4.089E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:15:5:9:64.29%:1,4,0,9 +chr20 100 . TTTTG T . PASS DP=23;SS=1;SSC=1;GPV=2.9149E-12;SPV=6.5217E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:8:0:8:100%:0,0,7,1 +chr20 110 . GA G . PASS DP=83;SS=1;SSC=13;GPV=1E0;SPV=4.0806E-2 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:35:5:28:84.85%:4,1,12,16 +chrX 120 . G GA . PASS DP=61;SS=1;SSC=1;GPV=1.6967E-25;SPV=7.0485E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:3:19:86.36%:0,3,1,18 +chrX 130 . T TAA . PASS DP=19;SS=1;SSC=1;GPV=1.1285E-5;SPV=7.2172E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:8:2:5:71.43%:0,2,0,5 +chrY 140 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrY 150 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 +chrM 160 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrM 170 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 diff --git a/vcf/test/issue-140-file2.vcf b/vcf/test/issue-140-file2.vcf new file mode 100644 index 0000000..7852133 --- /dev/null +++ b/vcf/test/issue-140-file2.vcf @@ -0,0 +1,34 @@ +##fileformat=VCFv4.1 +##source=VarScan2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL +chr1 10 . G GGT . PASS DP=91;SS=1;SSC=2;GPV=3.0109E-23;SPV=5.8324E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:13:22:62.86%:2,11,1,21 +chr1 20 . GT G . PASS DP=77;SS=1;SSC=2;GPV=2.4504E-29;SPV=6.0772E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:28:5:22:81.48%:0,5,1,21 +chr2 30 . AC A . PASS DP=22;SS=1;SSC=7;GPV=1.3117E-10;SPV=1.9481E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:10:2:8:80%:0,2,0,8 +chr2 41 . AAAC A . PASS DP=42;SS=1;SSC=12;GPV=7.3092E-18;SPV=6.278E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:4:9:69.23%:4,0,9,0 +chr10 60 . T TTAA . PASS DP=27;SS=1;SSC=2;GPV=1.4382E-14;SPV=5.5556E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:12:0:12:100%:0,0,0,12 +chr10 70 . C CTG . PASS DP=40;SS=1;SSC=7;GPV=3.6006E-9;SPV=1.9922E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:10:6:4:40%:0,6,0,4 +chr11 80 . AGTT A . PASS DP=86;SS=1;SSC=0;GPV=4.1554E-34;SPV=8.5795E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:32:4:28:87.5%:1,3,0,28 +chr11 91 . GA G . PASS DP=41;SS=1;SSC=3;GPV=1.9197E-12;SPV=4.089E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:15:5:9:64.29%:1,4,0,9 +chr20 100 . TTTTG T . PASS DP=23;SS=1;SSC=1;GPV=2.9149E-12;SPV=6.5217E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:8:0:8:100%:0,0,7,1 +chr20 110 . GA G . PASS DP=83;SS=1;SSC=13;GPV=1E0;SPV=4.0806E-2 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:35:5:28:84.85%:4,1,12,16 +chrX 120 . G GA . PASS DP=61;SS=1;SSC=1;GPV=1.6967E-25;SPV=7.0485E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:3:19:86.36%:0,3,1,18 +chrX 130 . T TAA . PASS DP=19;SS=1;SSC=1;GPV=1.1285E-5;SPV=7.2172E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:8:2:5:71.43%:0,2,0,5 +chrY 140 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrY 149 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 +chrM 160 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrM 170 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 diff --git a/vcf/test/issue-140-file3.vcf b/vcf/test/issue-140-file3.vcf new file mode 100644 index 0000000..754f6b6 --- /dev/null +++ b/vcf/test/issue-140-file3.vcf @@ -0,0 +1,25 @@ +##fileformat=VCFv4.1 +##source=VarScan2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL +chr3 50 . TC T . PASS DP=41;SS=1;SSC=2;GPV=9.8874E-23;SPV=5.3659E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:1:21:95.45%:1,0,15,6 +chr10 60 . T TTAA . PASS DP=27;SS=1;SSC=2;GPV=1.4382E-14;SPV=5.5556E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:12:0:12:100%:0,0,0,12 +chr10 70 . C CTG . PASS DP=40;SS=1;SSC=7;GPV=3.6006E-9;SPV=1.9922E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:10:6:4:40%:0,6,0,4 +chr11 80 . AGTT A . PASS DP=86;SS=1;SSC=0;GPV=4.1554E-34;SPV=8.5795E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:32:4:28:87.5%:1,3,0,28 +chr11 90 . GA G . PASS DP=41;SS=1;SSC=3;GPV=1.9197E-12;SPV=4.089E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:15:5:9:64.29%:1,4,0,9 +chr20 100 . TTTTG T . PASS DP=23;SS=1;SSC=1;GPV=2.9149E-12;SPV=6.5217E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:8:0:8:100%:0,0,7,1 +chrX 120 . G GA . PASS DP=61;SS=1;SSC=1;GPV=1.6967E-25;SPV=7.0485E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:3:19:86.36%:0,3,1,18 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 8fcd2a8..227c54e 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -20,7 +20,7 @@ class TestVcfSpecs(unittest.TestCase): def test_vcf_4_0(self): reader = vcf.Reader(fh('example-4.0.vcf')) - assert reader.metadata['fileformat'] == 'VCFv4.0' + self.assertEqual(reader.metadata['fileformat'], 'VCFv4.0') # test we can walk the file at least for r in reader: @@ -81,21 +81,21 @@ def test_vcf_4_1_bnd(self): for a in r.ALT: print(a) if r.ID == "bnd1": - assert len(r.ALT) == 1 - assert r.ALT[0].type == "BND" - assert r.ALT[0].chr == "2" - assert r.ALT[0].pos == 3 - assert r.ALT[0].orientation == False - assert r.ALT[0].remoteOrientation == True - assert r.ALT[0].connectingSequence == "T" + self.assertEqual(len(r.ALT), 1) + self.assertEqual(r.ALT[0].type, "BND") + self.assertEqual(r.ALT[0].chr, "2") + self.assertEqual(r.ALT[0].pos, 3) + self.assertEqual(r.ALT[0].orientation, False) + self.assertEqual(r.ALT[0].remoteOrientation, True) + self.assertEqual(r.ALT[0].connectingSequence, "T") if r.ID == "bnd4": - assert len(r.ALT) == 1 - assert r.ALT[0].type == "BND" - assert r.ALT[0].chr == "1" - assert r.ALT[0].pos == 2 - assert r.ALT[0].orientation == True - assert r.ALT[0].remoteOrientation == False - assert r.ALT[0].connectingSequence == "G" + self.assertEqual(len(r.ALT), 1) + self.assertEqual(r.ALT[0].type, "BND") + self.assertEqual(r.ALT[0].chr, "1") + self.assertEqual(r.ALT[0].pos, 2) + self.assertEqual(r.ALT[0].orientation, True) + self.assertEqual(r.ALT[0].remoteOrientation, False) + self.assertEqual(r.ALT[0].connectingSequence, "G") for c in r: print(c) assert c @@ -165,7 +165,7 @@ def testParse(self): n+=1 for x in r: assert x - assert n == self.n_calls + self.assertEqual(n, self.n_calls) class TestSamtoolsOutput(unittest.TestCase): @@ -728,7 +728,7 @@ def test_info_multiple_values(self): def test_pickle(self): reader = vcf.Reader(fh('example-4.0.vcf')) for var in reader: - assert cPickle.loads(cPickle.dumps(var)) == var + self.assertEqual(cPickle.loads(cPickle.dumps(var)), var) class TestCall(unittest.TestCase): @@ -809,7 +809,7 @@ def testFetchSite(self): if not self.run: return site = self.reader.fetch('20', 14370) - assert site.POS == 14370 + self.assertEqual(site.POS, 14370) site = self.reader.fetch('20', 14369) assert site is None @@ -851,7 +851,7 @@ def testApplyFilter(self): return s, out = commands.getstatusoutput('python scripts/vcf_filter.py --site-quality 30 test/example-4.0.vcf sq') #print(out) - assert s == 0 + self.assertEqual(s, 0) buf = StringIO() buf.write(out) buf.seek(0) @@ -873,7 +873,7 @@ def testApplyFilter(self): n += 1 else: assert 'sq30' not in r.FILTER - assert n == 2 + self.assertEqual(n, 2) def testApplyMultipleFilters(self): @@ -881,7 +881,7 @@ def testApplyMultipleFilters(self): return s, out = commands.getstatusoutput('python scripts/vcf_filter.py --site-quality 30 ' '--genotype-quality 50 test/example-4.0.vcf sq mgq') - assert s == 0 + self.assertEqual(s, 0) #print(out) buf = StringIO() buf.write(out) @@ -927,19 +927,18 @@ def test_walk(self): n = 0 for x in utils.walk_together(reader1, reader2, reader3): - assert len(x) == 3 - assert (x[0] == x[1]) and (x[1] == x[2]) + self.assertEqual(len(x), 3) + self.assertEqual(x[0], x[1]) + self.assertEqual(x[1], x[2]) n+= 1 - assert n == 5 + self.assertEqual(n, 5) # artificial case 2 from the left, 2 from the right, 2 together, 1 from the right, 1 from the left - expected = 'llrrttrl' reader1 = vcf.Reader(fh('walk_left.vcf')) reader2 = vcf.Reader(fh('example-4.0.vcf')) for ex, recs in zip(expected, utils.walk_together(reader1, reader2)): - if ex == 'l': assert recs[0] is not None assert recs[1] is None @@ -949,42 +948,22 @@ def test_walk(self): if ex == 't': assert recs[0] is not None assert recs[1] is not None + + # test files with many chromosomes, set 'vcf_record_sort_key' to define chromosome order + chr_order = map(str, range(1, 30)) + ['X', 'Y', 'M'] + get_key = lambda r: (chr_order.index(r.CHROM.replace('chr','')), r.POS) + reader1 = vcf.Reader(fh('issue-140-file1.vcf')) + reader2 = vcf.Reader(fh('issue-140-file2.vcf')) + reader3 = vcf.Reader(fh('issue-140-file3.vcf')) + expected = "66642577752767662466" # each char is an integer bit flag - like file permissions + for ex, recs in zip(expected, utils.walk_together(reader1, reader2, reader3, vcf_record_sort_key = get_key)): + ex = int(ex) + for i, flag in enumerate([0x4, 0x2, 0x1]): + if ex & flag: + self.assertNotEqual(recs[i], None) + else: + self.assertEqual(recs[i], None) - # case with working custom equality function - - # without custom function, exception should be raised - - reader1 = vcf.Reader(fh('example-4.0.vcf')) - reader2 = vcf.Reader(fh('walk_refcall.vcf')) - self.assertRaises(AttributeError, next, - utils.walk_together(reader1, reader2)) - - # with custom function, iteration works - - reader1 = vcf.Reader(fh('example-4.0.vcf')) - reader2 = vcf.Reader(fh('walk_refcall.vcf')) - - def custom_eq(rec1, rec2): - # check for equality only on CHROM, POS, and REF - if rec1 is None or rec2 is None: - return False - return rec1.CHROM == rec2.CHROM and rec1.POS == rec2.POS and \ - rec1.REF == rec2.REF - - nrecs, ncomps = 0, 0 - for x in utils.walk_together(reader1, reader2, eq_func=custom_eq): - assert len(x) == 2 - # avoid assert() when one record is None - if x[0] is not None and x[1] is not None: - assert (custom_eq(x[0], x[1]) and custom_eq(x[1], x[0])) - ncomps += 1 - # still increment counter to ensure iteration is finished for all - # records - nrecs += 1 - # check number of records total - assert nrecs == 5 - # check how many records found in all files - assert ncomps == 4 def test_trim(self): tests = [('TAA GAA', 'T G'), @@ -1005,8 +984,8 @@ def test_meta(self): # expect no exceptions raised reader = vcf.Reader(fh('gatk_26_meta.vcf')) assert 'GATKCommandLine' in reader.metadata - assert reader.metadata['GATKCommandLine'][0]['CommandLineOptions'] == '"analysis_type=LeftAlignAndTrimVariants"' - assert reader.metadata['GATKCommandLine'][1]['CommandLineOptions'] == '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"' + self.assertEqual(reader.metadata['GATKCommandLine'][0]['CommandLineOptions'], '"analysis_type=LeftAlignAndTrimVariants"') + self.assertEqual(reader.metadata['GATKCommandLine'][1]['CommandLineOptions'], '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"') suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestVcfSpecs)) diff --git a/vcf/utils.py b/vcf/utils.py index 0ab04ca..be048b0 100644 --- a/vcf/utils.py +++ b/vcf/utils.py @@ -2,28 +2,36 @@ Utilities for VCF files. """ -import operator - - def walk_together(*readers, **kwargs): - """ Simultaneously iteratate two or more VCF readers and return - lists of concurrent records from each - reader, with None if no record present. Caller must check the - inputs are sorted in the same way and use the same reference - otherwise behaviour is undefined. """ - # if defined, custom equality functions must take the same arguments - # as operator.eq - if 'eq_func' in kwargs: - eq_func = kwargs['eq_func'] - # by default, we use the equality operator (==), which compares - # equality in CHROM, POS, REF, and ALT - else: - eq_func = operator.eq + Simultaneously iteratate over two or more VCF readers. For each + genomic position with a variant, return a list of size equal to the number + of VCF readers. This list contains the VCF record from readers that have + this variant, and None for readers that don't have it. - # if one of the VCFs has no records, StopIteration is - # raised immediately, so we need to check for that and - # deal appropriately + Args: + vcf_record_sort_key: function that takes a VCF record and returns a + tuple that can be used as a key for comparing and sorting VCF + records across all readers. This tuple defines what it means for two + variants to be equal (eg. whether it's only their position or also + their allele values), and implicitly determines the chromosome + ordering since the tuple's 1st element is typically the chromosome + name (or calculated from it). + ignore_chr: (default True) 'chrX' and 'X' will be considered equal. Only + used if vcf_record_sort_key is not set. + check_contig_order: (default True) raise an exception if readers have + incompatible contig orderings (eg. 1, 10, 2 .. vs. 1, 2, 10 ..) + """ + check_contig_order = kwargs.get('check_contig_order', True) + is_user_defined_key = 'vcf_record_sort_key' in kwargs + if is_user_defined_key: + get_key = kwargs['vcf_record_sort_key'] + else: + if kwargs.get('ignore_chr', True): + get_key = lambda r: (r.CHROM.replace('chr', ''), r.POS) + else: + get_key = lambda r: (r.CHROM, r.POS) + nexts = [] for reader in readers: try: @@ -31,22 +39,46 @@ def walk_together(*readers, **kwargs): except StopIteration: nexts.append(None) - while True: - min_next = min([x for x in nexts if x is not None]) + min_k = (None,) # keep track of the previous min key's contig + finished_contigs = [] # used for checking contig order + while any([r is not None for r in nexts]): + next_idx_to_k = dict( + (i, get_key(r)) for i, r in enumerate(nexts) if r is not None) + keys_with_prev_contig = [ + k for k in next_idx_to_k.values() if k[0] == min_k[0]] - yield [x if x is None or eq_func(x, min_next) else None for x in nexts] + if any(keys_with_prev_contig): + min_k = min(keys_with_prev_contig) # finish previous contig + else: + min_k = min(next_idx_to_k.values()) # move on to next contig - # update nexts that we just yielded - for i, n in enumerate(nexts): + if check_contig_order and len(finished_contigs) > 1: + prev_contig = finished_contigs[-1] + err = None + if is_user_defined_key and prev_contig > min_k[0]: + err = ("Order of '%s', '%s' records in %s (or other VCFs) " + "contradicts the order defined by vcf_record_sort_key") + elif min_k[0] in finished_contigs: + # Don't enforce the default contig ordering, since it may + # not match what's in the VCF files. Just make sure the + # ordering of emitted contigs is monotonic / without repeats + err = ("The order of contigs '%s' and '%s' is ambiguous or " + "contradictory in %s (or other VCFs). To define the " + "contig order, set the 'vcf_record_sort_key' arg.") + if err: + bad_i = [i for i, k in next_idx_to_k.items() if k == min_k] + bad_file = readers[bad_i[0]].filename + raise Exception(err % (prev_contig, min_k[0], bad_file)) + finished_contigs.append(min_k[0]) - if n is not None and eq_func(n, min_next): - try: - nexts[i] = readers[i].next() - except StopIteration: - nexts[i] = None + min_k_idxs = set([i for i, k in next_idx_to_k.items() if k == min_k]) + yield [nexts[i] if i in min_k_idxs else None for i in range(len(nexts))] - if all([x is None for x in nexts]): - break + for i in min_k_idxs: + try: + nexts[i] = readers[i].next() + except StopIteration: + nexts[i] = None def trim_common_suffix(*sequences):