From 2b99b138dae01b09ff649cf27b49e8915853ce88 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 3 Nov 2021 14:46:52 +0000 Subject: [PATCH] Fix indexing bug with "placed" unmapped reads. Unmapped-but-placed (having REF/POS) reads are not included in the index. Hence if an placed unmapped is the first record in a bin, then it may not be returned. Note most aligners write out mapped followed by unmapped which does not trigger this problem. The SAM spec states that all unmapped placed reads should be considered as having an alignment length of 1. While it doesn't seem to explicitly state these must therefore be in the index, it does imply it. It appears that picard also indexes placed reads in this manner. Originally reported by John Marshall. Fixes #1142 --- hts.c | 14 ++++++-------- test/index2.sam | 11 +++++++++++ test/test.pl | 25 +++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 test/index2.sam diff --git a/hts.c b/hts.c index 194fb1f7aa..39bb17cb0e 100644 --- a/hts.c +++ b/hts.c @@ -2384,14 +2384,12 @@ int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t if ( tid>=0 ) { if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin); - if (is_mapped) { - // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin - if (beg < 0) beg = 0; - if (end <= 0) end = 1; - // idx->z.last_off points to the start of the current record - if (insert_to_l(&idx->lidx[tid], beg, end, - idx->z.last_off, idx->min_shift) < 0) return -1; - } + // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin + if (beg < 0) beg = 0; + if (end <= 0) end = 1; + // idx->z.last_off points to the start of the current record + if (insert_to_l(&idx->lidx[tid], beg, end, + idx->z.last_off, idx->min_shift) < 0) return -1; } else idx->n_no_coor++; bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls); diff --git a/test/index2.sam b/test/index2.sam new file mode 100644 index 0000000000..97d39e65bc --- /dev/null +++ b/test/index2.sam @@ -0,0 +1,11 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:1 LN:249250621 M5:1b22b98cdeb4a9304cb5d48026a85128 +@SQ SN:2 LN:243199373 M5:a0d9851da00400dec1098a9255ac712e +um1 69 1 1000000 0 * * 0 0 AAAAAAAAAA * +um1 137 1 1000000 44 10M * 0 0 AAAAAAAAAA * +um2 69 1 2000000 0 * * 0 0 AAAAAAAAAA * +um2 137 1 2000000 44 10M * 0 0 AAAAAAAAAA * +mu1 137 2 1000000 44 10M * 0 0 AAAAAAAAAA * +mu1 69 2 1000000 0 * * 0 0 AAAAAAAAAA * +mu2 137 2 2000000 44 10M * 0 0 AAAAAAAAAA * +mu2 69 2 2000000 0 * * 0 0 AAAAAAAAAA * diff --git a/test/test.pl b/test/test.pl index 5ba6022b41..3b7bf5be83 100755 --- a/test/test.pl +++ b/test/test.pl @@ -840,6 +840,31 @@ sub test_index $wtmp =~ s/\//\\\\/g; } test_cmd($opts,out=>'tabix.out',cmd=>"$$opts{bin}/tabix $wtmp/index.vcf.gz##idx##$wtmp/index.vcf.gz.tbi 1:10000060-10000060"); + + cmd("$$opts{path}/test_view $$opts{path}/index2.sam -b -p $$opts{tmp}/index2.bam -x $$opts{tmp}/index2.bam.bai"); + for (my $tid = 1; $tid <= 2; $tid++) { + for (my $pos = 1; $pos <= 2; $pos++) { + # All queries should return exactly two sequences. + # The input data consists of mapped/unmapped and unmapped/mapped + # in both orders. + # Done verbatim as test_cmd cannot return $out for us to check. + my $test = "$$opts{path}/test_view $$opts{tmp}/index2.bam $tid:${pos}000000-${pos}000000"; + print "test_index:\n\t$test\n"; + my ($ret, $out) = _cmd($test); + if ($ret ne 0) { + failed($opts, $test); + } else { + my $rnum = ($out =~ s/^[^@].*\n//gm); + if ($rnum ne 2) { + failed($opts, $test); + } else { + passed($opts, $test); + } + } + } + } + unlink("$$opts{tmp}/index2.bam"); + unlink("$$opts{tmp}/index2.bam.bai"); } sub test_bcf2vcf