diff --git a/src/SamRead.js b/src/SamRead.js index 0929f29e..f6a91fbc 100644 --- a/src/SamRead.js +++ b/src/SamRead.js @@ -12,6 +12,9 @@ */ 'use strict'; +import type * as ContigInterval from './ContigInterval'; +import type * as VirtualOffset from './VirtualOffset'; + var jDataView = require('jdataview'), jBinary = require('jbinary'), {nullString} = require('./formats/helpers'), @@ -21,31 +24,41 @@ var jDataView = require('jdataview'), class SamRead { buffer: ArrayBuffer; - reader: jDataView; + offset: VirtualOffset; pos: number; refID: number; l_seq: number; - constructor(buffer: ArrayBuffer) { + /** + * buffer contains the raw bytes of the serialized BAM read. It must contain + * at least one full read (but may contain more). + * offset records where this alignment is located in the BAM file. It's + * useful as a unique ID for alignments. + */ + constructor(buffer: ArrayBuffer, offset: VirtualOffset) { this.buffer = buffer; + this.offset = offset; // Go ahead and parse a few fields immediately. - var jv = new jDataView(buffer, 0, buffer.byteLength, true /* little endian */); + var jv = this._getJDataView(); this.refID = jv.getUint32(0); this.pos = jv.getUint32(4); this.l_seq = jv.getUint32(16); - - this.reader = jv; } toString(): string { var stop = this.pos + this.l_seq; - return `${this.refID}:${this.pos}-${stop}`; + return `${this.refID}:${1+this.pos}-${stop}`; + } + + _getJDataView(): jDataView { + var b = this.buffer; + return new jDataView(b, 0, b.byteLength, true /* little endian */); } getName(): string { - var l_read_name = this.reader.getUint8(8); + var l_read_name = this._getJDataView().getUint8(8); var jb = new jBinary(this.buffer, { 'jBinary.littleEndian': true }); diff --git a/src/VirtualOffset.js b/src/VirtualOffset.js index 7c14beeb..569ac1c9 100644 --- a/src/VirtualOffset.js +++ b/src/VirtualOffset.js @@ -42,6 +42,10 @@ class VirtualOffset { compareTo(other: VirtualOffset): number { return this.coffset - other.coffset || this.uoffset - other.uoffset; } + + clone(): VirtualOffset { + return new VirtualOffset(this.coffset, this.uoffset); + } } module.exports = VirtualOffset; diff --git a/src/bam.js b/src/bam.js index 287f194c..e3f374bf 100644 --- a/src/bam.js +++ b/src/bam.js @@ -43,31 +43,66 @@ type Chunk = { chunk_end: VirtualOffset; } +// TODO: import from src/utils.js +type InflatedBlock = { + offset: number; + compressedLength: number; + buffer: ArrayBuffer; +} + var kMaxFetch = 65536 * 2; +// Read a single alignment +function readAlignment(view: jDataView, pos: number, offset: VirtualOffset) { + var readLength = view.getInt32(pos); + pos += 4; + + if (pos + readLength >= view.byteLength) { + return null; + } + + var readSlice = view.buffer.slice(pos, pos + readLength); + + var read = new SamRead(readSlice, offset.clone()); + return { + read, + readLength: 4 + readLength + }; +} // This tracks how many bytes were read. function readAlignmentsToEnd(buffer: ArrayBuffer, idxRange: ContigInterval, contained: boolean, + offset: VirtualOffset, + blocks: InflatedBlock[], alignments: SamRead[]) { // We use jDataView and ArrayBuffer directly for a speedup over jBinary. // This parses reads ~2-3x faster than using ThinAlignment directly. var jv = new jDataView(buffer, 0, buffer.byteLength, true /* little endian */); - var lastStartOffset = 0; var shouldAbort = false; var pos = 0; + offset = offset.clone(); + var blockIndex = 0; try { while (pos < buffer.byteLength) { - var readLength = jv.getInt32(pos); pos += 4; - if (pos + readLength >= buffer.byteLength) break; + var readData = readAlignment(jv, pos, offset); + if (!readData) break; - var readSlice = buffer.slice(pos, pos + readLength); pos += readLength; - var read = new SamRead(readSlice); + var {read, readLength} = readData; + pos += readLength; if (isAlignmentInRange(read, idxRange, contained)) { alignments.push(read); } - lastStartOffset = pos; + + // Advance the VirtualOffset to reflect the new position + offset.uoffset += readLength; + var bufLen = blocks[blockIndex].buffer.byteLength; + if (offset.uoffset >= bufLen) { + offset.uoffset -= bufLen; + offset.coffset += blocks[blockIndex].compressedLength; + blockIndex++; + } // Optimization: if the last alignment started after the requested range, // then no other chunks can possibly contain matching alignments. @@ -90,27 +125,10 @@ function readAlignmentsToEnd(buffer: ArrayBuffer, return { shouldAbort, - lastByteRead: lastStartOffset - 1 + nextOffset: offset }; } -// Given an offset in a concatenated buffer, determine the offset it -// corresponds to in the original buffer. -function splitOffset(buffers: ArrayBuffer[], chunk: Chunk, lastByteRead: number): number { - for (var i = 0; i < buffers.length - 1; i++) { - lastByteRead -= buffers[i].byteLength; - } - if (lastByteRead < 0) { - throw 'Last byte read was not in last chunk'; - } - - if (buffers.length == 1) { - lastByteRead += chunk.chunk_beg.uoffset; - } - - return lastByteRead; -} - // Fetch alignments from the remote source at the locations specified by Chunks. // This can potentially result in many network requests. // The returned promise is fulfilled once it can be proved that no more @@ -148,11 +166,10 @@ function fetchAlignments(remoteFile: RemoteFile, var buffers = blocks.map(x => x.buffer); buffers[0] = buffers[0].slice(chunk.chunk_beg.uoffset); var decomp = utils.concatArrayBuffers(buffers); - var {lastByteRead, shouldAbort} = - readAlignmentsToEnd(decomp, idxRange, contained, alignments); + var {shouldAbort, nextOffset} = + readAlignmentsToEnd(decomp, idxRange, contained, chunk.chunk_beg, blocks, alignments); if (newChunk) { - var lastUOffset = splitOffset(buffers, chunk, lastByteRead); - newChunk.chunk_beg.uoffset = lastUOffset + 1; + newChunk.chunk_beg = nextOffset; } if (shouldAbort) { @@ -209,6 +226,23 @@ class Bam { }); } + /** + * Fetch a single read at the given VirtualOffset. + * This is insanely inefficient and should not be used outside of testing. + */ + readAtOffset(offset: VirtualOffset): Q.Promise { + return this.remoteFile.getBytes(offset.coffset, kMaxFetch).then(gzip => { + var buf = utils.inflateGzip(gzip); + var jv = new jDataView(buf, 0, buf.byteLength, true /* little endian */); + var readData = readAlignment(jv, offset.uoffset, offset); + if (!readData) { + throw `Unable to read alignment at ${offset} in ${this.remoteFile.url}`; + } else { + return readData.read; + } + }); + } + /** * Map a contig name to a contig index. */ diff --git a/test/SamRead-test.js b/test/SamRead-test.js index fdeebe30..0fe3fe63 100644 --- a/test/SamRead-test.js +++ b/test/SamRead-test.js @@ -12,7 +12,8 @@ var RemoteFile = require('../src/RemoteFile'), utils = require('../src/utils'), Bam = require('../src/bam'), bamTypes = require('../src/formats/bamTypes'), - SamRead = require('../src/SamRead'); + SamRead = require('../src/SamRead'), + VirtualOffset = require('../src/VirtualOffset'); describe('SamRead', function() { @@ -25,7 +26,7 @@ describe('SamRead', function() { return jb.read(['array', { block_size: 'int32', contents: ['blob', 'block_size'] - }]).map(block => new SamRead(block.contents)); + }]).map(block => new SamRead(block.contents, new VirtualOffset(0, 0))); }); } diff --git a/test/bam-test.js b/test/bam-test.js index 7b266896..e07afd8f 100644 --- a/test/bam-test.js +++ b/test/bam-test.js @@ -7,7 +7,8 @@ var expect = chai.expect; var Bam = require('../src/bam'), ContigInterval = require('../src/ContigInterval'), RemoteFile = require('../src/RemoteFile'), - MappedRemoteFile = require('./MappedRemoteFile'); + MappedRemoteFile = require('./MappedRemoteFile'), + VirtualOffset = require('../src/VirtualOffset'); describe('BAM', function() { it('should parse BAM files', function(done) { @@ -43,7 +44,6 @@ describe('BAM', function() { expect(aux[0]).to.contain({tag: 'RG', value: 'cow'}); expect(aux[1]).to.contain({tag: 'PG', value: 'bull'}); - // This one has more interesting auxiliary data: // XX:B:S,12561,2,20,112 aux = aligns[2].auxiliary; @@ -62,11 +62,6 @@ describe('BAM', function() { done(); }).done(); }); - - function alignmentRange(alignment) { - var stop = alignment.pos + alignment.l_seq; - return `${alignment.refID}:${1+alignment.pos}-${stop}`; - } // This matches htsjdk's BamFileIndexTest.testSpecificQueries it('should find sequences using an index', function(done) { @@ -77,11 +72,11 @@ describe('BAM', function() { var range = new ContigInterval('chrM', 10400, 10600); bam.getAlignmentsInRange(range, true).then(alignments => { expect(alignments).to.have.length(1); - expect(alignmentRange(alignments[0])).to.equal('0:10427-10477'); + expect(alignments[0].toString()).to.equal('0:10427-10477'); return bam.getAlignmentsInRange(range, false).then(alignments => { expect(alignments).to.have.length(2); - expect(alignmentRange(alignments[0])).to.equal('0:10388-10438'); - expect(alignmentRange(alignments[1])).to.equal('0:10427-10477'); + expect(alignments[0].toString()).to.equal('0:10388-10438'); + expect(alignments[1].toString()).to.equal('0:10427-10477'); done(); }); }).done(); @@ -104,7 +99,7 @@ describe('BAM', function() { bam.getAlignmentsInRange(range).then(reads => { // Note: htsjdk returns contig names like 'chr18', not 18. expect(reads).to.have.length(14); - expect(reads.map(alignmentRange)).to.deep.equal([ + expect(reads.map(r => r.toString())).to.deep.equal([ '18:3653516-3653566', '18:3653591-3653641', '18:4215486-4215536', @@ -130,7 +125,7 @@ describe('BAM', function() { var range = new ContigInterval('chr1', 90002285, 116992285); bam.getAlignmentsInRange(range).then(reads => { expect(reads).to.have.length(92); - expect(reads.slice(0, 5).map(alignmentRange)).to.deep.equal([ + expect(reads.slice(0, 5).map(r => r.toString())).to.deep.equal([ '1:90071452-90071502', '1:90071609-90071659', '1:90622416-90622466', @@ -138,18 +133,31 @@ describe('BAM', function() { '1:91182945-91182995' ]); - expect(reads.slice(-5).map(alignmentRange)).to.deep.equal([ + expect(reads.slice(-5).map(r => r.toString())).to.deep.equal([ '1:115379485-115379535', '1:116045704-116045754', '1:116045758-116045808', '1:116563764-116563814', '1:116563944-116563994' ]); + + // See "should fetch an alignment at a specific offset", below. + expect(reads.slice(-1)[0].offset.toString()).to.equal('28269:2247'); done(); }).done(); }); + it('should fetch an alignment at a specific offset', function(done) { + // This virtual offset matches the one above. + // This verifies that alignments are tagged with the correct offset. + var bam = new Bam(new RemoteFile('/test/data/index_test.bam')); + bam.readAtOffset(new VirtualOffset(28269, 2247)).then(read => { + expect(read.toString()).to.equal('1:116563944-116563994'); + done(); + }).done(); + }); + it('should fetch alignments in a wide interval', function(done) { var bam = new Bam(new RemoteFile('/test/data/index_test.bam'), new RemoteFile('/test/data/index_test.bam.bai')); @@ -182,8 +190,8 @@ describe('BAM', function() { bam.getAlignmentsInRange(range).then(reads => { expect(reads).to.have.length(1114); - expect(alignmentRange(reads[0])).to.equal('19:31511251-31511351'); - expect(alignmentRange(reads[1113])).to.equal('19:31514171-31514271'); + expect(reads[0].toString()).to.equal('19:31511251-31511351'); + expect(reads[1113].toString()).to.equal('19:31514171-31514271'); done(); }).done(); });