Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Attach VirtualOffsets to SamReads #75

Merged
merged 2 commits into from
Apr 17, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions src/SamRead.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
*/
'use strict';

import type * as ContigInterval from './ContigInterval';
import type * as VirtualOffset from './VirtualOffset';

var jDataView = require('jdataview'),
jBinary = require('jbinary'),
{nullString} = require('./formats/helpers'),
Expand All @@ -21,31 +24,41 @@ var jDataView = require('jdataview'),

class SamRead {
buffer: ArrayBuffer;
reader: jDataView;
offset: VirtualOffset;

pos: number;
refID: number;
l_seq: number;

constructor(buffer: ArrayBuffer) {
/**
* buffer contains the raw bytes of the serialized BAM read. It must contain
* at least one full read (but may contain more).
* offset records where this alignment is located in the BAM file. It's
* useful as a unique ID for alignments.
*/
constructor(buffer: ArrayBuffer, offset: VirtualOffset) {
this.buffer = buffer;
this.offset = offset;

// Go ahead and parse a few fields immediately.
var jv = new jDataView(buffer, 0, buffer.byteLength, true /* little endian */);
var jv = this._getJDataView();
this.refID = jv.getUint32(0);
this.pos = jv.getUint32(4);
this.l_seq = jv.getUint32(16);

this.reader = jv;
}

toString(): string {
var stop = this.pos + this.l_seq;
return `${this.refID}:${this.pos}-${stop}`;
return `${this.refID}:${1+this.pos}-${stop}`;
}

_getJDataView(): jDataView {
var b = this.buffer;
return new jDataView(b, 0, b.byteLength, true /* little endian */);
}

getName(): string {
var l_read_name = this.reader.getUint8(8);
var l_read_name = this._getJDataView().getUint8(8);
var jb = new jBinary(this.buffer, {
'jBinary.littleEndian': true
});
Expand Down
4 changes: 4 additions & 0 deletions src/VirtualOffset.js
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ class VirtualOffset {
compareTo(other: VirtualOffset): number {
return this.coffset - other.coffset || this.uoffset - other.uoffset;
}

clone(): VirtualOffset {
return new VirtualOffset(this.coffset, this.uoffset);
}
}

module.exports = VirtualOffset;
90 changes: 62 additions & 28 deletions src/bam.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,31 +43,66 @@ type Chunk = {
chunk_end: VirtualOffset;
}

// TODO: import from src/utils.js
type InflatedBlock = {
offset: number;
compressedLength: number;
buffer: ArrayBuffer;
}

var kMaxFetch = 65536 * 2;

// Read a single alignment
function readAlignment(view: jDataView, pos: number, offset: VirtualOffset) {
var readLength = view.getInt32(pos);
pos += 4;

if (pos + readLength >= view.byteLength) {
return null;
}

var readSlice = view.buffer.slice(pos, pos + readLength);

var read = new SamRead(readSlice, offset.clone());
return {
read,
readLength: 4 + readLength
};
}

// This tracks how many bytes were read.
function readAlignmentsToEnd(buffer: ArrayBuffer,
idxRange: ContigInterval<number>,
contained: boolean,
offset: VirtualOffset,
blocks: InflatedBlock[],
alignments: SamRead[]) {
// We use jDataView and ArrayBuffer directly for a speedup over jBinary.
// This parses reads ~2-3x faster than using ThinAlignment directly.
var jv = new jDataView(buffer, 0, buffer.byteLength, true /* little endian */);
var lastStartOffset = 0;
var shouldAbort = false;
var pos = 0;
offset = offset.clone();
var blockIndex = 0;
try {
while (pos < buffer.byteLength) {
var readLength = jv.getInt32(pos); pos += 4;
if (pos + readLength >= buffer.byteLength) break;
var readData = readAlignment(jv, pos, offset);
if (!readData) break;

var readSlice = buffer.slice(pos, pos + readLength); pos += readLength;
var read = new SamRead(readSlice);
var {read, readLength} = readData;
pos += readLength;
if (isAlignmentInRange(read, idxRange, contained)) {
alignments.push(read);
}
lastStartOffset = pos;

// Advance the VirtualOffset to reflect the new position
offset.uoffset += readLength;
var bufLen = blocks[blockIndex].buffer.byteLength;
if (offset.uoffset >= bufLen) {
offset.uoffset -= bufLen;
offset.coffset += blocks[blockIndex].compressedLength;
blockIndex++;
}

// Optimization: if the last alignment started after the requested range,
// then no other chunks can possibly contain matching alignments.
Expand All @@ -90,27 +125,10 @@ function readAlignmentsToEnd(buffer: ArrayBuffer,

return {
shouldAbort,
lastByteRead: lastStartOffset - 1
nextOffset: offset
};
}

// Given an offset in a concatenated buffer, determine the offset it
// corresponds to in the original buffer.
function splitOffset(buffers: ArrayBuffer[], chunk: Chunk, lastByteRead: number): number {
for (var i = 0; i < buffers.length - 1; i++) {
lastByteRead -= buffers[i].byteLength;
}
if (lastByteRead < 0) {
throw 'Last byte read was not in last chunk';
}

if (buffers.length == 1) {
lastByteRead += chunk.chunk_beg.uoffset;
}

return lastByteRead;
}

// Fetch alignments from the remote source at the locations specified by Chunks.
// This can potentially result in many network requests.
// The returned promise is fulfilled once it can be proved that no more
Expand Down Expand Up @@ -148,11 +166,10 @@ function fetchAlignments(remoteFile: RemoteFile,
var buffers = blocks.map(x => x.buffer);
buffers[0] = buffers[0].slice(chunk.chunk_beg.uoffset);
var decomp = utils.concatArrayBuffers(buffers);
var {lastByteRead, shouldAbort} =
readAlignmentsToEnd(decomp, idxRange, contained, alignments);
var {shouldAbort, nextOffset} =
readAlignmentsToEnd(decomp, idxRange, contained, chunk.chunk_beg, blocks, alignments);
if (newChunk) {
var lastUOffset = splitOffset(buffers, chunk, lastByteRead);
newChunk.chunk_beg.uoffset = lastUOffset + 1;
newChunk.chunk_beg = nextOffset;
}

if (shouldAbort) {
Expand Down Expand Up @@ -209,6 +226,23 @@ class Bam {
});
}

/**
* Fetch a single read at the given VirtualOffset.
* This is insanely inefficient and should not be used outside of testing.
*/
readAtOffset(offset: VirtualOffset): Q.Promise<SamRead> {
return this.remoteFile.getBytes(offset.coffset, kMaxFetch).then(gzip => {
var buf = utils.inflateGzip(gzip);
var jv = new jDataView(buf, 0, buf.byteLength, true /* little endian */);
var readData = readAlignment(jv, offset.uoffset, offset);
if (!readData) {
throw `Unable to read alignment at ${offset} in ${this.remoteFile.url}`;
} else {
return readData.read;
}
});
}

/**
* Map a contig name to a contig index.
*/
Expand Down
5 changes: 3 additions & 2 deletions test/SamRead-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ var RemoteFile = require('../src/RemoteFile'),
utils = require('../src/utils'),
Bam = require('../src/bam'),
bamTypes = require('../src/formats/bamTypes'),
SamRead = require('../src/SamRead');
SamRead = require('../src/SamRead'),
VirtualOffset = require('../src/VirtualOffset');

describe('SamRead', function() {

Expand All @@ -25,7 +26,7 @@ describe('SamRead', function() {
return jb.read(['array', {
block_size: 'int32',
contents: ['blob', 'block_size']
}]).map(block => new SamRead(block.contents));
}]).map(block => new SamRead(block.contents, new VirtualOffset(0, 0)));
});
}

Expand Down
38 changes: 23 additions & 15 deletions test/bam-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ var expect = chai.expect;
var Bam = require('../src/bam'),
ContigInterval = require('../src/ContigInterval'),
RemoteFile = require('../src/RemoteFile'),
MappedRemoteFile = require('./MappedRemoteFile');
MappedRemoteFile = require('./MappedRemoteFile'),
VirtualOffset = require('../src/VirtualOffset');

describe('BAM', function() {
it('should parse BAM files', function(done) {
Expand Down Expand Up @@ -43,7 +44,6 @@ describe('BAM', function() {
expect(aux[0]).to.contain({tag: 'RG', value: 'cow'});
expect(aux[1]).to.contain({tag: 'PG', value: 'bull'});


// This one has more interesting auxiliary data:
// XX:B:S,12561,2,20,112
aux = aligns[2].auxiliary;
Expand All @@ -62,11 +62,6 @@ describe('BAM', function() {
done();
}).done();
});

function alignmentRange(alignment) {
var stop = alignment.pos + alignment.l_seq;
return `${alignment.refID}:${1+alignment.pos}-${stop}`;
}

// This matches htsjdk's BamFileIndexTest.testSpecificQueries
it('should find sequences using an index', function(done) {
Expand All @@ -77,11 +72,11 @@ describe('BAM', function() {
var range = new ContigInterval('chrM', 10400, 10600);
bam.getAlignmentsInRange(range, true).then(alignments => {
expect(alignments).to.have.length(1);
expect(alignmentRange(alignments[0])).to.equal('0:10427-10477');
expect(alignments[0].toString()).to.equal('0:10427-10477');
return bam.getAlignmentsInRange(range, false).then(alignments => {
expect(alignments).to.have.length(2);
expect(alignmentRange(alignments[0])).to.equal('0:10388-10438');
expect(alignmentRange(alignments[1])).to.equal('0:10427-10477');
expect(alignments[0].toString()).to.equal('0:10388-10438');
expect(alignments[1].toString()).to.equal('0:10427-10477');
done();
});
}).done();
Expand All @@ -104,7 +99,7 @@ describe('BAM', function() {
bam.getAlignmentsInRange(range).then(reads => {
// Note: htsjdk returns contig names like 'chr18', not 18.
expect(reads).to.have.length(14);
expect(reads.map(alignmentRange)).to.deep.equal([
expect(reads.map(r => r.toString())).to.deep.equal([
'18:3653516-3653566',
'18:3653591-3653641',
'18:4215486-4215536',
Expand All @@ -130,26 +125,39 @@ describe('BAM', function() {
var range = new ContigInterval('chr1', 90002285, 116992285);
bam.getAlignmentsInRange(range).then(reads => {
expect(reads).to.have.length(92);
expect(reads.slice(0, 5).map(alignmentRange)).to.deep.equal([
expect(reads.slice(0, 5).map(r => r.toString())).to.deep.equal([
'1:90071452-90071502',
'1:90071609-90071659',
'1:90622416-90622466',
'1:90622572-90622622',
'1:91182945-91182995'
]);

expect(reads.slice(-5).map(alignmentRange)).to.deep.equal([
expect(reads.slice(-5).map(r => r.toString())).to.deep.equal([
'1:115379485-115379535',
'1:116045704-116045754',
'1:116045758-116045808',
'1:116563764-116563814',
'1:116563944-116563994'
]);

// See "should fetch an alignment at a specific offset", below.
expect(reads.slice(-1)[0].offset.toString()).to.equal('28269:2247');

done();
}).done();
});

it('should fetch an alignment at a specific offset', function(done) {
// This virtual offset matches the one above.
// This verifies that alignments are tagged with the correct offset.
var bam = new Bam(new RemoteFile('/test/data/index_test.bam'));
bam.readAtOffset(new VirtualOffset(28269, 2247)).then(read => {
expect(read.toString()).to.equal('1:116563944-116563994');
done();
}).done();
});

it('should fetch alignments in a wide interval', function(done) {
var bam = new Bam(new RemoteFile('/test/data/index_test.bam'),
new RemoteFile('/test/data/index_test.bam.bai'));
Expand Down Expand Up @@ -182,8 +190,8 @@ describe('BAM', function() {

bam.getAlignmentsInRange(range).then(reads => {
expect(reads).to.have.length(1114);
expect(alignmentRange(reads[0])).to.equal('19:31511251-31511351');
expect(alignmentRange(reads[1113])).to.equal('19:31514171-31514271');
expect(reads[0].toString()).to.equal('19:31511251-31511351');
expect(reads[1113].toString()).to.equal('19:31514171-31514271');
done();
}).done();
});
Expand Down