Skip to content

Commit

Permalink
Merge cefd848 into 3b069cb
Browse files Browse the repository at this point in the history
  • Loading branch information
danvk committed Apr 16, 2015
2 parents 3b069cb + cefd848 commit 0ce6ab0
Show file tree
Hide file tree
Showing 5 changed files with 224 additions and 65 deletions.
62 changes: 62 additions & 0 deletions src/SamRead.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/**
* This class parses and represents a single read in a SAM/BAM file.
*
* This is used instead of parsing directly via jBinary in order to:
* - Make the parsing lazy (for significant performance wins)
* - Make the resulting object more precisely typed.
*
* Parsing reads using SamRead is ~2-3x faster than using jBinary and
* ThinAlignment directly.
*
* @flow
*/
'use strict';

var jDataView = require('jdataview'),
jBinary = require('jbinary'),
{nullString} = require('./formats/helpers'),
bamTypes = require('./formats/bamTypes');

// TODO: Make more extensive use of the jBinary specs.

class SamRead {
buffer: ArrayBuffer;
reader: jDataView;

pos: number;
refID: number;
l_seq: number;

constructor(buffer: ArrayBuffer) {
this.buffer = buffer;

// Go ahead and parse a few fields immediately.
var jv = new jDataView(buffer, 0, buffer.byteLength, true /* little endian */);
this.refID = jv.getUint32(0);
this.pos = jv.getUint32(4);
this.l_seq = jv.getUint32(16);

this.reader = jv;
}

toString(): string {
var stop = this.pos + this.l_seq;
return `${this.refID}:${this.pos}-${stop}`;
}

getName(): string {
var l_read_name = this.reader.getUint8(8);
var jb = new jBinary(this.buffer, {
'jBinary.littleEndian': true
});
return jb.read([nullString, l_read_name], 32);
}

// TODO: get rid of this; move all methods into SamRead.
getFull(): Object {
var jb = new jBinary(this.buffer, bamTypes.TYPE_SET);
return jb.read(bamTypes.ThickAlignment, 0);
}
}

module.exports = SamRead;
86 changes: 49 additions & 37 deletions src/bam.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,33 +8,32 @@
import type * as RemoteFile from './RemoteFile';

var jBinary = require('jbinary'),
jDataView = require('jdataview'),
_ = require('underscore'),
Q = require('q');

var bamTypes = require('./formats/bamTypes'),
utils = require('./utils'),
BaiFile = require('./bai'),
ContigInterval = require('./ContigInterval'),
VirtualOffset = require('./VirtualOffset');
VirtualOffset = require('./VirtualOffset'),
SamRead = require('./SamRead');


/**
* Filter a list of alignments down to just those which overlap the range.
* The 'contained' parameter controls whether the alignments must be fully
* contained within the range, or need only overlap it.
*/
function filterAlignments(alignments: Object[],
idxRange: ContigInterval<number>,
contained: boolean): Object[] {
return alignments.filter(read => {
// TODO: Use cigar.getReferenceLength() instead of l_seq, like htsjdk.
var readRange = new ContigInterval(read.refID, read.pos, read.pos + read.l_seq - 1);
if (contained) {
return idxRange.containsInterval(readRange);
} else {
return readRange.intersects(idxRange);
}
});
function isAlignmentInRange(read: SamRead,
idxRange: ContigInterval<number>,
contained: boolean): boolean {
// TODO: Use cigar.getReferenceLength() instead of l_seq, like htsjdk.
var readRange = new ContigInterval(read.refID, read.pos, read.pos + read.l_seq - 1);
if (contained) {
return idxRange.containsInterval(readRange);
} else {
return readRange.intersects(idxRange);
}
}


Expand All @@ -48,16 +47,37 @@ var kMaxFetch = 65536 * 2;


// This tracks how many bytes were read.
function readAlignmentsToEnd(buffer: ArrayBuffer) {
var jb = new jBinary(buffer, bamTypes.TYPE_SET);
var alignments = [];
function readAlignmentsToEnd(buffer: ArrayBuffer,
idxRange: ContigInterval<number>,
contained: boolean,
alignments: SamRead[]) {
// We use jDataView and ArrayBuffer directly for a speedup over jBinary.
// This parses reads ~2-3x faster than using ThinAlignment directly.
var jv = new jDataView(buffer, 0, buffer.byteLength, true /* little endian */);
var lastStartOffset = 0;
var shouldAbort = false;
var pos = 0;
try {
while (jb.tell() < buffer.byteLength) {
var alignment = jb.read('ThinBamAlignment');
if (!alignment) break;
alignments.push(alignment.contents);
lastStartOffset = jb.tell();
while (pos < buffer.byteLength) {
var readLength = jv.getInt32(pos); pos += 4;
if (pos + readLength >= buffer.byteLength) break;

var readSlice = buffer.slice(pos, pos + readLength); pos += readLength;
var read = new SamRead(readSlice);
if (isAlignmentInRange(read, idxRange, contained)) {
alignments.push(read);
}
lastStartOffset = pos;

// Optimization: if the last alignment started after the requested range,
// then no other chunks can possibly contain matching alignments.
// TODO: use contigInterval.isAfterInterval when that's possible.
var range = new ContigInterval(read.refID, read.pos, read.pos + 1);
if (range.contig > idxRange.contig ||
(range.contig == idxRange.contig && range.start() > idxRange.stop())) {
shouldAbort = true;
break;
}
}
// Code gets here if the compression block ended exactly at the end of
// an Alignment.
Expand All @@ -69,7 +89,7 @@ function readAlignmentsToEnd(buffer: ArrayBuffer) {
}

return {
alignments,
shouldAbort,
lastByteRead: lastStartOffset - 1
};
}
Expand Down Expand Up @@ -99,7 +119,7 @@ function fetchAlignments(remoteFile: RemoteFile,
idxRange: ContigInterval<number>,
contained: boolean,
chunks: Chunk[],
alignments: Object[]): Q.Promise<Object[]> {
alignments: SamRead[]): Q.Promise<SamRead[]> {
if (chunks.length === 0) {
return Q.when(alignments);
}
Expand Down Expand Up @@ -128,22 +148,14 @@ function fetchAlignments(remoteFile: RemoteFile,
var buffers = blocks.map(x => x.buffer);
buffers[0] = buffers[0].slice(chunk.chunk_beg.uoffset);
var decomp = utils.concatArrayBuffers(buffers);
var {alignments: newAlignments, lastByteRead} = readAlignmentsToEnd(decomp);
var {lastByteRead, shouldAbort} =
readAlignmentsToEnd(decomp, idxRange, contained, alignments);
if (newChunk) {
var lastUOffset = splitOffset(buffers, chunk, lastByteRead);
newChunk.chunk_beg.uoffset = lastUOffset + 1;
}
alignments = alignments.concat(
filterAlignments(newAlignments, idxRange, contained));

// Optimization: if the last alignment started after the requested range,
// then no other chunks can possibly contain matching alignments.
var lastAlignment = newAlignments[newAlignments.length - 1],
lastStart = lastAlignment.pos,
lastRange = new ContigInterval(lastAlignment.refID, lastStart, lastStart + 1);
// TODO: use contigInterval.isAfterInterval when that's possible.
if (lastRange.contig > idxRange.contig ||
(lastRange.contig == idxRange.contig && lastRange.start() > idxRange.stop())) {

if (shouldAbort) {
return Q.when(alignments);
} else {
return fetchAlignments(remoteFile,
Expand Down Expand Up @@ -217,7 +229,7 @@ class Bam {
* The 'contained' parameter controls whether the alignments must be fully
* contained within the range, or need only overlap it.
*/
getAlignmentsInRange(range: ContigInterval<string>, contained?: boolean): Q.Promise<Object[]> {
getAlignmentsInRange(range: ContigInterval<string>, contained?: boolean): Q.Promise<SamRead[]> {
contained = contained || false;
if (!this.index) {
throw 'Range searches are only supported on BAMs with BAI indices.';
Expand Down
57 changes: 30 additions & 27 deletions src/formats/bamTypes.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,36 @@ var CIGAR_OPS = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X'];
// Core alignment fields shared between BamAlignment and ThinBamAlignment.
// TODO: figure out why jBinary's 'extend' type doesn't work with this in TYPE_SET.
var ThinAlignment = {
refID: 'int32',
pos: 'int32',
l_read_name: 'uint8',
MAPQ: 'uint8',
bin: 'uint16',
n_cigar_op: 'uint16',
FLAG: 'uint16',
l_seq: 'int32',
next_refID: 'int32',
next_pos: 'int32',
tlen: 'int32'
refID: 'int32', // 0
pos: 'int32', // 4
l_read_name: 'uint8', // 8
MAPQ: 'uint8', // 9
bin: 'uint16', // 10
n_cigar_op: 'uint16', // 12
FLAG: 'uint16', // 14
l_seq: 'int32', // 16
next_refID: 'int32', // 20
next_pos: 'int32', // 24
tlen: 'int32' // 28
// length of fixed-size header = 32 bytes
};

var ThickAlignment = _.extend({}, ThinAlignment, {
read_name: [nullString, 'l_read_name'],
cigar: ['array', 'CigarOp', 'n_cigar_op'],
seq: ['FourBitSequence', 'l_seq'],
qual: ['array', 'uint8', 'l_seq'], // 255 = unknown
auxiliary: ['array', {
tag: ['string', 2],
val_type: 'char',
value: ['if', ctx => ctx.val_type == 'B', {
val_type: 'char',
num_values: 'int32',
values: ['array', 'AuxiliaryValue', 'num_values']
}, 'AuxiliaryValue']
}] // goes until the end of the block
});

var TYPE_SET: any = {
'jBinary.littleEndian': true,

Expand All @@ -49,21 +66,7 @@ var TYPE_SET: any = {

'BamAlignment': {
block_size: 'int32',
contents: [sizedBlock, _.extend({}, ThinAlignment, {
read_name: [nullString, 'l_read_name'],
cigar: ['array', 'CigarOp', 'n_cigar_op'],
seq: ['FourBitSequence', 'l_seq'],
qual: ['array', 'uint8', 'l_seq'], // 255 = unknown
auxiliary: ['array', {
tag: ['string', 2],
val_type: 'char',
value: ['if', ctx => ctx.val_type == 'B', {
val_type: 'char',
num_values: 'int32',
values: ['array', 'AuxiliaryValue', 'num_values']
}, 'AuxiliaryValue']
}] // goes until the end of the block
}), 'block_size']
contents: [sizedBlock, ThickAlignment, 'block_size']
},

'ThinBamAlignment': {
Expand Down Expand Up @@ -177,4 +180,4 @@ var TYPE_SET: any = {
};


module.exports = {TYPE_SET};
module.exports = {TYPE_SET, ThinAlignment, ThickAlignment};
81 changes: 81 additions & 0 deletions test/SamRead-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/* @flow */
'use strict';

import type * as Q from 'q';

var chai = require('chai');
var expect = chai.expect;

var jBinary = require('jbinary');

var RemoteFile = require('../src/RemoteFile'),
utils = require('../src/utils'),
Bam = require('../src/bam'),
bamTypes = require('../src/formats/bamTypes'),
SamRead = require('../src/SamRead');

describe('SamRead', function() {

function getSamArray(url): Q.Promise<SamRead[]> {
var file = new RemoteFile(url);
return file.getAll().then(gzipBuffer => {
var buf = utils.inflateGzip(gzipBuffer);
var jb = new jBinary(buf, bamTypes.TYPE_SET);
jb.read('BamHeader'); // skip past the header to get to alignments.
return jb.read(['array', {
block_size: 'int32',
contents: ['blob', 'block_size']
}]).map(block => new SamRead(block.contents));
});
}

var testReads = getSamArray('/test/data/test_input_1_a.bam');

// This is more of a test for the test than for SamRead.
it('should pull records from a BAM file', function(done) {
testReads.then(reads => {
expect(reads).to.have.length(15);
done();
}).done();
});

it('should parse BAM records', function(done) {
testReads.then(reads => {
// The first record in test_input_1_a.sam is:
// r000 99 insert 50 30 10M = 80 30 ATTTAGCTAC AAAAAAAAAA RG:Z:cow PG:Z:bull
var read = reads[0];
expect(read.getName()).to.equal('r000');
// expect(read.FLAG).to.equal(99);
expect(read.refID).to.equal(0);
expect(read.pos).to.equal(49); // 0-based
expect(read.l_seq).to.equal(10);
// expect(refs[r000.refID].name).to.equal('insert');

done();
}).done();
});

it('should read thick records', function(done) {
testReads.then(reads => {
// This mirrors the "BAM > should parse BAM files" test.
var r000 = reads[0].getFull();
expect(r000.read_name).to.equal('r000');
expect(r000.FLAG).to.equal(99);
expect(r000.refID).to.equal(0);
// .. POS
expect(r000.MAPQ).to.equal(30);
expect(Bam.makeCigarString(r000.cigar)).to.equal('10M');
// next ref
// next pos
expect(r000.tlen).to.equal(30);
expect(r000.seq).to.equal('ATTTAGCTAC');
expect(Bam.makeAsciiPhred(r000.qual)).to.equal('AAAAAAAAAA');

var aux = r000.auxiliary;
expect(aux).to.have.length(2);
expect(aux[0]).to.contain({tag: 'RG', value: 'cow'});
expect(aux[1]).to.contain({tag: 'PG', value: 'bull'});
done();
}).done();
});
});
3 changes: 2 additions & 1 deletion test/bam-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ describe('BAM', function() {
expect(aux[3]).to.contain({tag: 'PG', value: 'colt'});

// This one has a more interesting Cigar string
expect(Bam.makeCigarString(aligns[3].cigar)).to.equal('1S2I6M1P1I1P1I4M2I');
expect(Bam.makeCigarString(aligns[3].cigar))
.to.equal('1S2I6M1P1I1P1I4M2I');

// - one with a more interesting Phred string
done();
Expand Down

0 comments on commit 0ce6ab0

Please sign in to comment.