Skip to content

Commit

Permalink
Expand BigBed tests & cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
danvk committed Mar 16, 2015
1 parent e156fa3 commit a2e43f3
Show file tree
Hide file tree
Showing 9 changed files with 134 additions and 126 deletions.
114 changes: 71 additions & 43 deletions src/BigBed.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,16 @@ function parseHeader(buffer) {
// TODO: check Endianness using magic. Possibly use jDataView.littleEndian
// to flip the endianness for jBinary consumption.
// NB: dalliance doesn't support big endian formats.
var jb = new jBinary(buffer, bbi.TYPE_SET);
var header = jb.read('Header');

return header;
return new jBinary(buffer, bbi.TYPE_SET).read('Header');
}

// The "CIR" tree contains a mapping from sequence -> block offsets.
// It stands for "Chromosome Index R tree"
function parseCirTree(buffer) {
var jb = new jBinary(buffer, bbi.TYPE_SET);
var cirTree = jb.read('CirTree');

return cirTree;
return new jBinary(buffer, bbi.TYPE_SET).read('CirTree');
}

// Extract a map from contig name --> contig ID from the bigBed header.
function generateContigMap(twoBitHeader): {[key:string]: number} {
// Just assume it's a flat "tree" for now.
var nodes = twoBitHeader.chromosomeTree.nodes.contents;
Expand All @@ -46,6 +43,16 @@ function generateContigMap(twoBitHeader): {[key:string]: number} {
}));
}

// Generate the reverse map from contig ID --> contig name.
function reverseContigMap(contigMap: {[key:string]: number}): Array<string> {
var ary = [];
_.forEach(contigMap, (index, name) => {
ary[index] = name;
});
return ary;
}

// Map contig name to contig ID. Leading "chr" is optional.
function getContigId(contigMap, contig) {
if (contig in contigMap) {
return contigMap[contig];
Expand All @@ -57,15 +64,7 @@ function getContigId(contigMap, contig) {
return null;
}

function reverseContigMap(contigMap: {[key:string]: number}): Array<string> {
var ary = [];
_.forEach(contigMap, (index, name) => {
ary[index] = name;
});
return ary;
}

// Get all blocks in the file containing features which intersect with contigRange.
// Find all blocks containing features which intersect with contigRange.
function findOverlappingBlocks(twoBitHeader, cirTree, contigRange) {
// Do a recursive search through the index tree
var matchingBlocks = [];
Expand Down Expand Up @@ -101,32 +100,74 @@ function extractFeaturesInRange(buffer, dataRange, blocks, contigRange) {
var beds = jb.read('BedBlock');

beds = beds.filter(function(bed) {
var bedInterval = new ContigInterval(bed.chrId, bed.start, bed.stop);
var r = contigRange.intersects(bedInterval);
return r;
// Note: BED intervals are explicitly half-open.
// The "- 1" converts them to closed intervals for ContigInterval.
var bedInterval = new ContigInterval(bed.chrId, bed.start, bed.stop - 1);
return contigRange.intersects(bedInterval);
});

return beds;
}));
}

// Fetch the relevant blocks from the bigBed file and extract the features
// which overlap the given range.
function fetchFeatures(contigRange, header, cirTree, contigMap, remoteFile) {
var blocks = findOverlappingBlocks(header, cirTree, contigRange);
if (blocks.length == 0) {
return [];
}

// Find the range in the file which contains all relevant blocks.
// In theory there could be gaps between blocks, but it's hard to see how.
var range = Interval.boundingInterval(
blocks.map(n => new Interval(+n.offset, n.offset+n.size)));

return remoteFile.getBytes(range.start, range.length())
.then(buffer => {
var reverseMap = reverseContigMap(contigMap);
var features = extractFeaturesInRange(buffer, range, blocks, contigRange)
features.forEach(f => {
f.contig = reverseMap[f.chrId];
delete f.chrId;
});
return features;
});
}


type BedRow = {
// Half-open interval for the BED row.
contig: string;
start: number;
stop: number;
// Remaining fields in the BED row (typically tab-delimited)
rest: string;
}


class BigBed {
remoteFile: RemoteFile;
header: Q.Promise<any>;
cirTree: Q.Promise<any>;
contigMap: Q.Promise<{[key:string]: number}>;

/**
* Prepare to request features from a remote bigBed file.
* The remote source must support HTTP Range headers.
* This will kick off several async requests for portions of the file.
*/
constructor(url: string) {
this.remoteFile = new RemoteFile(url);
this.header = this.remoteFile.getBytes(0, 64*1024).then(parseHeader);
this.contigMap = this.header.then(generateContigMap);

// Next: fetch [header.unzoomedIndexOffset, zoomHeaders[0].dataOffset] and parse
// the "CIR" tree.
// Next: fetch the block index and parse out the "CIR" tree.
this.cirTree = this.header.then(header => {
// zoomHeaders[0].dataOffset is the next entry in the file.
// We assume the "cirTree" section goes all the way to that point.
// Lacking zoom headers, assume it's 4k.
// TODO: fetch more than 4k if necessary
var start = header.unzoomedIndexOffset,
zoomHeader = header.zoomHeaders[0],
length = zoomHeader ? zoomHeader.dataOffset - start : 4096;
Expand All @@ -139,34 +180,21 @@ class BigBed {
this.cirTree.done();
}

// Returns all BED entries which overlap the range.
// TODO: factor logic out into a helper
getFeaturesInRange(contig: string, start: number, stop: number): Q.Promise<any> {
/**
* Returns all BED entries which overlap the range.
* Note: while the requested range is inclusive on both ends, ranges in
* bigBed format files are half-open (inclusive at the start, exclusive at
* the end).
*/
getFeaturesInRange(contig: string, start: number, stop: number): Q.Promise<Array<BedRow>> {
return Q.spread([this.header, this.cirTree, this.contigMap],
(header, cirTree, contigMap) => {
var contigIx = getContigId(contigMap, contig);
if (contigIx === null) {
throw `Invalid contig ${contig}`;
}
var contigRange = new ContigInterval(contigIx, start, stop);

var blocks = findOverlappingBlocks(header, cirTree, contigRange);
if (blocks.length == 0) {
return [];
}

var range = Interval.boundingInterval(
blocks.map(n => new Interval(+n.offset, n.offset+n.size)));
return this.remoteFile.getBytes(range.start, range.length())
.then(buffer => {
var reverseMap = reverseContigMap(contigMap);
var features = extractFeaturesInRange(buffer, range, blocks, contigRange)
features.forEach(f => {
f.contig = reverseMap[f.chrId];
delete f.chrId;
});
return features;
});
return fetchFeatures(contigRange, header, cirTree, contigMap, this.remoteFile);
});
}
}
Expand Down
11 changes: 6 additions & 5 deletions src/Controls.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ var Controls = React.createClass({
// XXX: can we be more specific than this with Flow?
onChange: React.PropTypes.func.isRequired
},
makeRange: function() {
// XXX Removing the Number() should lead to type errors, but doesn't.
makeRange: function(): GenomeRange {
return {
contig: this.refs.contig.getDOMNode().value,
start: Number(this.refs.start.getDOMNode().value),
Expand All @@ -35,8 +34,10 @@ var Controls = React.createClass({
this.refs.start.getDOMNode().value = r.start;
this.refs.stop.getDOMNode().value = r.stop;

var contigIdx = this.props.contigList.indexOf(r.contig);
this.refs.contig.getDOMNode().selectedIndex = contigIdx;
if (this.props.contigList) {
var contigIdx = this.props.contigList.indexOf(r.contig);
this.refs.contig.getDOMNode().selectedIndex = contigIdx;
}
},
render: function(): any {
var contigOptions = this.props.contigList
Expand All @@ -56,7 +57,7 @@ var Controls = React.createClass({
</form>
);
},
componentDidUpdate: function(prevProps, prevState) {
componentDidUpdate: function(prevProps: Object) {
if (!_.isEqual(prevProps.range, this.props.range)) {
this.updateRangeUI();
}
Expand Down
1 change: 0 additions & 1 deletion src/TwoBit.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ type SequenceRecord = {

type TwoBitHeader = {
sequenceCount: number;
reserved: number;
sequences: Array<FileIndexEntry>;
}

Expand Down
1 change: 1 addition & 0 deletions src/formats/helpers.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*/
var jBinary = require('jbinary');

// Read a jBinary type at an offset in the buffer specified by another field.
function typeAtOffset(typeName, offsetFieldName) {
return jBinary.Template({
baseType: typeName,
Expand Down
1 change: 0 additions & 1 deletion src/formats/twoBitTypes.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ var TYPE_SET = {
'Header': {
magic: ['const', 'uint32', 0x1A412743, true],
version: ['const', 'uint32', 0, true],

sequenceCount: 'uint32',
reserved: 'uint32',

Expand Down
6 changes: 0 additions & 6 deletions src/main.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
/* @flow */
var React = require('react'),
TwoBit = require('./TwoBit'),
BigBed = require('./BigBed'),
Root = require('./Root'),
createTwoBitDataSource = require('./TwoBitDataSource');

Expand All @@ -24,8 +23,3 @@ genome.getFeaturesInRange('chr1', 123000, 124000).done();

var root = React.render(<Root referenceSource={dataSource} />,
document.getElementById('root'));

var ensembl = new BigBed('/ensGene.bb');

window.ensembl = ensembl;
window.genome = genome;
62 changes: 54 additions & 8 deletions test/BigBed-test.js
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
// Things to test:
// - getFeatures which return no features
// - getFeatures which crosses a block boundary
// - getFeatures which crosses a contig boundary (not currently possible)

var chai = require('chai');
var expect = chai.expect;
var assert = chai.assert;

var Q = require('q');
var BigBed = require('../src/BigBed');

describe('BigBed', function() {
function getTestBigBed() {
// This file was generated using UCSC tools:
// cd kent/src/utils/bedToBigBed/tests; make
// This file is compressed, little endian and contains autoSQL.
// It is compressed, little endian, has autoSQL and two blocks.
return new BigBed('/test/data/itemRgb.bb');
}

Expand All @@ -22,8 +18,9 @@ describe('BigBed', function() {

bb.getFeaturesInRange('chrX', 151077036, 151078532)
.then(features => {
// chrX 151077031 151078198 MID_BLUE 0 - 151077031 151078198 0,0,128
// chrX 151078198 151079365 VIOLET_RED1 0 - 151078198 151079365 255,62,150
// Here's what these two lines in the file look like:
// chrX 151077031 151078198 MID_BLUE 0 - 151077031 151078198 0,0,128
// chrX 151078198 151079365 VIOLET_RED1 0 - 151078198 151079365 255,62,150
expect(features).to.have.length(2);
expect(features[0].contig).to.equal('chrX');
expect(features[0].start).to.equal(151077031);
Expand All @@ -47,4 +44,53 @@ describe('BigBed', function() {
})
.done();
});

it('should have inclusive ranges', function(done) {
// The matches looks like this:
// chrX 151071196 151072363 RED
// chrX 151094536 151095703 PeachPuff
var red = [151071196, 151072362], // note: stop is inclusive
peachpuff = [151094536, 151095702];

var bb = getTestBigBed();
var expectN = n => features => {
expect(features).to.have.length(n);
};

Q.all([
// request for precisely one row from the file.
bb.getFeaturesInRange('chrX', red[0], red[1])
.then(expectN(1)),
// the additional base in the range hits another row.
bb.getFeaturesInRange('chrX', red[0], 1 + red[1])
.then(expectN(2)),
// this overlaps exactly one base pair of the first feature.
bb.getFeaturesInRange('chrX', red[0] - 1000, red[0])
.then(expectN(1)),
// but this range ends one base pair before it.
bb.getFeaturesInRange('chrX', red[0] - 1000, red[0] - 1)
.then(expectN(0))
]).then(() => {
done();
}).done();
});

it('should add "chr" to contig names', function(done) {
var bb = getTestBigBed();

bb.getFeaturesInRange('X', 151077036, 151078532)
.then(features => {
// (same as 'should extract features in a range' test)
expect(features).to.have.length(2);
expect(features[0].contig).to.equal('chrX');
expect(features[1].contig).to.equal('chrX');
done();
})
.done();
});

// Things left to test:
// - getFeatures which crosses a block boundary
// - uncompressed bigBed file.
});

Loading

0 comments on commit a2e43f3

Please sign in to comment.