Merge pull request #38 from hammerlab/bigbed

bigBed parser
hammerlab · Mar 16, 2015 · 82cbe9f · 82cbe9f
2 parents a2f4298 + a2e43f3
commit 82cbe9f
Show file tree

Hide file tree

Showing 21 changed files with 583 additions and 365 deletions.
diff --git a/package.json b/package.json
@@ -16,11 +16,14 @@
   "dependencies": {
     "backbone": "^1.1.2",
     "d3": "^3.5.5",
+    "jbinary": "^2.1.2",
+    "pako": "^0.2.5",
     "q": "^1.1.2",
     "react": "^0.12.2",
     "underscore": "^1.7.0"
   },
   "devDependencies": {
+    "arraybuffer-slice": "^0.1.2",
     "chai": "^2.0.0",
     "coveralls": "^2.11.2",
     "es5-shim": "^4.1.0",
@@ -43,7 +46,6 @@
     "react-tools": "^0.12.2",
     "reactify": "^1.0.0",
     "sinon": "^1.12.2",
-    "source-map": "^0.3.0",
-    "text-encoding": "^0.5.2"
+    "source-map": "^0.3.0"
   }
 }
diff --git a/src/BigBed.js b/src/BigBed.js
@@ -0,0 +1,202 @@
+/**
+ * Parser for bigBed format.
+ * Based on UCSC's src/inc/bbiFile.h
+ */
+'use strict';
+
+var Q = require('q'),
+    _ = require('underscore'),
+    jBinary = require('jbinary'),
+    pako = require('pako');  // for gzip inflation
+
+
+var RemoteFile = require('./RemoteFile'),
+    Interval = require('./Interval'),
+    ContigInterval = require('./ContigInterval'),
+    utils = require('./utils.js'),
+    bbi = require('./formats/bbi');
+
+
+function parseHeader(buffer) {
+  // TODO: check Endianness using magic. Possibly use jDataView.littleEndian
+  // to flip the endianness for jBinary consumption.
+  // NB: dalliance doesn't support big endian formats.
+  return new jBinary(buffer, bbi.TYPE_SET).read('Header');
+}
+
+// The "CIR" tree contains a mapping from sequence -> block offsets.
+// It stands for "Chromosome Index R tree"
+function parseCirTree(buffer) {
+  return new jBinary(buffer, bbi.TYPE_SET).read('CirTree');
+}
+
+// Extract a map from contig name --> contig ID from the bigBed header.
+function generateContigMap(twoBitHeader): {[key:string]: number} {
+  // Just assume it's a flat "tree" for now.
+  var nodes = twoBitHeader.chromosomeTree.nodes.contents;
+  if (!nodes) {
+    throw 'Invalid chromosome tree';
+  }
+  return _.object(nodes.map(function({id, key}) {
+    // remove trailing nulls from the key string
+    return [key.replace(/\0.*/, ''), id];
+  }));
+}
+
+// Generate the reverse map from contig ID --> contig name.
+function reverseContigMap(contigMap: {[key:string]: number}): Array<string> {
+  var ary = [];
+  _.forEach(contigMap, (index, name) => {
+    ary[index] = name;
+  });
+  return ary;
+}
+
+// Map contig name to contig ID. Leading "chr" is optional.
+function getContigId(contigMap, contig) {
+  if (contig in contigMap) {
+    return contigMap[contig];
+  }
+  var chr = 'chr' + contig;
+  if (chr in contigMap) {
+    return contigMap[chr];
+  }
+  return null;
+}
+
+// Find all blocks containing features which intersect with contigRange.
+function findOverlappingBlocks(twoBitHeader, cirTree, contigRange) {
+  // Do a recursive search through the index tree
+  var matchingBlocks = [];
+  var tupleRange = [[contigRange.contig, contigRange.start()],
+                    [contigRange.contig, contigRange.stop()]];
+  var find = function(node) {
+    if (node.contents) {
+      node.contents.forEach(find);
+    } else {
+      var nodeRange = [[node.startChromIx, node.startBase],
+                       [node.endChromIx, node.endBase]];
+      if (utils.tupleRangeOverlaps(nodeRange, tupleRange)) {
+        matchingBlocks.push(node);
+      }
+    }
+  };
+  find(cirTree.blocks);
+
+  return matchingBlocks;
+}
+
+function extractFeaturesInRange(buffer, dataRange, blocks, contigRange) {
+  return _.flatten(blocks.map(block => {
+    var blockOffset = block.offset - dataRange.start,
+        blockLimit = blockOffset + block.size,
+        // TODO: where does the +2 come from? (I copied it from dalliance)
+        blockBuffer = buffer.slice(blockOffset + 2, blockLimit);
+    // TODO: only inflate if necessary
+    var inflatedBuffer = pako.inflateRaw(new Uint8Array(blockBuffer));
+
+    var jb = new jBinary(inflatedBuffer, bbi.TYPE_SET);
+    // TODO: parse only one BedEntry at a time, as many as is necessary.
+    var beds = jb.read('BedBlock');
+
+    beds = beds.filter(function(bed) {
+      // Note: BED intervals are explicitly half-open.
+      // The "- 1" converts them to closed intervals for ContigInterval.
+      var bedInterval = new ContigInterval(bed.chrId, bed.start, bed.stop - 1);
+      return contigRange.intersects(bedInterval);
+    });
+
+    return beds;
+  }));
+}
+
+// Fetch the relevant blocks from the bigBed file and extract the features
+// which overlap the given range.
+function fetchFeatures(contigRange, header, cirTree, contigMap, remoteFile) {
+  var blocks = findOverlappingBlocks(header, cirTree, contigRange);
+  if (blocks.length == 0) {
+    return [];
+  }
+
+  // Find the range in the file which contains all relevant blocks.
+  // In theory there could be gaps between blocks, but it's hard to see how.
+  var range = Interval.boundingInterval(
+      blocks.map(n => new Interval(+n.offset, n.offset+n.size)));
+
+  return remoteFile.getBytes(range.start, range.length())
+      .then(buffer => {
+        var reverseMap = reverseContigMap(contigMap);
+        var features = extractFeaturesInRange(buffer, range, blocks, contigRange)
+        features.forEach(f => {
+          f.contig = reverseMap[f.chrId];
+          delete f.chrId;
+        });
+        return features;
+      });
+}
+
+
+type BedRow = {
+  // Half-open interval for the BED row: start is inclusive, stop is exclusive.
+  contig: string;
+  start: number;
+  stop: number;
+  // Remaining fields in the BED row as a single string (typically tab-delimited)
+  rest: string;
+}
+
+
+class BigBed {
+  remoteFile: RemoteFile;
+  header: Q.Promise<any>;
+  cirTree: Q.Promise<any>;
+  contigMap: Q.Promise<{[key:string]: number}>;
+
+  /**
+   * Prepare to request features from a remote bigBed file.
+   * The remote source must support HTTP Range headers.
+   * This will kick off several async requests for portions of the file.
+   */
+  constructor(url: string) {
+    this.remoteFile = new RemoteFile(url);
+    this.header = this.remoteFile.getBytes(0, 64*1024).then(parseHeader);
+    this.contigMap = this.header.then(generateContigMap);
+
+    // Next: fetch the block index and parse out the "CIR" tree.
+    this.cirTree = this.header.then(header => {
+      // zoomHeaders[0].dataOffset is the next entry in the file.
+      // We assume the "cirTree" section goes all the way to that point.
+      // Lacking zoom headers, assume it's 4k.
+      // TODO: fetch more than 4k if necessary
+      var start = header.unzoomedIndexOffset,
+          zoomHeader = header.zoomHeaders[0],
+          length = zoomHeader ? zoomHeader.dataOffset - start : 4096;
+      return this.remoteFile.getBytes(start, length).then(parseCirTree);
+    });
+
+    // XXX: are these necessary? what's the right way to propagate errors?
+    this.header.done();
+    this.contigMap.done();
+    this.cirTree.done();
+  }
+
+  /**
+   * Returns all BED entries which overlap the range.
+   * Note: while the requested range is inclusive on both ends, ranges in
+   * bigBed format files are half-open (inclusive at the start, exclusive at
+   * the end).
+   */
+  getFeaturesInRange(contig: string, start: number, stop: number): Q.Promise<Array<BedRow>> {
+    return Q.spread([this.header, this.cirTree, this.contigMap],
+                    (header, cirTree, contigMap) => {
+      var contigIx = getContigId(contigMap, contig);
+      if (contigIx === null) {
+        throw `Invalid contig ${contig}`;
+      }
+      var contigRange = new ContigInterval(contigIx, start, stop);
+      return fetchFeatures(contigRange, header, cirTree, contigMap, this.remoteFile);
+    });
+  }
+}
+
+module.exports = BigBed;
diff --git a/src/Controls.js b/src/Controls.js
@@ -14,8 +14,7 @@ var Controls = React.createClass({
     // XXX: can we be more specific than this with Flow?
     onChange: React.PropTypes.func.isRequired
   },
-  makeRange: function() {
-    // XXX Removing the Number() should lead to type errors, but doesn't.
+  makeRange: function(): GenomeRange {
     return {
       contig: this.refs.contig.getDOMNode().value,
       start: Number(this.refs.start.getDOMNode().value),
@@ -35,8 +34,10 @@ var Controls = React.createClass({
     this.refs.start.getDOMNode().value = r.start;
     this.refs.stop.getDOMNode().value = r.stop;
 
-    var contigIdx = this.props.contigList.indexOf(r.contig);
-    this.refs.contig.getDOMNode().selectedIndex = contigIdx;
+    if (this.props.contigList) {
+      var contigIdx = this.props.contigList.indexOf(r.contig);
+      this.refs.contig.getDOMNode().selectedIndex = contigIdx;
+    }
   },
   render: function(): any {
     var contigOptions = this.props.contigList
@@ -56,7 +57,7 @@ var Controls = React.createClass({
       </form>
     );
   },
-  componentDidUpdate: function(prevProps, prevState) {
+  componentDidUpdate: function(prevProps: Object) {
     if (!_.isEqual(prevProps.range, this.props.range)) {
       this.updateRangeUI();
     }

diff --git a/src/ReadableView.js b/src/ReadableView.js
diff --git a/src/RemoteFile.js b/src/RemoteFile.js
@@ -17,20 +17,22 @@ class RemoteFile {
   url: string;
   fileLength: number;
   chunks: Array<Chunk>;  // regions of file that have already been loaded.
+  numNetworkRequests: number;  // track this for debugging/testing
 
   constructor(url: string) {
     this.url = url;
     this.fileLength = -1;  // unknown
     this.chunks = [];
+    this.numNetworkRequests = 0;
   }
 
-  getBytes(start: number, length: number): Q.Promise<DataView> {
+  getBytes(start: number, length: number): Q.Promise<ArrayBuffer> {
     var stop = start + length;
     // First check the cache.
     for (var i = 0; i < this.chunks.length; i++) {
       var chunk = this.chunks[i];
       if (chunk.start <= start && chunk.stop >= stop) {
-        return Q.when(new DataView(chunk.buffer, start - chunk.start, length));
+        return Q.when(chunk.buffer.slice(start - chunk.start, stop - chunk.start));
       }
     }
 
@@ -40,7 +42,7 @@ class RemoteFile {
     return this.getFromNetwork(start, start + length - 1);
   }
 
-  getFromNetwork(start: number, stop: number): Q.Promise<DataView> {
+  getFromNetwork(start: number, stop: number): Q.Promise<ArrayBuffer> {
     var deferred = Q.defer();
 
     var xhr = new XMLHttpRequest();
@@ -55,10 +57,11 @@ class RemoteFile {
 
       var newChunk = { start, stop: start + buffer.byteLength - 1, buffer };
       remoteFile.chunks.push(newChunk);
-      deferred.resolve(new DataView(buffer));
+      deferred.resolve(buffer);
     };
 
     // TODO: `reject`, `notify` on progress
+    this.numNetworkRequests++;
     xhr.send();
 
     return deferred.promise;