Merge e8ace77 into a2f4298

hammerlab · Mar 14, 2015 · bdef8e1 · bdef8e1
2 parents a2f4298 + e8ace77
commit bdef8e1
Show file tree

Hide file tree

Showing 21 changed files with 567 additions and 357 deletions.
diff --git a/package.json b/package.json
@@ -16,11 +16,14 @@
   "dependencies": {
     "backbone": "^1.1.2",
     "d3": "^3.5.5",
+    "jbinary": "^2.1.2",
+    "pako": "^0.2.5",
     "q": "^1.1.2",
     "react": "^0.12.2",
     "underscore": "^1.7.0"
   },
   "devDependencies": {
+    "arraybuffer-slice": "^0.1.2",
     "chai": "^2.0.0",
     "coveralls": "^2.11.2",
     "es5-shim": "^4.1.0",
@@ -43,7 +46,6 @@
     "react-tools": "^0.12.2",
     "reactify": "^1.0.0",
     "sinon": "^1.12.2",
-    "source-map": "^0.3.0",
-    "text-encoding": "^0.5.2"
+    "source-map": "^0.3.0"
   }
 }
diff --git a/src/BigBed.js b/src/BigBed.js
@@ -0,0 +1,174 @@
+/**
+ * Parser for bigBed format.
+ * Based on UCSC's src/inc/bbiFile.h
+ */
+'use strict';
+
+var Q = require('q'),
+    _ = require('underscore'),
+    jBinary = require('jbinary'),
+    pako = require('pako');  // for gzip inflation
+
+
+var RemoteFile = require('./RemoteFile'),
+    Interval = require('./Interval'),
+    ContigInterval = require('./ContigInterval'),
+    utils = require('./utils.js'),
+    bbi = require('./formats/bbi');
+
+
+function parseHeader(buffer) {
+  // TODO: check Endianness using magic. Possibly use jDataView.littleEndian
+  // to flip the endianness for jBinary consumption.
+  // NB: dalliance doesn't support big endian formats.
+  var jb = new jBinary(buffer, bbi.TYPE_SET);
+  var header = jb.read('Header');
+
+  return header;
+}
+
+function parseCirTree(buffer) {
+  var jb = new jBinary(buffer, bbi.TYPE_SET);
+  var cirTree = jb.read('CirTree');
+
+  return cirTree;
+}
+
+function generateContigMap(twoBitHeader): {[key:string]: number} {
+  // Just assume it's a flat "tree" for now.
+  var nodes = twoBitHeader.chromosomeTree.nodes.contents;
+  if (!nodes) {
+    throw 'Invalid chromosome tree';
+  }
+  return _.object(nodes.map(function({id, key}) {
+    // remove trailing nulls from the key string
+    return [key.replace(/\0.*/, ''), id];
+  }));
+}
+
+function getContigId(contigMap, contig) {
+  if (contig in contigMap) {
+    return contigMap[contig];
+  }
+  var chr = 'chr' + contig;
+  if (chr in contigMap) {
+    return contigMap[chr];
+  }
+  return null;
+}
+
+function reverseContigMap(contigMap: {[key:string]: number}): Array<string> {
+  var ary = [];
+  _.forEach(contigMap, (index, name) => {
+    ary[index] = name;
+  });
+  return ary;
+}
+
+// Get all blocks in the file containing features which intersect with contigRange.
+function findOverlappingBlocks(twoBitHeader, cirTree, contigRange) {
+  // Do a recursive search through the index tree
+  var matchingBlocks = [];
+  var tupleRange = [[contigRange.contig, contigRange.start()],
+                    [contigRange.contig, contigRange.stop()]];
+  var find = function(node) {
+    if (node.contents) {
+      node.contents.forEach(find);
+    } else {
+      var nodeRange = [[node.startChromIx, node.startBase],
+                       [node.endChromIx, node.endBase]];
+      if (utils.tupleRangeOverlaps(nodeRange, tupleRange)) {
+        matchingBlocks.push(node);
+      }
+    }
+  };
+  find(cirTree.blocks);
+
+  return matchingBlocks;
+}
+
+function extractFeaturesInRange(buffer, dataRange, blocks, contigRange) {
+  return _.flatten(blocks.map(block => {
+    var blockOffset = block.offset - dataRange.start,
+        blockLimit = blockOffset + block.size,
+        // TODO: where does the +2 come from? (I copied it from dalliance)
+        blockBuffer = buffer.slice(blockOffset + 2, blockLimit);
+    // TODO: only inflate if necessary
+    var inflatedBuffer = pako.inflateRaw(new Uint8Array(blockBuffer));
+
+    var jb = new jBinary(inflatedBuffer, bbi.TYPE_SET);
+    // TODO: parse only one BedEntry at a time, as many as is necessary.
+    var beds = jb.read('BedBlock');
+
+    beds = beds.filter(function(bed) {
+      var bedInterval = new ContigInterval(bed.chrId, bed.start, bed.stop);
+      var r = contigRange.intersects(bedInterval);
+      return r;
+    });
+
+    return beds;
+  }));
+}
+
+
+class BigBed {
+  remoteFile: RemoteFile;
+  header: Q.Promise<any>;
+  cirTree: Q.Promise<any>;
+
+  constructor(url: string) {
+    this.remoteFile = new RemoteFile(url);
+    this.header = this.remoteFile.getBytes(0, 64*1024).then(parseHeader);
+    this.contigMap = this.header.then(generateContigMap);
+
+    // Next: fetch [header.unzoomedIndexOffset, zoomHeaders[0].dataOffset] and parse
+    // the "CIR" tree.
+    this.cirTree = this.header.then(header => {
+      // zoomHeaders[0].dataOffset is the next entry in the file.
+      // We assume the "cirTree" section goes all the way to that point.
+      // Lacking zoom headers, assume it's 4k.
+      var start = header.unzoomedIndexOffset,
+          zoomHeader = header.zoomHeaders[0],
+          length = zoomHeader ? zoomHeader.dataOffset - start : 4096;
+      return this.remoteFile.getBytes(start, length).then(parseCirTree);
+    });
+
+    // XXX: are these necessary? what's the right way to propagate errors?
+    this.header.done();
+    this.contigMap.done();
+    this.cirTree.done();
+  }
+
+  // Returns all BED entries which overlap the range.
+  // TODO: factor logic out into a helper
+  getFeaturesInRange(contig: string, start: number, stop: number): Q.Promise<any> {
+    return Q.spread([this.header, this.cirTree, this.contigMap],
+                    (header, cirTree, contigMap) => {
+      var contigIx = getContigId(contigMap, contig);
+      if (contigIx === null) {
+        throw `Invalid contig ${contig}`;
+      }
+      var contigRange = new ContigInterval(contigIx, start, stop);
+
+      var blocks = findOverlappingBlocks(header, cirTree, contigRange);
+      if (blocks.length == 0) {
+        return [];
+      }
+
+      var range = Interval.boundingInterval(
+          blocks.map(n => new Interval(+n.offset, n.offset+n.size)));
+      return this.remoteFile.getBytes(range.start, range.length())
+          .then(buffer => {
+            var reverseMap = reverseContigMap(contigMap);
+            var features = extractFeaturesInRange(buffer, range, blocks, contigRange)
+            features.forEach(f => {
+              f.contig = reverseMap[f.chrId];
+              delete f.chrId;
+            });
+            return features;
+          });
+    });
+  }
+}
+
+module.exports = BigBed;
diff --git a/src/ReadableView.js b/src/ReadableView.js
diff --git a/src/RemoteFile.js b/src/RemoteFile.js
@@ -17,20 +17,22 @@ class RemoteFile {
   url: string;
   fileLength: number;
   chunks: Array<Chunk>;  // regions of file that have already been loaded.
+  numNetworkRequests: number;  // track this for debugging/testing
 
   constructor(url: string) {
     this.url = url;
     this.fileLength = -1;  // unknown
     this.chunks = [];
+    this.numNetworkRequests = 0;
   }
 
-  getBytes(start: number, length: number): Q.Promise<DataView> {
+  getBytes(start: number, length: number): Q.Promise<ArrayBuffer> {
     var stop = start + length;
     // First check the cache.
     for (var i = 0; i < this.chunks.length; i++) {
       var chunk = this.chunks[i];
       if (chunk.start <= start && chunk.stop >= stop) {
-        return Q.when(new DataView(chunk.buffer, start - chunk.start, length));
+        return Q.when(chunk.buffer.slice(start - chunk.start, stop - chunk.start));
       }
     }
 
@@ -40,7 +42,7 @@ class RemoteFile {
     return this.getFromNetwork(start, start + length - 1);
   }
 
-  getFromNetwork(start: number, stop: number): Q.Promise<DataView> {
+  getFromNetwork(start: number, stop: number): Q.Promise<ArrayBuffer> {
     var deferred = Q.defer();
 
     var xhr = new XMLHttpRequest();
@@ -55,10 +57,11 @@ class RemoteFile {
 
       var newChunk = { start, stop: start + buffer.byteLength - 1, buffer };
       remoteFile.chunks.push(newChunk);
-      deferred.resolve(new DataView(buffer));
+      deferred.resolve(buffer);
     };
 
     // TODO: `reject`, `notify` on progress
+    this.numNetworkRequests++;
     xhr.send();
 
     return deferred.promise;