Commit 0cf10ce: Merge 46d6702 into a2f4298
danvk committed Mar 14, 2015 · 2 parents: a2f4298 + 46d6702

Showing 20 changed files with 549 additions and 181 deletions.
6 changes: 4 additions & 2 deletions package.json
@@ -16,11 +16,14 @@
   "dependencies": {
     "backbone": "^1.1.2",
     "d3": "^3.5.5",
+    "jbinary": "^2.1.2",
+    "pako": "^0.2.5",
     "q": "^1.1.2",
     "react": "^0.12.2",
     "underscore": "^1.7.0"
   },
   "devDependencies": {
+    "arraybuffer-slice": "^0.1.2",
     "chai": "^2.0.0",
     "coveralls": "^2.11.2",
     "es5-shim": "^4.1.0",
@@ -43,7 +46,6 @@
     "react-tools": "^0.12.2",
     "reactify": "^1.0.0",
     "sinon": "^1.12.2",
-    "source-map": "^0.3.0",
-    "text-encoding": "^0.5.2"
+    "source-map": "^0.3.0"
   }
 }
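
The two new runtime dependencies drive the BigBed parser added below: jbinary parses structured binary data against declarative typesets, and pako inflates deflate-compressed blocks. A minimal sketch of how they combine (hypothetical typeset and function names, using only the jbinary and pako calls that appear in src/BigBed.js):

var jBinary = require('jbinary'),
    pako = require('pako');

// Hypothetical two-field typeset for illustration; src/BigBed.js defines the real ones.
var TypeSet = {
  'jBinary.littleEndian': true,
  'Header': {magic: 'uint32', count: 'uint16'}
};

// Inflate a raw-deflated buffer, then parse the result against the typeset.
function parseBlock(compressedBuffer) {
  var inflated = pako.inflateRaw(new Uint8Array(compressedBuffer));
  return new jBinary(inflated, TypeSet).read('Header');
}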
314 changes: 314 additions & 0 deletions src/BigBed.js
@@ -0,0 +1,314 @@
/**
 * Parser for bigBed format.
 * Based on UCSC's src/inc/bbiFile.h
 */
'use strict';

var Q = require('q'),
    _ = require('underscore'),
    jBinary = require('jbinary'),
    pako = require('pako');  // for gzip inflation


var ReadableView = require('./ReadableView'),
    RemoteFile = require('./RemoteFile'),
    Interval = require('./Interval'),
    ContigInterval = require('./ContigInterval'),
    utils = require('./utils.js');

function typeAtOffset(typeName, offsetFieldName) {
  return jBinary.Template({
    baseType: typeName,
    read: function(context) {
      if (+context[offsetFieldName] == 0) {
        return null;
      } else {
        return this.binary.read(this.baseType, +context[offsetFieldName]);
      }
    }
  });
}
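
// For example, the 'File' typeset below uses this to read the optional
// TotalSummary and BPlusTree structures at their recorded file offsets:
//   typeAtOffset('TotalSummary', 'totalSummaryOffset')
// reads a TotalSummary struct at position totalSummaryOffset, or yields null
// when that offset field is zero.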

var BigBedTypeSet = {
  'jBinary.all': 'File',
  'jBinary.littleEndian': true,

  'File': {
    _magic: ['const', 'uint32', 0x8789F2EB, true],
    version: ['const', 'uint16', 4, true],
    zoomLevels: 'uint16',
    chromosomeTreeOffset: 'uint64',
    unzoomedDataOffset: 'uint64',
    unzoomedIndexOffset: 'uint64',
    fieldCount: 'uint16',
    definedFieldCount: 'uint16',
    // 0 if no autoSql information
    autoSqlOffset: 'uint64',
    totalSummaryOffset: 'uint64',
    // Size of uncompression buffer. 0 if uncompressed.
    uncompressBufSize: 'uint32',
    // Offset to header extension; 0 if no such extension.
    // TODO: support extended headers (not used in ensGene.bb)
    extensionOffset: 'uint64',
    zoomHeaders: ['array', 'ZoomHeader', 'zoomLevels'],

    totalSummary: typeAtOffset('TotalSummary', 'totalSummaryOffset'),
    chromosomeTree: typeAtOffset('BPlusTree', 'chromosomeTreeOffset')
  },

  'TotalSummary': {
    basesCovered: 'uint64',
    minVal: 'float64',     // for bigBed: minimum depth of coverage
    maxVal: 'float64',     // for bigBed: maximum depth of coverage
    sumData: 'float64',    // for bigBed: sum of coverage
    sumSquared: 'float64'  // for bigBed: sum of coverage squared
  },

  'ZoomHeader': {
    reductionLevel: 'uint32',
    _reserved: 'uint32',
    dataOffset: 'uint64',
    indexOffset: 'uint64'
  },

  'BPlusTree': {
    magic: ['const', 'uint32', 0x78CA8C91, true],
    // Number of children per block (not byte size of block)
    blockSize: 'uint32',
    // Number of significant bytes in key
    keySize: 'uint32',
    // Number of bytes in value
    valSize: 'uint32',
    // Number of items in index
    itemCount: 'uint64',
    _reserved2: ['skip', 4],
    _reserved3: ['skip', 4],
    nodes: 'BPlusTreeNode'  // ['array', 'BPlusTreeNode', 'itemCount']
  },

  'BPlusTreeNode': {
    isLeaf: 'uint8',  // 1 = yes, 0 = no
    _reserved: 'uint8',
    count: 'uint16',
    contents: ['array', ['if', 'isLeaf', {
      key: ['string', 'keySize'],
      // Note: bigBed allows more general values; this is what Ensembl uses.
      // value: ['string', 'valSize']
      id: 'uint32',
      size: 'uint32'
    }, {
      key: ['string', 'keySize'],
      offset: 'uint64'
    }], 'count']
  }
};

var CirTreeTypeSet = {
  'jBinary.all': 'File',
  'jBinary.littleEndian': true,

  'File': {
    _magic: ['const', 'uint32', 0x2468ACE0, true],
    blockSize: 'uint32',
    itemCount: 'uint64',
    startChromIx: 'uint32',
    startBase: 'uint32',
    endChromIx: 'uint32',
    endBase: 'uint32',
    fileSize: 'uint64',
    itemsPerSlot: 'uint32',
    _reserved: ['skip', 4],
    blocks: 'CirNode'
  },

  'CirNode': {
    isLeaf: 'uint8',  // 1 = yes, 0 = no
    _reserved: 'uint8',
    count: 'uint16',
    contents: ['array', ['if', 'isLeaf', {
      startChromIx: 'uint32',
      startBase: 'uint32',
      endChromIx: 'uint32',
      endBase: 'uint32',
      offset: 'uint64',
      size: 'uint64'
    }, {
      startChromIx: 'uint32',
      startBase: 'uint32',
      endChromIx: 'uint32',
      endBase: 'uint32',
      offset: 'uint64'
    }], 'count']
  }
};


var BigBedBlock = {
  'jBinary.all': 'File',
  'jBinary.littleEndian': true,

  'File': ['array', 'BedEntry'],
  'BedEntry': {
    'chrId': 'uint32',
    'start': 'uint32',
    'stop': 'uint32',
    'rest': 'string0'
  }
};
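
// An entry parsed from an inflated block might look like (illustrative values,
// not from a real file):
//   {chrId: 0, start: 860260, stop: 879955, rest: 'uc001abw.1\t0\t+'}
// 'rest' holds the remaining tab-delimited BED fields as a NUL-terminated string.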


function parseHeader(buffer) {
  // TODO: check endianness using the magic number. Possibly use
  // jDataView.littleEndian to flip the endianness for jBinary consumption.
  // NB: dalliance doesn't support big endian formats.
  var jb = new jBinary(buffer, BigBedTypeSet);
  var header = jb.readAll();

  return header;
}

function parseCirTree(buffer) {
  var jb = new jBinary(buffer, CirTreeTypeSet);
  var cirTree = jb.readAll();

  return cirTree;
}

function generateContigMap(twoBitHeader): {[key:string]: number} {
  // Just assume it's a flat "tree" for now.
  var nodes = twoBitHeader.chromosomeTree.nodes.contents;
  if (!nodes) {
    throw 'Invalid chromosome tree';
  }
  return _.object(nodes.map(function({id, key}) {
    // Remove trailing nulls from the key string.
    return [key.replace(/\0.*/, ''), id];
  }));
}
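
// For instance, leaf contents [{key: 'chr1\0\0', id: 0}, {key: 'chr2\0\0', id: 1}]
// (keys padded to keySize) would map to {chr1: 0, chr2: 1}.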

function getContigId(contigMap, contig) {
  if (contig in contigMap) {
    return contigMap[contig];
  }
  var chr = 'chr' + contig;
  if (chr in contigMap) {
    return contigMap[chr];
  }
  return null;
}

function reverseContigMap(contigMap: {[key:string]: number}): Array<string> {
  var ary = [];
  _.forEach(contigMap, (index, name) => {
    ary[index] = name;
  });
  return ary;
}
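
// reverseContigMap inverts the contig map, e.g. {chr1: 0, chr2: 1} -> ['chr1', 'chr2'],
// so numeric chrIds from BED entries can be translated back to contig names.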

// Get all blocks in the file containing features which intersect with contigRange.
function findOverlappingBlocks(twoBitHeader, cirTree, contigRange) {
  // Do a recursive search through the index tree.
  var matchingBlocks = [];
  var tupleRange = [[contigRange.contig, contigRange.start()],
                    [contigRange.contig, contigRange.stop()]];
  var find = function(node) {
    if (node.contents) {
      node.contents.forEach(find);
    } else {
      var nodeRange = [[node.startChromIx, node.startBase],
                       [node.endChromIx, node.endBase]];
      if (utils.tupleRangeOverlaps(nodeRange, tupleRange)) {
        matchingBlocks.push(node);
      }
    }
  };
  find(cirTree.blocks);

  return matchingBlocks;
}

function extractFeaturesInRange(buffer, dataRange, blocks, contigRange) {
  return _.flatten(blocks.map(block => {
    var blockOffset = block.offset - dataRange.start,
        blockLimit = blockOffset + block.size,
        // TODO: where does the +2 come from? (I copied it from dalliance)
        blockBuffer = buffer.slice(blockOffset + 2, blockLimit);
    // TODO: only inflate if necessary
    var inflatedBuffer = pako.inflateRaw(new Uint8Array(blockBuffer));

    var jb = new jBinary(inflatedBuffer, BigBedBlock);
    // TODO: parse only one record at a time, as many as is necessary.
    var beds = jb.readAll();

    beds = beds.filter(function(bed) {
      var bedInterval = new ContigInterval(bed.chrId, bed.start, bed.stop);
      return contigRange.intersects(bedInterval);
    });

    return beds;
  }));
}


class BigBed {
  remoteFile: RemoteFile;
  header: Q.Promise<any>;
  cirTree: Q.Promise<any>;
  contigMap: Q.Promise<any>;

  constructor(url: string) {
    this.remoteFile = new RemoteFile(url);
    this.header = this.remoteFile.getBytes(0, 64*1024).then(parseHeader);
    this.contigMap = this.header.then(generateContigMap);

    // Next: fetch [header.unzoomedIndexOffset, zoomHeaders[0].dataOffset] and
    // parse the "CIR" tree.
    this.cirTree = this.header.then(header => {
      // zoomHeaders[0].dataOffset is the next entry in the file.
      // We assume the "cirTree" section goes all the way to that point.
      // Lacking zoom headers, assume it's 4k.
      var start = header.unzoomedIndexOffset,
          zoomHeader = header.zoomHeaders[0],
          length = zoomHeader ? zoomHeader.dataOffset - start : 4096;
      return this.remoteFile.getBytes(start, length).then(parseCirTree);
    });

    // XXX: are these necessary? what's the right way to propagate errors?
    this.header.done();
    this.contigMap.done();
    this.cirTree.done();
  }

  // Returns all BED entries which overlap the range.
  // TODO: factor logic out into a helper
  getFeaturesInRange(contig: string, start: number, stop: number): Q.Promise<any> {
    return Q.spread([this.header, this.cirTree, this.contigMap],
                    (header, cirTree, contigMap) => {
      var contigIx = getContigId(contigMap, contig);
      if (contigIx === null) {
        throw `Invalid contig ${contig}`;
      }
      var contigRange = new ContigInterval(contigIx, start, stop);

      var blocks = findOverlappingBlocks(header, cirTree, contigRange);
      if (blocks.length == 0) {
        return [];
      }

      var range = Interval.boundingInterval(
          blocks.map(n => new Interval(+n.offset, n.offset + n.size)));
      return this.remoteFile.getBytes(range.start, range.length())
          .then(buffer => {
            var reverseMap = reverseContigMap(contigMap);
            var features = extractFeaturesInRange(buffer, range, blocks, contigRange);
            features.forEach(f => {
              f.contig = reverseMap[f.chrId];
              delete f.chrId;
            });
            return features;
          });
    });
  }
}

module.exports = BigBed;
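
Taken together, BigBed exposes a small promise-based API: construct with a URL, then call getFeaturesInRange. A usage sketch (hypothetical URL and coordinates; the resolved features carry contig, start, stop and rest, as set up above):

var BigBed = require('./src/BigBed');

var ensGene = new BigBed('http://example.com/ensGene.bb');  // hypothetical URL
ensGene.getFeaturesInRange('chr17', 7512444, 7517300)
    .then(features => {
      features.forEach(f => console.log(f.contig, f.start, f.stop, f.rest));
    })
    .done();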
15 changes: 10 additions & 5 deletions src/ReadableView.js
@@ -19,13 +19,18 @@ class ReadableView {
     return num;
   }
 
+  // Read an unsigned 16-bit integer and advance the current position.
+  readUint16(): number {
+    return this.readUint8() +
+           this.readUint8() * (1 << 8);
+  }
+
   // Read an unsigned 32-bit integer and advance the current position.
   readUint32(): number {
-    var num = this.readUint8() |
-              this.readUint8() * (1 << 8 ) |
-              this.readUint8() * (1 << 16) |
-              this.readUint8() * (1 << 24);
-    return num;
+    return this.readUint8() +
+           this.readUint8() * (1 << 8 ) +
+           this.readUint8() * (1 << 16) +
+           this.readUint8() * (1 << 24);
   }
 
   // Read a sequence of 32-bit integers and advance the current position.
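
The change from bitwise | to + in readUint32 is a behavior fix, not just style: JavaScript's bitwise operators coerce their result to a signed 32-bit integer, so any value with the high bit set comes out negative. A quick illustration (plain runnable JavaScript, not part of the diff):

var bytes = [0x00, 0x00, 0x00, 0xFF];  // little-endian 0xFF000000
var withOr  = bytes[0] | bytes[1] * (1 << 8) | bytes[2] * (1 << 16) | bytes[3] * (1 << 24);
var withAdd = bytes[0] + bytes[1] * (1 << 8) + bytes[2] * (1 << 16) + bytes[3] * (1 << 24);
console.log(withOr);   // -16777216: | wrapped the result to a signed 32-bit int
console.log(withAdd);  // 4278190080: the intended unsigned value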