Expand BigBed tests & cleanup

hammerlab · Mar 16, 2015 · a2e43f3 · a2e43f3
1 parent e156fa3
commit a2e43f3
Show file tree

Hide file tree

Showing 9 changed files with 134 additions and 126 deletions.
diff --git a/src/BigBed.js b/src/BigBed.js
@@ -21,19 +21,16 @@ function parseHeader(buffer) {
   // TODO: check Endianness using magic. Possibly use jDataView.littleEndian
   // to flip the endianness for jBinary consumption.
   // NB: dalliance doesn't support big endian formats.
-  var jb = new jBinary(buffer, bbi.TYPE_SET);
-  var header = jb.read('Header');
-
-  return header;
+  return new jBinary(buffer, bbi.TYPE_SET).read('Header');
 }
 
+// The "CIR" tree contains a mapping from sequence -> block offsets.
+// It stands for "Chromosome Index R tree"
 function parseCirTree(buffer) {
-  var jb = new jBinary(buffer, bbi.TYPE_SET);
-  var cirTree = jb.read('CirTree');
-
-  return cirTree;
+  return new jBinary(buffer, bbi.TYPE_SET).read('CirTree');
 }
 
+// Extract a map from contig name --> contig ID from the bigBed header.
 function generateContigMap(twoBitHeader): {[key:string]: number} {
   // Just assume it's a flat "tree" for now.
   var nodes = twoBitHeader.chromosomeTree.nodes.contents;
@@ -46,6 +43,16 @@ function generateContigMap(twoBitHeader): {[key:string]: number} {
   }));
 }
 
+// Generate the reverse map from contig ID --> contig name.
+function reverseContigMap(contigMap: {[key:string]: number}): Array<string> {
+  var ary = [];
+  _.forEach(contigMap, (index, name) => {
+    ary[index] = name;
+  });
+  return ary;
+}
+
+// Map contig name to contig ID. Leading "chr" is optional.
 function getContigId(contigMap, contig) {
   if (contig in contigMap) {
     return contigMap[contig];
@@ -57,15 +64,7 @@ function getContigId(contigMap, contig) {
   return null;
 }
 
-function reverseContigMap(contigMap: {[key:string]: number}): Array<string> {
-  var ary = [];
-  _.forEach(contigMap, (index, name) => {
-    ary[index] = name;
-  });
-  return ary;
-}
-
-// Get all blocks in the file containing features which intersect with contigRange.
+// Find all blocks containing features which intersect with contigRange.
 function findOverlappingBlocks(twoBitHeader, cirTree, contigRange) {
   // Do a recursive search through the index tree
   var matchingBlocks = [];
@@ -101,32 +100,74 @@ function extractFeaturesInRange(buffer, dataRange, blocks, contigRange) {
     var beds = jb.read('BedBlock');
 
     beds = beds.filter(function(bed) {
-      var bedInterval = new ContigInterval(bed.chrId, bed.start, bed.stop);
-      var r = contigRange.intersects(bedInterval);
-      return r;
+      // Note: BED intervals are explicitly half-open.
+      // The "- 1" converts them to closed intervals for ContigInterval.
+      var bedInterval = new ContigInterval(bed.chrId, bed.start, bed.stop - 1);
+      return contigRange.intersects(bedInterval);
     });
 
     return beds;
   }));
 }
 
+// Fetch the relevant blocks from the bigBed file and extract the features
+// which overlap the given range.
+function fetchFeatures(contigRange, header, cirTree, contigMap, remoteFile) {
+  var blocks = findOverlappingBlocks(header, cirTree, contigRange);
+  if (blocks.length == 0) {
+    return [];
+  }
+
+  // Find the range in the file which contains all relevant blocks.
+  // In theory there could be gaps between blocks, but it's hard to see how.
+  var range = Interval.boundingInterval(
+      blocks.map(n => new Interval(+n.offset, n.offset+n.size)));
+
+  return remoteFile.getBytes(range.start, range.length())
+      .then(buffer => {
+        var reverseMap = reverseContigMap(contigMap);
+        var features = extractFeaturesInRange(buffer, range, blocks, contigRange)
+        features.forEach(f => {
+          f.contig = reverseMap[f.chrId];
+          delete f.chrId;
+        });
+        return features;
+      });
+}
+
+
+type BedRow = {
+  // Half-open interval for the BED row.
+  contig: string;
+  start: number;
+  stop: number;
+  // Remaining fields in the BED row (typically tab-delimited)
+  rest: string;
+}
+
 
 class BigBed {
   remoteFile: RemoteFile;
   header: Q.Promise<any>;
   cirTree: Q.Promise<any>;
+  contigMap: Q.Promise<{[key:string]: number}>;
 
+  /**
+   * Prepare to request features from a remote bigBed file.
+   * The remote source must support HTTP Range headers.
+   * This will kick off several async requests for portions of the file.
+   */
   constructor(url: string) {
     this.remoteFile = new RemoteFile(url);
     this.header = this.remoteFile.getBytes(0, 64*1024).then(parseHeader);
     this.contigMap = this.header.then(generateContigMap);
 
-    // Next: fetch [header.unzoomedIndexOffset, zoomHeaders[0].dataOffset] and parse
-    // the "CIR" tree.
+    // Next: fetch the block index and parse out the "CIR" tree.
     this.cirTree = this.header.then(header => {
       // zoomHeaders[0].dataOffset is the next entry in the file.
       // We assume the "cirTree" section goes all the way to that point.
       // Lacking zoom headers, assume it's 4k.
+      // TODO: fetch more than 4k if necessary
       var start = header.unzoomedIndexOffset,
           zoomHeader = header.zoomHeaders[0],
           length = zoomHeader ? zoomHeader.dataOffset - start : 4096;
@@ -139,34 +180,21 @@ class BigBed {
     this.cirTree.done();
   }
 
-  // Returns all BED entries which overlap the range.
-  // TODO: factor logic out into a helper
-  getFeaturesInRange(contig: string, start: number, stop: number): Q.Promise<any> {
+  /**
+   * Returns all BED entries which overlap the range.
+   * Note: while the requested range is inclusive on both ends, ranges in
+   * bigBed format files are half-open (inclusive at the start, exclusive at
+   * the end).
+   */
+  getFeaturesInRange(contig: string, start: number, stop: number): Q.Promise<Array<BedRow>> {
     return Q.spread([this.header, this.cirTree, this.contigMap],
                     (header, cirTree, contigMap) => {
       var contigIx = getContigId(contigMap, contig);
       if (contigIx === null) {
         throw `Invalid contig ${contig}`;
       }
       var contigRange = new ContigInterval(contigIx, start, stop);
-
-      var blocks = findOverlappingBlocks(header, cirTree, contigRange);
-      if (blocks.length == 0) {
-        return [];
-      }
-
-      var range = Interval.boundingInterval(
-          blocks.map(n => new Interval(+n.offset, n.offset+n.size)));
-      return this.remoteFile.getBytes(range.start, range.length())
-          .then(buffer => {
-            var reverseMap = reverseContigMap(contigMap);
-            var features = extractFeaturesInRange(buffer, range, blocks, contigRange)
-            features.forEach(f => {
-              f.contig = reverseMap[f.chrId];
-              delete f.chrId;
-            });
-            return features;
-          });
+      return fetchFeatures(contigRange, header, cirTree, contigMap, this.remoteFile);
     });
   }
 }

diff --git a/src/Controls.js b/src/Controls.js
@@ -14,8 +14,7 @@ var Controls = React.createClass({
     // XXX: can we be more specific than this with Flow?
     onChange: React.PropTypes.func.isRequired
   },
-  makeRange: function() {
-    // XXX Removing the Number() should lead to type errors, but doesn't.
+  makeRange: function(): GenomeRange {
     return {
       contig: this.refs.contig.getDOMNode().value,
       start: Number(this.refs.start.getDOMNode().value),
@@ -35,8 +34,10 @@ var Controls = React.createClass({
     this.refs.start.getDOMNode().value = r.start;
     this.refs.stop.getDOMNode().value = r.stop;
 
-    var contigIdx = this.props.contigList.indexOf(r.contig);
-    this.refs.contig.getDOMNode().selectedIndex = contigIdx;
+    if (this.props.contigList) {
+      var contigIdx = this.props.contigList.indexOf(r.contig);
+      this.refs.contig.getDOMNode().selectedIndex = contigIdx;
+    }
   },
   render: function(): any {
     var contigOptions = this.props.contigList
@@ -56,7 +57,7 @@ var Controls = React.createClass({
       </form>
     );
   },
-  componentDidUpdate: function(prevProps, prevState) {
+  componentDidUpdate: function(prevProps: Object) {
     if (!_.isEqual(prevProps.range, this.props.range)) {
       this.updateRangeUI();
     }

diff --git a/src/TwoBit.js b/src/TwoBit.js
@@ -37,7 +37,6 @@ type SequenceRecord = {
 
 type TwoBitHeader = {
   sequenceCount: number;
-  reserved: number;
   sequences: Array<FileIndexEntry>;
 }
 

diff --git a/src/formats/helpers.js b/src/formats/helpers.js
@@ -3,6 +3,7 @@
  */
 var jBinary = require('jbinary');
 
+// Read a jBinary type at an offset in the buffer specified by another field.
 function typeAtOffset(typeName, offsetFieldName) {
   return jBinary.Template({
       baseType: typeName,

diff --git a/src/formats/twoBitTypes.js b/src/formats/twoBitTypes.js
@@ -13,7 +13,6 @@ var TYPE_SET = {
   'Header': {
     magic: ['const', 'uint32', 0x1A412743, true],
     version: ['const', 'uint32', 0, true],
-
     sequenceCount: 'uint32',
     reserved: 'uint32',
 

diff --git a/src/main.js b/src/main.js
@@ -1,7 +1,6 @@
 /* @flow */
 var React = require('react'),
     TwoBit = require('./TwoBit'),
-    BigBed = require('./BigBed'),
     Root = require('./Root'),
     createTwoBitDataSource = require('./TwoBitDataSource');
 
@@ -24,8 +23,3 @@ genome.getFeaturesInRange('chr1', 123000, 124000).done();
 
 var root = React.render(<Root referenceSource={dataSource} />,
                         document.getElementById('root'));
-
-var ensembl = new BigBed('/ensGene.bb');
-
-window.ensembl = ensembl;
-window.genome = genome;
diff --git a/test/BigBed-test.js b/test/BigBed-test.js
@@ -1,19 +1,15 @@
-// Things to test:
-// - getFeatures which return no features
-// - getFeatures which crosses a block boundary
-// - getFeatures which crosses a contig boundary (not currently possible)
-
 var chai = require('chai');
 var expect = chai.expect;
 var assert = chai.assert;
 
+var Q = require('q');
 var BigBed = require('../src/BigBed');
 
 describe('BigBed', function() {
   function getTestBigBed() {
     // This file was generated using UCSC tools:
     // cd kent/src/utils/bedToBigBed/tests; make
-    // This file is compressed, little endian and contains autoSQL.
+    // It is compressed, little endian, has autoSQL and two blocks.
     return new BigBed('/test/data/itemRgb.bb');
   }
 
@@ -22,8 +18,9 @@ describe('BigBed', function() {
 
     bb.getFeaturesInRange('chrX', 151077036, 151078532)
         .then(features => {
-          // chrX	151077031	151078198	MID_BLUE	0	-	151077031	151078198	0,0,128
-          // chrX	151078198	151079365	VIOLET_RED1	0	-	151078198	151079365	255,62,150
+          // Here's what these two lines in the file look like:
+          // chrX 151077031 151078198 MID_BLUE 0 - 151077031 151078198 0,0,128
+          // chrX 151078198 151079365 VIOLET_RED1 0 - 151078198 151079365 255,62,150
           expect(features).to.have.length(2);
           expect(features[0].contig).to.equal('chrX');
           expect(features[0].start).to.equal(151077031);
@@ -47,4 +44,53 @@ describe('BigBed', function() {
         })
         .done();
   });
+
+  it('should have inclusive ranges', function(done) {
+    // The matches looks like this:
+    // chrX 151071196 151072363 RED
+    // chrX 151094536 151095703 PeachPuff
+    var red = [151071196, 151072362],  // note: stop is inclusive
+        peachpuff = [151094536, 151095702];
+
+    var bb = getTestBigBed();
+    var expectN = n => features => {
+        expect(features).to.have.length(n);
+      };
+
+    Q.all([
+        // request for precisely one row from the file.
+        bb.getFeaturesInRange('chrX', red[0], red[1])
+            .then(expectN(1)),
+        // the additional base in the range hits another row.
+        bb.getFeaturesInRange('chrX', red[0], 1 + red[1])
+            .then(expectN(2)),
+        // this overlaps exactly one base pair of the first feature.
+        bb.getFeaturesInRange('chrX', red[0] - 1000, red[0])
+            .then(expectN(1)),
+        // but this range ends one base pair before it.
+        bb.getFeaturesInRange('chrX', red[0] - 1000, red[0] - 1)
+            .then(expectN(0))
+    ]).then(() => {
+      done();
+    }).done();
+  });
+
+  it('should add "chr" to contig names', function(done) {
+    var bb = getTestBigBed();
+
+    bb.getFeaturesInRange('X', 151077036, 151078532)
+        .then(features => {
+          // (same as 'should extract features in a range' test)
+          expect(features).to.have.length(2);
+          expect(features[0].contig).to.equal('chrX');
+          expect(features[1].contig).to.equal('chrX');
+          done();
+        })
+        .done();
+  });
+
+  // Things left to test:
+  // - getFeatures which crosses a block boundary
+  // - uncompressed bigBed file.
 });
+