Extracting bigBed features!
danvk committed Mar 13, 2015
1 parent 75fe22b commit cc50c17
Showing 5 changed files with 105 additions and 27 deletions.
2 changes: 2 additions & 0 deletions package.json
@@ -17,6 +17,8 @@
"backbone": "^1.1.2",
"d3": "^3.5.5",
"jbinary": "^2.1.2",
"jszlib": "git://github.com/dasmoth/jszlib",
"pako": "^0.2.5",
"q": "^1.1.2",
"react": "^0.12.2",
"underscore": "^1.7.0"
103 changes: 76 additions & 27 deletions src/BigBed.js
@@ -6,7 +6,11 @@

var Q = require('q'),
_ = require('underscore'),
-jBinary = require('jbinary');
+jBinary = require('jbinary'),
+pako = require('pako'), // for gzip inflation
+jszlib = require('jszlib'),
+jszlib_inflate_buffer = jszlib.inflateBuffer;


var ReadableView = require('./ReadableView'),
RemoteFile = require('./RemoteFile');
@@ -138,6 +142,20 @@ var CirTreeTypeSet = {
};


+var BigBedBlock = {
+'jBinary.all': 'File',
+'jBinary.littleEndian': true,
+
+'File': ['array', 'BedEntry'],
+'BedEntry': {
+'chrId': 'uint32',
+'start': 'uint32',
+'end': 'uint32',
+'rest': 'string0'
+}
+};
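The new BigBedBlock typeset describes the layout of a decompressed data block: a run of BedEntry records, each a little-endian chrId/start/end triple followed by a null-terminated string holding the rest of the BED line. A quick sketch of how it reads, with a hand-built buffer and made-up values; it assumes it runs in the scope of this module, where jBinary and BigBedBlock are defined:

// Build one 20-byte BedEntry by hand: chrId=0, start=1000, end=2000, rest="one\t0\t+".
var rest = 'one\t0\t+';
var buf = new ArrayBuffer(12 + rest.length + 1);  // trailing zero byte terminates the 'string0' field
var dv = new DataView(buf);
dv.setUint32(0, 0, true);     // chrId (little-endian)
dv.setUint32(4, 1000, true);  // start
dv.setUint32(8, 2000, true);  // end
for (var i = 0; i < rest.length; i++) {
  dv.setUint8(12 + i, rest.charCodeAt(i));
}

var jb = new jBinary(buf, BigBedBlock);
console.log(jb.readAll());
// -> [ { chrId: 0, start: 1000, end: 2000, rest: 'one\t0\t+' } ]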


function parseHeader(dataView: DataView) {
// TODO: check Endianness using magic. Possibly use jDataView.littleEndian
// to flip the endianness for jBinary consumption.
@@ -185,26 +203,22 @@ function intersectIntervals(intervals: Array<[number, number]>): [number, number]
return result;
}

-// Get a byte range in the file containing a superset of the interval.
-function findByteRange(twoBitHeader, cirTree, contigMap, contig: string, start: number, stop: number): ?[number, number] {
-var contigIx = getContigId(contigMap, contig);
-if (contigIx === null) {
-throw `Invalid contig ${contig}`;
-}
-// TODO: factor out into module
-var lessOrEqual = function(c1, p1, c2, p2) {
-return c1 < c2 || (c1 == c2 && p1 <= p2);
-};
-var contains = function(startContig, startPos, endContig, endPos, contig, pos) {
-return lessOrEqual(startContig, startPos, contig, pos) &&
-lessOrEqual(contig, pos, endContig, endPos);
-};

+// TODO: factor out into module
+var lessOrEqual = function(c1, p1, c2, p2) {
+return c1 < c2 || (c1 == c2 && p1 <= p2);
+};
+var contains = function(startContig, startPos, endContig, endPos, contig, pos) {
+return lessOrEqual(startContig, startPos, contig, pos) &&
+lessOrEqual(contig, pos, endContig, endPos);
+};
+var overlaps = function(startContig, startBase, endContig, endBase, contig, start, stop) {
+return contains(startContig, startBase, endContig, endBase, contig, start) ||
+contains(startContig, startBase, endContig, endBase, contig, stop);
+};

-var overlaps = function(startContig, startBase, endContig, endBase, contig, start, stop) {
-return contains(startContig, startBase, endContig, endBase, contig, start) ||
-contains(startContig, startBase, endContig, endBase, contig, stop);
-};
+// Get a byte range in the file containing a superset of the interval.
+function findByteRange(twoBitHeader, cirTree, contigIx: number, start: number, stop: number): ?[number, number] {

// Do a recursive search through the index tree
var matchingIntervals = [];
@@ -221,16 +235,41 @@ function findByteRange(twoBitHeader, cirTree, contigMap, contig: string, start: number, stop: number)
};
find(cirTree.blocks);

if (!matchingIntervals) {
return null; // XXX better to throw?
}
+console.log(`${contig}:${start}-${stop} => `, matchingIntervals);
+return matchingIntervals;

-// Intersect the intervals.
-// XXX UCSC allows discontiguous intervals. When would this ever happen?
-return intersectIntervals(
-matchingIntervals.map(n => [+n.offset, n.offset+n.size]));
}
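One property of the overlaps helper added above is worth spelling out: it only asks whether the query's start or its stop falls inside the target range, so a query interval that strictly contains the target reports no overlap. A small standalone check using copies of the helpers, with same-contig calls and made-up coordinates; this may matter for the per-entry BED filter in extractFeaturesInRange below:

var lessOrEqual = function(c1, p1, c2, p2) {
  return c1 < c2 || (c1 == c2 && p1 <= p2);
};
var contains = function(startContig, startPos, endContig, endPos, contig, pos) {
  return lessOrEqual(startContig, startPos, contig, pos) &&
         lessOrEqual(contig, pos, endContig, endPos);
};
var overlaps = function(startContig, startBase, endContig, endBase, contig, start, stop) {
  return contains(startContig, startBase, endContig, endBase, contig, start) ||
         contains(startContig, startBase, endContig, endBase, contig, stop);
};

console.log(overlaps(0, 100, 0, 200, 0, 150, 250));  // true:  query start falls inside [100, 200]
console.log(overlaps(0, 100, 0, 200, 0, 150, 160));  // true:  query lies entirely inside [100, 200]
console.log(overlaps(0, 100, 0, 200, 0,  50, 300));  // false: [100, 200] lies entirely inside the query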

+function extractFeaturesInRange(dataView, dataRange, blocks, contigIx, start, stop) {
+console.log('Fetched ', dataRange);
+var buffer = dataView.buffer;
+
+blocks.forEach(block => {
+var blockOffset = block.offset - dataRange[0],
+blockLimit = blockOffset + block.size,
+blockBuffer = buffer.slice(blockOffset + 2, blockLimit);
+console.log('Decompressing', blockOffset, '-', blockLimit);
+// var inflatedBuffer = pako.inflate(blockBuffer); // throws on invalid input.
+// TODO: only inflate if necessary
+// TODO: use pako or zlib instead of dasmoth's library
+var inflatedBuffer = jszlib_inflate_buffer(blockBuffer, 0, blockBuffer.byteLength);
+
+var jb = new jBinary(inflatedBuffer, BigBedBlock);
+// TODO: parse only one record at a time, as many as is necessary.
+var beds = jb.readAll();
+
+beds = beds.filter(function(bed) {
+return overlaps(bed.chrId, bed.start, bed.chrId, bed.end, contigIx, start, stop);
+});
+
+console.log(beds);
+return beds;
+});
+}
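As committed, extractFeaturesInRange only logs the parsed entries: the value returned inside blocks.forEach is discarded, so the function itself returns undefined and getFeaturesInRange resolves with nothing useful. A hedged sketch of how the per-block results might be collected instead, reusing the module's typeset and helpers; the function name and the underscore flatten are assumptions, not part of this commit:

function extractFeaturesFromBlocks(dataView, dataRange, blocks, contigIx, start, stop) {
  var buffer = dataView.buffer;
  return _.flatten(blocks.map(block => {
    var blockOffset = block.offset - dataRange[0],
        blockLimit = blockOffset + block.size,
        // the +2 presumably skips the two-byte zlib header before the raw deflate data
        blockBuffer = buffer.slice(blockOffset + 2, blockLimit);
    var inflatedBuffer = jszlib_inflate_buffer(blockBuffer, 0, blockBuffer.byteLength);
    var beds = new jBinary(inflatedBuffer, BigBedBlock).readAll();
    // Keep only entries on the requested contig which overlap [start, stop].
    return beds.filter(bed =>
        overlaps(bed.chrId, bed.start, bed.chrId, bed.end, contigIx, start, stop));
  }), true /* shallow flatten: one array of BedEntry objects */);
}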


class BigBed {
remoteFile: RemoteFile;
@@ -259,13 +298,23 @@ class BigBed {
}

// Returns all BED entries which overlap the range.
+// TODO: factor logic out into a helper
getFeaturesInRange(contig: string, start: number, stop: number): Q.Promise<any> {
return Q.spread([this.header, this.cirTree, this.contigMap],
(header, cirTree, contigMap) => {
-var range = findByteRange(header, cirTree, contigMap, contig, start, stop);
-console.log(range);
-// ... fetch range & parse out the features.
-return true;
+var contigIx = getContigId(contigMap, contig);
+if (contigIx === null) {
+throw `Invalid contig ${contig}`;
+}
+
+var blocks = findByteRange(header, cirTree, contigIx, start, stop);
+if (!blocks) {
+return null; // XXX better to throw?
+}
+console.log(blocks);
+var range = intersectIntervals(blocks.map(n => [+n.offset, n.offset+n.size]));
+return this.remoteFile.getBytes(range[0], range[1] - range[0])
+.then(dataView => extractFeaturesInRange(dataView, range, blocks, contigIx, start, stop));
});
}
}
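With those pieces in place, getFeaturesInRange resolves the contig name to an id, searches the R-tree index for candidate blocks, fetches and inflates the covering byte range, and filters the decoded entries. A rough usage sketch; the constructor is not shown in this diff, so the URL argument is an assumption, the file path and coordinates are made up, and the list of features only comes back once extractFeaturesInRange actually returns them (see the note above):

var BigBed = require('./src/BigBed');

var bb = new BigBed('http://example.com/data/genes.bb');  // hypothetical .bb file
bb.getFeaturesInRange('chr22', 1000000, 2000000)
  .then(features => {
    // Each feature is a BigBedBlock BedEntry: {chrId, start, end, rest}.
    features.forEach(f => console.log(f.start, f.end, f.rest));
  })
  .done();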
1 change: 1 addition & 0 deletions src/RemoteFile.js
@@ -24,6 +24,7 @@ class RemoteFile {
this.chunks = [];
}

+// TODO: return a buffer, not a DataView
getBytes(start: number, length: number): Q.Promise<DataView> {
var stop = start + length;
// First check the cache.
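A possible shape for the TODO above: resolve with an ArrayBuffer that covers exactly the requested bytes instead of a DataView. The helper below is a sketch, not part of RemoteFile's API; it would also shield callers such as extractFeaturesInRange, which reads dataView.buffer directly, from the case where the DataView is a window onto a larger cached buffer.

// Copy out exactly the bytes the DataView covers, honoring byteOffset/byteLength.
function toArrayBuffer(dataView) {
  return dataView.buffer.slice(dataView.byteOffset,
                               dataView.byteOffset + dataView.byteLength);
}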
4 changes: 4 additions & 0 deletions test/BigBed-test.js
@@ -0,0 +1,4 @@
+// Things to test:
+// - getFeatures which return no features
+// - getFeatures which crosses a block boundary
+// - getFeatures which crosses a contig boundary (not currently possible)
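A sketch of what the first case might look like as a mocha test, in the style of the existing jbinary-test.js. Everything specific here is an assumption: the chai import, the fixture path, the contig name, and the idea that the BigBed constructor takes a URL (it is not shown in this diff). Note also that the current code resolves with null rather than an empty list when no blocks match, so the assertion describes intent more than present behavior.

var chai = require('chai');
var expect = chai.expect;

var BigBed = require('../src/BigBed');

describe('BigBed', function() {
  it('should return no features for an empty range', function(done) {
    var bb = new BigBed('/test/data/simple.bb');  // hypothetical fixture
    bb.getFeaturesInRange('chr22', 1, 100)        // hypothetical empty range
      .then(function(features) {
        expect(features).to.deep.equal([]);
        done();
      })
      .fail(done);
  });
});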
22 changes: 22 additions & 0 deletions test/jbinary-test.js
@@ -34,4 +34,26 @@ describe('jBinary', function() {
expect(header.sequenceCount).to.equal(93);
expect(header.reserved).to.equal(0);
});

+it('should advance through a sequence', function() {
+var uint8TypeSet = {
+'jBinary.all': 'File',
+'jBinary.littleEndian': true,
+'File': {
+value: 'uint8'
+}
+};

+var u8array = new Uint8Array(16);
+for (var i = 0; i < 16; i++) {
+u8array[i] = i * i;
+}
+var buffer = u8array.buffer;

+var jb = new jBinary(buffer, uint8TypeSet);
+while (jb.tell() < buffer.byteLength) {
+var x = jb.read({value: 'uint8'});
+console.log(jb.tell(), x);
+}
+});
});
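The loop in the new test only logs what it reads. An assertion-based fragment covering the same walk, meant to sit inside that it() block, is sketched below; it reuses buffer and uint8TypeSet and relies on each uint8 read advancing the position by one (i * i fits in a byte for i < 16):

var jb2 = new jBinary(buffer, uint8TypeSet);
for (var j = 0; j < 16; j++) {
  expect(jb2.tell()).to.equal(j);             // position before the j-th read
  expect(jb2.read('uint8')).to.equal(j * j);  // the byte written above
}
expect(jb2.tell()).to.equal(buffer.byteLength);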
