Commit 0cf10ce: Merge 46d6702 into a2f4298
danvk committed Mar 14, 2015 · 2 parents: a2f4298 + 46d6702

Showing 20 changed files with 549 additions and 181 deletions.
6 changes: 4 additions & 2 deletions package.json
@@ -16,11 +16,14 @@
   "dependencies": {
     "backbone": "^1.1.2",
     "d3": "^3.5.5",
+    "jbinary": "^2.1.2",
+    "pako": "^0.2.5",
     "q": "^1.1.2",
     "react": "^0.12.2",
     "underscore": "^1.7.0"
   },
   "devDependencies": {
+    "arraybuffer-slice": "^0.1.2",
     "chai": "^2.0.0",
     "coveralls": "^2.11.2",
     "es5-shim": "^4.1.0",
@@ -43,7 +46,6 @@
     "react-tools": "^0.12.2",
     "reactify": "^1.0.0",
     "sinon": "^1.12.2",
-    "source-map": "^0.3.0",
-    "text-encoding": "^0.5.2"
+    "source-map": "^0.3.0"
   }
 }
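
The two new runtime dependencies drive the BigBed parser added below: jbinary parses structured binary data against declarative typesets, and pako inflates deflate-compressed blocks. A minimal sketch of how they combine (hypothetical typeset and function names, using only the jbinary and pako calls that appear in src/BigBed.js):

var jBinary = require('jbinary'),
    pako = require('pako');

// Hypothetical two-field typeset for illustration; src/BigBed.js defines the real ones.
var TypeSet = {
  'jBinary.littleEndian': true,
  'Header': {magic: 'uint32', count: 'uint16'}
};

// Inflate a raw-deflated buffer, then parse the result against the typeset.
function parseBlock(compressedBuffer) {
  var inflated = pako.inflateRaw(new Uint8Array(compressedBuffer));
  return new jBinary(inflated, TypeSet).read('Header');
}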
314 changes: 314 additions & 0 deletions src/BigBed.js
@@ -0,0 +1,314 @@
/**
 * Parser for bigBed format.
 * Based on UCSC's src/inc/bbiFile.h
 */
'use strict';

var Q = require('q'),
    _ = require('underscore'),
    jBinary = require('jbinary'),
    pako = require('pako');  // for gzip inflation


var ReadableView = require('./ReadableView'),
    RemoteFile = require('./RemoteFile'),
    Interval = require('./Interval'),
    ContigInterval = require('./ContigInterval'),
    utils = require('./utils.js');

function typeAtOffset(typeName, offsetFieldName) {
  return jBinary.Template({
    baseType: typeName,
    read: function(context) {
      if (+context[offsetFieldName] == 0) {
        return null;
      } else {
        return this.binary.read(this.baseType, +context[offsetFieldName]);
      }
    }
  });
}
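
// For example, the 'File' typeset below uses this to read the optional
// TotalSummary and BPlusTree structures at their recorded file offsets:
//   typeAtOffset('TotalSummary', 'totalSummaryOffset')
// reads a TotalSummary struct at position totalSummaryOffset, or yields null
// when that offset field is zero.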

var BigBedTypeSet = {
  'jBinary.all': 'File',
  'jBinary.littleEndian': true,

  'File': {
    _magic: ['const', 'uint32', 0x8789F2EB, true],
    version: ['const', 'uint16', 4, true],
    zoomLevels: 'uint16',
    chromosomeTreeOffset: 'uint64',
    unzoomedDataOffset: 'uint64',
    unzoomedIndexOffset: 'uint64',
    fieldCount: 'uint16',
    definedFieldCount: 'uint16',
    // 0 if no autoSql information
    autoSqlOffset: 'uint64',
    totalSummaryOffset: 'uint64',
    // Size of uncompression buffer. 0 if uncompressed.
    uncompressBufSize: 'uint32',
    // Offset to header extension; 0 if no such extension.
    // TODO: support extended headers (not used in ensGene.bb)
    extensionOffset: 'uint64',
    zoomHeaders: ['array', 'ZoomHeader', 'zoomLevels'],

    totalSummary: typeAtOffset('TotalSummary', 'totalSummaryOffset'),
    chromosomeTree: typeAtOffset('BPlusTree', 'chromosomeTreeOffset')
  },

  'TotalSummary': {
    basesCovered: 'uint64',
    minVal: 'float64',     // for bigBed: minimum depth of coverage
    maxVal: 'float64',     // for bigBed: maximum depth of coverage
    sumData: 'float64',    // for bigBed: sum of coverage
    sumSquared: 'float64'  // for bigBed: sum of coverage squared
  },

  'ZoomHeader': {
    reductionLevel: 'uint32',
    _reserved: 'uint32',
    dataOffset: 'uint64',
    indexOffset: 'uint64'
  },

  'BPlusTree': {
    magic: ['const', 'uint32', 0x78CA8C91, true],
    // Number of children per block (not byte size of block)
    blockSize: 'uint32',
    // Number of significant bytes in key
    keySize: 'uint32',
    // Number of bytes in value
    valSize: 'uint32',
    // Number of items in index
    itemCount: 'uint64',
    _reserved2: ['skip', 4],
    _reserved3: ['skip', 4],
    nodes: 'BPlusTreeNode'  // ['array', 'BPlusTreeNode', 'itemCount']
  },

  'BPlusTreeNode': {
    isLeaf: 'uint8',  // 1 = yes, 0 = no
    _reserved: 'uint8',
    count: 'uint16',
    contents: ['array', ['if', 'isLeaf', {
      key: ['string', 'keySize'],
      // Note: bigBed allows more general values; this is what Ensembl uses.
      // value: ['string', 'valSize']
      id: 'uint32',
      size: 'uint32'
    }, {
      key: ['string', 'keySize'],
      offset: 'uint64'
    }], 'count']
  }
};

var CirTreeTypeSet = {
  'jBinary.all': 'File',
  'jBinary.littleEndian': true,

  'File': {
    _magic: ['const', 'uint32', 0x2468ACE0, true],
    blockSize: 'uint32',
    itemCount: 'uint64',
    startChromIx: 'uint32',
    startBase: 'uint32',
    endChromIx: 'uint32',
    endBase: 'uint32',
    fileSize: 'uint64',
    itemsPerSlot: 'uint32',
    _reserved: ['skip', 4],
    blocks: 'CirNode'
  },

  'CirNode': {
    isLeaf: 'uint8',  // 1 = yes, 0 = no
    _reserved: 'uint8',
    count: 'uint16',
    contents: ['array', ['if', 'isLeaf', {
      startChromIx: 'uint32',
      startBase: 'uint32',
      endChromIx: 'uint32',
      endBase: 'uint32',
      offset: 'uint64',
      size: 'uint64'
    }, {
      startChromIx: 'uint32',
      startBase: 'uint32',
      endChromIx: 'uint32',
      endBase: 'uint32',
      offset: 'uint64'
    }], 'count']
  }
};


var BigBedBlock = {
  'jBinary.all': 'File',
  'jBinary.littleEndian': true,

  'File': ['array', 'BedEntry'],
  'BedEntry': {
    'chrId': 'uint32',
    'start': 'uint32',
    'stop': 'uint32',
    'rest': 'string0'
  }
};
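
// An entry parsed from an inflated block might look like (illustrative values,
// not from a real file):
//   {chrId: 0, start: 860260, stop: 879955, rest: 'uc001abw.1\t0\t+'}
// 'rest' holds the remaining tab-delimited BED fields as a NUL-terminated string.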


function parseHeader(buffer) {
  // TODO: check endianness using the magic number. Possibly use
  // jDataView.littleEndian to flip the endianness for jBinary consumption.
  // NB: dalliance doesn't support big endian formats.
  var jb = new jBinary(buffer, BigBedTypeSet);
  var header = jb.readAll();

  return header;
}

function parseCirTree(buffer) {
  var jb = new jBinary(buffer, CirTreeTypeSet);
  var cirTree = jb.readAll();

  return cirTree;
}

function generateContigMap(twoBitHeader): {[key:string]: number} {
  // Just assume it's a flat "tree" for now.
  var nodes = twoBitHeader.chromosomeTree.nodes.contents;
  if (!nodes) {
    throw 'Invalid chromosome tree';
  }
  return _.object(nodes.map(function({id, key}) {
    // Remove trailing nulls from the key string.
    return [key.replace(/\0.*/, ''), id];
  }));
}
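
// For instance, leaf contents [{key: 'chr1\0\0', id: 0}, {key: 'chr2\0\0', id: 1}]
// (keys padded to keySize) would map to {chr1: 0, chr2: 1}.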

function getContigId(contigMap, contig) {
  if (contig in contigMap) {
    return contigMap[contig];
  }
  var chr = 'chr' + contig;
  if (chr in contigMap) {
    return contigMap[chr];
  }
  return null;
}

function reverseContigMap(contigMap: {[key:string]: number}): Array<string> {
  var ary = [];
  _.forEach(contigMap, (index, name) => {
    ary[index] = name;
  });
  return ary;
}
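
// reverseContigMap inverts the contig map, e.g. {chr1: 0, chr2: 1} -> ['chr1', 'chr2'],
// so numeric chrIds from BED entries can be translated back to contig names.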

// Get all blocks in the file containing features which intersect with contigRange.
function findOverlappingBlocks(twoBitHeader, cirTree, contigRange) {
  // Do a recursive search through the index tree.
  var matchingBlocks = [];
  var tupleRange = [[contigRange.contig, contigRange.start()],
                    [contigRange.contig, contigRange.stop()]];
  var find = function(node) {
    if (node.contents) {
      node.contents.forEach(find);
    } else {
      var nodeRange = [[node.startChromIx, node.startBase],
                       [node.endChromIx, node.endBase]];
      if (utils.tupleRangeOverlaps(nodeRange, tupleRange)) {
        matchingBlocks.push(node);
      }
    }
  };
  find(cirTree.blocks);

  return matchingBlocks;
}

function extractFeaturesInRange(buffer, dataRange, blocks, contigRange) {
  return _.flatten(blocks.map(block => {
    var blockOffset = block.offset - dataRange.start,
        blockLimit = blockOffset + block.size,
        // TODO: where does the +2 come from? (I copied it from dalliance)
        blockBuffer = buffer.slice(blockOffset + 2, blockLimit);
    // TODO: only inflate if necessary
    var inflatedBuffer = pako.inflateRaw(new Uint8Array(blockBuffer));

    var jb = new jBinary(inflatedBuffer, BigBedBlock);
    // TODO: parse only one record at a time, as many as is necessary.
    var beds = jb.readAll();

    beds = beds.filter(function(bed) {
      var bedInterval = new ContigInterval(bed.chrId, bed.start, bed.stop);
      return contigRange.intersects(bedInterval);
    });

    return beds;
  }));
}


class BigBed {
  remoteFile: RemoteFile;
  header: Q.Promise<any>;
  cirTree: Q.Promise<any>;
  contigMap: Q.Promise<any>;

  constructor(url: string) {
    this.remoteFile = new RemoteFile(url);
    this.header = this.remoteFile.getBytes(0, 64*1024).then(parseHeader);
    this.contigMap = this.header.then(generateContigMap);

    // Next: fetch [header.unzoomedIndexOffset, zoomHeaders[0].dataOffset] and
    // parse the "CIR" tree.
    this.cirTree = this.header.then(header => {
      // zoomHeaders[0].dataOffset is the next entry in the file.
      // We assume the "cirTree" section goes all the way to that point.
      // Lacking zoom headers, assume it's 4k.
      var start = header.unzoomedIndexOffset,
          zoomHeader = header.zoomHeaders[0],
          length = zoomHeader ? zoomHeader.dataOffset - start : 4096;
      return this.remoteFile.getBytes(start, length).then(parseCirTree);
    });

    // XXX: are these necessary? what's the right way to propagate errors?
    this.header.done();
    this.contigMap.done();
    this.cirTree.done();
  }

  // Returns all BED entries which overlap the range.
  // TODO: factor logic out into a helper
  getFeaturesInRange(contig: string, start: number, stop: number): Q.Promise<any> {
    return Q.spread([this.header, this.cirTree, this.contigMap],
                    (header, cirTree, contigMap) => {
      var contigIx = getContigId(contigMap, contig);
      if (contigIx === null) {
        throw `Invalid contig ${contig}`;
      }
      var contigRange = new ContigInterval(contigIx, start, stop);

      var blocks = findOverlappingBlocks(header, cirTree, contigRange);
      if (blocks.length == 0) {
        return [];
      }

      var range = Interval.boundingInterval(
          blocks.map(n => new Interval(+n.offset, n.offset + n.size)));
      return this.remoteFile.getBytes(range.start, range.length())
          .then(buffer => {
            var reverseMap = reverseContigMap(contigMap);
            var features = extractFeaturesInRange(buffer, range, blocks, contigRange);
            features.forEach(f => {
              f.contig = reverseMap[f.chrId];
              delete f.chrId;
            });
            return features;
          });
    });
  }
}

module.exports = BigBed;
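
Taken together, BigBed exposes a small promise-based API: construct with a URL, then call getFeaturesInRange. A usage sketch (hypothetical URL and coordinates; the resolved features carry contig, start, stop and rest, as set up above):

var BigBed = require('./src/BigBed');

var ensGene = new BigBed('http://example.com/ensGene.bb');  // hypothetical URL
ensGene.getFeaturesInRange('chr17', 7512444, 7517300)
    .then(features => {
      features.forEach(f => console.log(f.contig, f.start, f.stop, f.rest));
    })
    .done();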
15 changes: 10 additions & 5 deletions src/ReadableView.js
@@ -19,13 +19,18 @@ class ReadableView {
     return num;
   }
 
+  // Read an unsigned 16-bit integer and advance the current position.
+  readUint16(): number {
+    return this.readUint8() +
+           this.readUint8() * (1 << 8);
+  }
+
   // Read an unsigned 32-bit integer and advance the current position.
   readUint32(): number {
-    var num = this.readUint8() |
-              this.readUint8() * (1 << 8 ) |
-              this.readUint8() * (1 << 16) |
-              this.readUint8() * (1 << 24);
-    return num;
+    return this.readUint8() +
+           this.readUint8() * (1 << 8 ) +
+           this.readUint8() * (1 << 16) +
+           this.readUint8() * (1 << 24);
   }
 
   // Read a sequence of 32-bit integers and advance the current position.
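
The change from bitwise | to + in readUint32 is a behavior fix, not just style: JavaScript's bitwise operators coerce their result to a signed 32-bit integer, so any value with the high bit set comes out negative. A quick illustration (plain runnable JavaScript, not part of the diff):

var bytes = [0x00, 0x00, 0x00, 0xFF];  // little-endian 0xFF000000
var withOr  = bytes[0] | bytes[1] * (1 << 8) | bytes[2] * (1 << 16) | bytes[3] * (1 << 24);
var withAdd = bytes[0] + bytes[1] * (1 << 8) + bytes[2] * (1 << 16) + bytes[3] * (1 << 24);
console.log(withOr);   // -16777216: | wrapped the result to a signed 32-bit int
console.log(withAdd);  // 4278190080: the intended unsigned value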