Skip to content

Commit

Permalink
Bam Data Source
Browse files Browse the repository at this point in the history
  • Loading branch information
danvk committed Apr 17, 2015
1 parent 361ef5c commit 36f3ebf
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 2 deletions.
2 changes: 2 additions & 0 deletions lib/underscore.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,7 @@ declare module "underscore" {
declare function rest<T>(a: Array<T>, index?: number): Array<T>;

declare function sortBy<T>(a: T[], iteratee: (val: T)=>any): T[];

declare function filter<T>(o: {[key:string]: T}, pred: (val: T, k: string)=>boolean): T[];
}

89 changes: 89 additions & 0 deletions src/BamDataSource.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/* @flow */
'use strict';

var Events = require('backbone').Events,
_ = require('underscore'),
Q = require('q');

import type * as BamFile from './bam';
import type * as SamRead from './SamRead';

var ContigInterval = require('./ContigInterval');

/**
 * Shape of the object returned by createBamSource: a cache of alignments
 * that also acts as an event emitter.
 */
type BamDataSource = {
// Tells the source that the range of interest changed. The implementation in
// this file fetches the range and then triggers a 'newdata' event with it.
rangeChanged: (newRange: GenomeRange) => void;
// Synchronously returns the already-cached reads that intersect `range`
// (an empty array when `range` is falsy). Does not start a fetch itself.
getAlignmentsInRange: (range: ContigInterval<string>) => SamRead[];
// Event-emitter methods; createBamSource mixes in backbone's Events for these.
on: (event: string, handler: Function) => void;
off: (event: string) => void;
trigger: (event: string, ...args:any) => void;
};

// Granularity, in base pairs, of the intervals requested from the remote file.
// expandRange() widens each requested interval so that it starts and ends on
// a multiple of this value, which reduces network activity while fetching.
// TODO: tune this value
var BASE_PAIRS_PER_FETCH = 100;

function expandRange(range: ContigInterval<string>) {
var roundDown = x => x - x % BASE_PAIRS_PER_FETCH;
var newStart = Math.max(1, roundDown(range.start())),
newStop = roundDown(range.stop() + BASE_PAIRS_PER_FETCH - 1);

return new ContigInterval(range.contig, newStart, newStop);
}


function createBamSource(remoteSource: BamFile): BamDataSource {
// Keys are virtualOffset.toString()
var reads: {[key:string]: SamRead} = {};

// Ranges for which we have complete information -- no need to hit network.
var coveredRanges: ContigInterval<string>[] = [];

function addRead(read: SamRead) {
var key = read.offset.toString();
if (!reads[key]) {
reads[key] = read;
}
}

function fetch(range: GenomeRange) {
var interval = new ContigInterval(range.contig, range.start, range.stop);

// Check if this interval is already in the cache.
if (interval.isCoveredBy(coveredRanges)) {
return Q.when();
}

interval = expandRange(interval);
return remoteSource.getAlignmentsInRange(interval).then(reads => {
coveredRanges.push(interval);
coveredRanges = ContigInterval.coalesce(coveredRanges);
reads.forEach(read => addRead(read));
});
}

function getAlignmentsInRange(range: ContigInterval<string>): SamRead[] {
if (!range) return [];
// XXX there may be an issue here with adding 'chr' to contig names.
return _.filter(reads, read => read.intersects(range));
}

var o = {
rangeChanged: function(newRange: GenomeRange) {
fetch(newRange)
.then(() => o.trigger('newdata', newRange))
.done();
},
getAlignmentsInRange,

// These are here to make Flow happy.
on: () => {},
off: () => {},
trigger: () => {}
};
_.extend(o, Events); // Make this an event emitter

return o;
}

module.exports = createBamSource;
1 change: 1 addition & 0 deletions src/BigBedDataSource.js
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ function createBigBedDataSource(remoteSource: BigBed): BigBedSource {
var interval = new ContigInterval(range.contig, range.start, range.stop);

// Check if this interval is already in the cache.
// XXX is this broken? should be r.contains(interval), no?
if (_.any(coveredRanges, r => r.intersects(interval))) {
return Q.when();
}
Expand Down
3 changes: 1 addition & 2 deletions src/TwoBitDataSource.js
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ type TwoBitSource = {
trigger: (event: string, ...args:any) => void;
}

// Expand range by EXPANSION_FACTOR, allowing for boundary effects and
// respecting MAX_BASE_PAIRS_TO_FETCH.
// Expand range to begin and end on multiples of BASE_PAIRS_PER_FETCH.
function expandRange(range) {
var roundDown = x => x - x % BASE_PAIRS_PER_FETCH;
var newStart = Math.max(1, roundDown(range.start())),
Expand Down
53 changes: 53 additions & 0 deletions test/BamDataSource-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/* @flow */
'use strict';

var chai = require('chai');
var expect = chai.expect;

var Bam = require('../src/bam'),
createBamDataSource = require('../src/BamDataSource'),
ContigInterval = require('../src/ContigInterval'),
MappedRemoteFile = require('./MappedRemoteFile');

describe('BamDataSource', function() {
  // Builds a data source over the mapped excerpts of the test BAM file and
  // its index. See test/data/README.md for provenance of these files.
  function getTestSource() {
    var baiFile = new MappedRemoteFile(
        '/test/data/dream.synth3.bam.bai.mapped',
        [[8054040, 8242920]]);
    var bamFile = new MappedRemoteFile(
        '/test/data/dream.synth3.bam.mapped',
        [[0, 69453], [163622109888, 163622739903]]);

    // "chunks" is normally an array; a plain object keyed by index is used
    // here as a stand-in for a sparse array.
    var indexHints = {
      "chunks": { "19": [8054040, 8242920] },
      "minBlockIndex": 69454
    };

    return createBamDataSource(new Bam(bamFile, baiFile, indexHints));
  }

  it('should extract features in a range', function(done) {
    this.timeout(5000);
    var dataSource = getTestSource();

    // Same range as the "large, dense" test in bam-test.js
    var interval = new ContigInterval('20', 31511349, 31514172);

    // Before any fetch has happened the cache has nothing to offer.
    var initialReads = dataSource.getAlignmentsInRange(interval);
    expect(initialReads).to.deep.equal([]);

    // Once the fetch completes, the whole block should be available from cache.
    dataSource.on('newdata', () => {
      var cachedReads = dataSource.getAlignmentsInRange(interval);
      expect(cachedReads).to.have.length(1114);

      var firstRead = cachedReads[0],
          lastRead = cachedReads[1113];
      expect(firstRead.toString()).to.equal('20:31511251-31511351');
      expect(lastRead.toString()).to.equal('20:31514171-31514271');
      done();
    });

    // Kick off the fetch using the plain-object form of the same range.
    var genomeRange = {
      contig: interval.contig,
      start: interval.start(),
      stop: interval.stop()
    };
    dataSource.rangeChanged(genomeRange);
  });
});

0 comments on commit 36f3ebf

Please sign in to comment.