Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test large BAM files using MappedRemoteFile #72

Merged
merged 4 commits into from
Apr 16, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/underscore.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,7 @@ declare module "underscore" {

declare function initial<T>(a: Array<T>, n?: number): Array<T>;
declare function rest<T>(a: Array<T>, index?: number): Array<T>;

declare function sortBy<T>(a: T[], iteratee: (val: T)=>any): T[];
}

50 changes: 50 additions & 0 deletions scripts/generate_mapped_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env python
'''
This script generates mapped files for use with test/MappedRemoteFile.js.

Usage:

$ cat <<END | ./scripts/generate_mapped_file.py http://path/to/file.txt
[[0, 1234], [5678, 6789]]
END
...
Wrote 2347 bytes to file.txt.mapped

Use with:
new MappedRemoteFile('file.txt.mapped', [[0, 1234], [5678, 6789]])

'''

import fileinput
import json
import os
import requests
import urlparse
import sys

def coalesce_ranges(ranges):
    '''Sort [start, stop] byte ranges by start and merge any that overlap.

    Both ends of a range are inclusive, matching the HTTP Range header built
    from them. Ranges which merely touch (e.g. [0, 9] and [10, 19]) do not
    overlap and are kept as separate entries.

    Returns a new list of [start, stop] lists; the input is left unmodified.
    '''
    merged = []
    for start, stop in sorted(ranges, key=lambda r: r[0]):
        if merged and start <= merged[-1][1]:
            # Overlaps the previous range. It may lie entirely inside it, so
            # never move the end of the merged range backwards.
            merged[-1][1] = max(merged[-1][1], stop)
        else:
            merged.append([start, stop])
    return merged


def main(argv):
    '''Fetch the byte ranges given as JSON on stdin from the URL in argv[1].

    The fetched bytes are written back-to-back to '<basename of URL>.mapped'
    in the current directory. Returns the process exit code.
    '''
    if len(argv) != 2:
        sys.stderr.write('Usage: %s <url>  (JSON ranges are read from stdin)\n'
                         % argv[0])
        return 1
    url = argv[1]

    ranges = coalesce_ranges(json.load(sys.stdin))

    outfile = os.path.basename(urlparse.urlparse(url).path) + '.mapped'

    total_bytes = 0
    with open(outfile, 'wb') as out:
        for start, stop in ranges:
            headers = {'Range': 'bytes=%s-%s' % (start, stop)}
            response = requests.get(url, headers=headers)
            # Fail loudly rather than writing an error page into the output.
            response.raise_for_status()
            # A server which ignores the Range header answers 200 with the
            # whole file; storing that would corrupt every later offset.
            if response.status_code != 206:
                raise RuntimeError(
                    'Expected 206 Partial Content for bytes %s-%s of %s, got %s'
                    % (start, stop, url, response.status_code))
            result = response.content
            total_bytes += len(result)
            out.write(result)

    # A single parenthesized argument prints identically on Python 2 and 3.
    print('''Wrote %d bytes to %s

Use with:
new MappedRemoteFile('%s', %s)
''' % (total_bytes, outfile, outfile, json.dumps(ranges)))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
4 changes: 4 additions & 0 deletions src/RemoteFile.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ class RemoteFile {
}

getBytes(start: number, length: number): Q.Promise<ArrayBuffer> {
if (length < 0) {
return Q.reject(`Requested <0 bytes (${length}) from ${this.url}`);
}

// If the remote file length is known, clamp the request to fit within it.
var stop = start + length - 1;
if (this.fileLength != -1) {
Expand Down
67 changes: 67 additions & 0 deletions test/MappedRemoteFile-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/* @flow */
'use strict';

var chai = require('chai'),
expect = chai.expect,
jBinary = require('jbinary'),
Q = require('q');

var MappedRemoteFile = require('./MappedRemoteFile');

describe('MappedRemoteFile', function() {
  // Decodes a buffer into a string so it can be compared against a literal.
  var textOf = buf => new jBinary(buf).read('string');

  // Reads `length` bytes at `start` and expects them to decode to `text`.
  function expectText(file, start, length, text) {
    return file.getBytes(start, length).then(buf => {
      expect(textOf(buf)).to.equal(text);
    });
  }

  // Reads `length` bytes at `start` and expects the read to be rejected
  // because the range is not covered by a single mapped region.
  function expectUnmapped(file, start, length) {
    return file.getBytes(start, length).then(buf => {
      throw 'Requests for unmapped ranges should fail';
    }, err => {
      expect(err).to.match(/is not mapped/);
    });
  }

  it('should serve requests through the map', function(done) {
    // The ten digits and trailing newline of 0to9.txt are spread across three
    // widely separated regions of a pretend large file.
    var mappedFile = new MappedRemoteFile('/test/data/0to9.txt', [
      [0, 2],                   // 0,1,2
      [12345678, 12345680],     // 3,4,5
      [9876543210, 9876543214]  // 6,7,8,9,\n
    ]);

    var checks = [
      expectText(mappedFile, 0, 3, '012'),
      expectText(mappedFile, 12345678, 2, '34'),
      expectText(mappedFile, 9876543211, 3, '789'),
      // Runs off the end of the last mapped region.
      expectUnmapped(mappedFile, 9876543211, 10),
      // Falls between mapped regions.
      expectUnmapped(mappedFile, 23456789, 1)
    ];

    Q.all(checks).then(() => done()).done();
  });

  it('should forget file length', function(done) {
    var mappedFile = new MappedRemoteFile('/test/data/0to9.txt', [
      [0, 2],               // 0,1,2
      [12345673, 12345690]  // 3456789\n
    ]);

    mappedFile.getBytes(0, 3)
      .then(firstBuf => {
        expect(textOf(firstBuf)).to.equal('012');
        // This second read would fail if the file remembered its length.
        return mappedFile.getBytes(12345673, 8);
      })
      .then(secondBuf => {
        expect(textOf(secondBuf)).to.equal('3456789\n');
        done();
      })
      .done();
  });
});
75 changes: 75 additions & 0 deletions test/MappedRemoteFile.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/**
* This class allows testing with extremely large files, without the need to
* store large files in the git repo.
*
* It stores subsets of a file on disk, then maps these back to the original
* portions of the file.
*
* Recommended usage:
* - In your test, use RecordedRemoteFile with the real remote file.
* At the end of the test, console.log remoteFile.getRequests() and copy it.
* - Generate a mapped file using scripts/generate_mapped_file.py:
* pbpaste | ./scripts/generate_mapped_file.py http://path/to/url
* - Replace RecordedRemoteFile in the test with MappedRemoteFile.
*
* @flow
*/
'use strict';

var Q = require('q');

var RemoteFile = require('../src/RemoteFile'),
Interval = require('../src/Interval');

class MappedRemoteFile extends RemoteFile {
maps: Array<Interval>;

constructor(url: string, maps: Array<[number, number]>) {
super(url);
this.maps = maps.map(([x, y]) => new Interval(x, y));
for (var i = 1; i < this.maps.length; i++) {
var m0 = this.maps[i - 1],
m1 = this.maps[i];
if (m0.stop >= m1.start) throw 'Invalid maps';
}
}

getFromNetwork(start: number, stop: number): Q.Promise<ArrayBuffer> {
// Translate start/stop (which are offsets into the large file) into
// offsets in the smaller, realized file.
var originalRequest = new Interval(start, stop),
request = null;
var offset = 0;
for (var i = 0; i < this.maps.length; i++) {
var m = this.maps[i];
if (m.containsInterval(originalRequest)) {
request = new Interval(offset + (start - m.start),
offset + (stop - m.start));
break;
} else {
offset += m.length();
}
}

if (request) {
return super.getFromNetwork(request.start, request.stop).then(buf => {
// RemoteFile may discover the mapped file length from this request.
// This results in incorrect behavior, so we force it to forget.
this.fileLength = -1;
return buf;
});
} else {
return Q.reject(`Request for ${originalRequest} is not mapped in ${this.url}`);
}
}

getAll(): Q.Promise<ArrayBuffer> {
return Q.reject('Not implemented');
}

getSize(): Q.Promise<number> {
return Q.reject('Not implemented');
}
}

module.exports = MappedRemoteFile;
21 changes: 21 additions & 0 deletions test/RecordedRemoteFile.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

import type * as Q from 'q';

var _ = require('underscore');

var RemoteFile = require('../src/RemoteFile'),
Interval = require('../src/Interval');

Expand All @@ -22,6 +24,25 @@ class RecordedRemoteFile extends RemoteFile {
this.requests.push(new Interval(start, stop));
return super.getFromNetwork(start, stop);
}

// This sorts & coalesces overlapping requests to facilitate use of
// scripts/generate_mapped_file.py.
getRequests(): Array<[number, number]> {
if (this.requests.length === 0) return [];

var rs = _.sortBy(this.requests, x => x.start);
var blocks = [rs[0]];
for (var i = 1; i < rs.length; i++) {
var r = rs[i],
last = blocks[blocks.length - 1];
if (r.intersects(last)) {
blocks[blocks.length - 1].stop = r.stop;
} else {
blocks.push(r);
}
}
return blocks.map(iv => [iv.start, iv.stop]);
}
}

module.exports = RecordedRemoteFile;
30 changes: 23 additions & 7 deletions test/bam-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ var expect = chai.expect;

var Bam = require('../src/bam'),
ContigInterval = require('../src/ContigInterval'),
RemoteFile = require('../src/RemoteFile');
RemoteFile = require('../src/RemoteFile'),
MappedRemoteFile = require('./MappedRemoteFile');

describe('BAM', function() {
it('should parse BAM files', function(done) {
Expand Down Expand Up @@ -160,14 +161,29 @@ describe('BAM', function() {
}).done();
});

/*
it('should handle ginormous files', function(done) {
it('should fetch from a large, dense BAM file', function(done) {
this.timeout(5000);
var bamFile = new Bam(new RemoteFile('/chrM.sorted.bam'));
bamFile.readAll(true).then(bamData => {
expect(bamData.alignments).to.have.length(38461);

// See test/data/README.md for details on where these files came from.
var remoteBAI = new MappedRemoteFile('/test/data/dream.synth3.bam.bai.mapped',
[[8054040, 8242920]]),
remoteBAM = new MappedRemoteFile('/test/data/dream.synth3.bam.mapped',
[[0, 69453], [163622109888, 163622739903]]);

var bam = new Bam(remoteBAM, remoteBAI, {
// "chunks" is usually an array; here we take advantage of the
// Object-like nature of JavaScript arrays to create a sparse array.
"chunks": { "19": [8054040, 8242920] },
"minBlockIndex": 69454
});

var range = new ContigInterval('20', 31511349, 31514172);

bam.getAlignmentsInRange(range).then(reads => {
expect(reads).to.have.length(1114);
expect(alignmentRange(reads[0])).to.equal('19:31511251-31511351');
expect(alignmentRange(reads[1113])).to.equal('19:31514171-31514271');
done();
}).done();
});
*/
});
13 changes: 10 additions & 3 deletions test/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,18 @@ This is a subset of `ensembl.chr17.bb`, shifted to match the coordinates in
These BAM and BAI files come from the [samtools][1] tests. You can find
corresponding SAM files for them in the same repo.

[1]: https://github.com/samtools/samtools/tree/develop/test/dat


#### index_test.bam

This BAM/BAI file pair comes from [htsjdk][2] tests.

#### dream.synth3.bam.mapped

This BAM/BAI pair comes from the [ICGC-TCGA DREAM Mutation Calling
challenge][3]. It's the synth3.normal data set with MDTags added. The BAM and
BAI files have been reduced to a small portion of the originals using
`scripts/generate_mapped_file.py`.


[1]: https://github.com/samtools/samtools/tree/develop/test/dat
[2]: https://github.com/samtools/htsjdk/blob/afecd5fa959087d5bdd5d5a701e415a72d629282/testdata/htsjdk/samtools/BAMFileIndexTest/index_test.bam
[3]: https://www.synapse.org/#%21Synapse:syn312572
Binary file added test/data/dream.synth3.bam.bai.mapped
Binary file not shown.
Binary file added test/data/dream.synth3.bam.mapped
Binary file not shown.