Skip to content

Commit

Permalink
Merge branch 'new-indexer'
Browse files Browse the repository at this point in the history
  • Loading branch information
jts committed Feb 6, 2012
2 parents 3a98fa6 + b7f85ca commit d503dc9
Show file tree
Hide file tree
Showing 8 changed files with 536 additions and 77 deletions.
75 changes: 69 additions & 6 deletions src/SGA/index.cpp
Expand Up @@ -8,6 +8,7 @@
//
#include <iostream>
#include <fstream>
#include <algorithm>
#include "SGACommon.h"
#include "Util.h"
#include "index.h"
Expand All @@ -17,6 +18,7 @@
#include "BWTDiskConstruction.h"
#include "BWT.h"
#include "Timer.h"
#include "BWTCABauerCoxRosone.h"

//
// Getopt
Expand All @@ -35,6 +37,10 @@ static const char *INDEX_USAGE_MESSAGE =
"\n"
" -v, --verbose display verbose output\n"
" --help display this help and exit\n"
" -a, --algorithm=STR BWT construction algorithm. STR must be SAIS (induced copying, the default) or BCR (Bauer-Cox-Rosone)\n"
" SAIS is the default method and works well for all types of input. BCR is a specialized to handle\n"
" large volumes of short (<150bp) reads. If you have a large collection of 100bp reads, use BCR as it\n"
" will be much faster and use less memory.\n"
" -d, --disk=NUM use disk-based BWT construction algorithm. The suffix array/BWT will be constructed\n"
" for batchs of NUM reads at a time. To construct the suffix array of 200 megabases of sequence\n"
" requires ~2GB of memory, set this parameter accordingly.\n"
Expand All @@ -55,6 +61,7 @@ namespace opt
static unsigned int verbose;
static std::string readsFile;
static std::string prefix;
static std::string algorithm = "sais";
static int numReadsPerBatch = 2000000;
static int numThreads = 1;
static bool bDiskAlgo = false;
Expand All @@ -63,7 +70,7 @@ namespace opt
static int gapArrayStorage = 8;
}

static const char* shortopts = "p:m:t:d:g:cv";
static const char* shortopts = "p:a:m:t:d:g:cv";

enum { OPT_HELP = 1, OPT_VERSION, OPT_NO_REVERSE };

Expand All @@ -74,6 +81,7 @@ static const struct option longopts[] = {
{ "threads", required_argument, NULL, 't' },
{ "disk", required_argument, NULL, 'd' },
{ "gap-array", required_argument, NULL, 'g' },
{ "algorithm", required_argument, NULL, 'a' },
{ "no-reverse", no_argument, NULL, OPT_NO_REVERSE },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
Expand All @@ -85,20 +93,50 @@ int indexMain(int argc, char** argv)
Timer t("sga index");
parseIndexOptions(argc, argv);
if(!opt::bDiskAlgo)
indexInMemory();
{
if(opt::algorithm == "sais")
indexInMemorySAIS();
else
indexInMemoryBCR();
}
else
{
indexOnDisk();
}
return 0;
}

//
void indexInMemory()
void indexInMemoryBCR()
{
std::cout << "Building index for " << opt::readsFile << " in memory using BCR\n";

// Parse the initial read table
std::vector<DNAEncodedString> readSequences;
SeqReader reader(opt::readsFile);
SeqRecord sr;
while(reader.get(sr))
readSequences.push_back(sr.seq.toString());

BWTCA::runBauerCoxRosone(&readSequences, opt::prefix + BWT_EXT, opt::prefix + SAI_EXT);

if(opt::bBuildReverse)
{
// Reverse all the reads
for(size_t i = 0; i < readSequences.size(); ++i)
readSequences[i] = reverse(readSequences[i].toString());
BWTCA::runBauerCoxRosone(&readSequences, opt::prefix + RBWT_EXT, opt::prefix + RSAI_EXT);
}
}

//
void indexInMemorySAIS()
{
std::cout << "Building index for " << opt::readsFile << " in memory\n";

// Parse the initial read table
ReadTable* pRT = new ReadTable(opt::readsFile);

// Create and write the suffix array for the forward reads
buildIndexForTable(opt::prefix, pRT, false);

Expand All @@ -118,10 +156,25 @@ void indexInMemory()
void indexOnDisk()
{
std::cout << "Building index for " << opt::readsFile << " on disk\n";
buildBWTDisk(opt::readsFile, opt::prefix, BWT_EXT, SAI_EXT, false, opt::numThreads, opt::numReadsPerBatch, opt::gapArrayStorage);
BWTDiskParameters parameters;
parameters.inFile = opt::readsFile;
parameters.outPrefix = opt::prefix;
parameters.bwtExtension = BWT_EXT;
parameters.saiExtension = SAI_EXT;
parameters.numReadsPerBatch = opt::numReadsPerBatch;
parameters.numThreads = opt::numThreads;
parameters.storageLevel = opt::gapArrayStorage;
parameters.bBuildReverse = false;
parameters.bUseBCR = (opt::algorithm == "bcr");
buildBWTDisk(parameters);

if(opt::bBuildReverse)
buildBWTDisk(opt::readsFile, opt::prefix, RBWT_EXT, RSAI_EXT, true, opt::numThreads, opt::numReadsPerBatch, opt::gapArrayStorage);
{
parameters.bwtExtension = RBWT_EXT;
parameters.saiExtension = RSAI_EXT;
parameters.bBuildReverse = true;
buildBWTDisk(parameters);
}
}

//
Expand Down Expand Up @@ -163,6 +216,7 @@ void parseIndexOptions(int argc, char** argv)
case 'd': opt::bDiskAlgo = true; arg >> opt::numReadsPerBatch; break;
case 't': arg >> opt::numThreads; break;
case 'g': arg >> opt::gapArrayStorage; break;
case 'a': arg >> opt::algorithm; break;
case 'v': opt::verbose++; break;
case OPT_NO_REVERSE: opt::bBuildReverse = false; break;
case OPT_HELP:
Expand All @@ -174,6 +228,9 @@ void parseIndexOptions(int argc, char** argv)
}
}

// Transform algorithm parameter to lower case
std::transform(opt::algorithm.begin(), opt::algorithm.end(), opt::algorithm.begin(), ::tolower);

if (argc - optind < 1)
{
std::cerr << SUBPROGRAM ": missing arguments\n";
Expand All @@ -198,6 +255,12 @@ void parseIndexOptions(int argc, char** argv)
die = true;
}

if(opt::algorithm != "sais" && opt::algorithm != "bcr")
{
std::cerr << SUBPROGRAM ": unrecognized algorithm string " << opt::algorithm << ". --algorithm must be sais or bcr\n";
die = true;
}

if (die)
{
std::cerr << "Try `" << SUBPROGRAM << " --help' for more information.\n";
Expand Down
3 changes: 2 additions & 1 deletion src/SGA/index.h
Expand Up @@ -13,7 +13,8 @@
#include "SuffixArray.h"

int indexMain(int argc, char** argv);
void indexInMemory();
void indexInMemorySAIS();
void indexInMemoryBCR();
void indexOnDisk();
void buildIndexForTable(std::string outfile, const ReadTable* pRT, bool isReverse);
void parseIndexOptions(int argc, char** argv);
Expand Down

0 comments on commit d503dc9

Please sign in to comment.