Skip to content

Commit

Permalink
Use the new version of BEETL. Rewrote convert-beetl to use far less m…
Browse files Browse the repository at this point in the history
…emory.
  • Loading branch information
jts committed Jun 10, 2012
1 parent ebf3f6b commit 15b77af
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 26 deletions.
6 changes: 2 additions & 4 deletions src/SGA/convert-beetl.cpp
Expand Up @@ -105,15 +105,13 @@ int convertBeetlMain(int argc, char** argv)

// Create the suffix array index files using the sampled suffix array machinery
std::cout << "Generating lexicographic index (.sai)\n";
BWT* pBWT = new BWT(outBWTName);
ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile);
BWT* pBWT = new BWT(outBWTName, 512);
SampledSuffixArray* pSSA = new SampledSuffixArray();

pSSA->build(pBWT, pRIT, 8192);
pSSA->buildLexicoIndex(pBWT, opt::readsFile);
pSSA->writeLexicoIndex(opt::prefix + SAI_EXT);

delete pBWT;
delete pRIT;
delete pSSA;
return 0;
}
Expand Down
60 changes: 58 additions & 2 deletions src/SuffixTools/SampledSuffixArray.cpp
Expand Up @@ -149,6 +149,59 @@ void SampledSuffixArray::build(const BWT* pBWT, const ReadInfoTable* pRIT, int s
}
}

// A streamlined version of the above functions, which does not require the reads to be
// stored in a table
void SampledSuffixArray::buildLexicoIndex(const BWT* pBWT, const std::string& filename)
{
size_t numStrings = pBWT->getNumStrings();
m_saLexoIndex.resize(numStrings);

size_t MAX_ELEMS = std::numeric_limits<SSA_INT_TYPE>::max();

// Read each sequence and calculate its rank
SeqReader reader(filename);
SeqRecord record;
size_t read_idx = 0;
while(reader.get(record))
{
// For each read, start from the end of the read and backtrack through the suffix array/BWT
// to calculate its lexicographic rank in the collection
size_t idx = read_idx;

// The position coordinate is inclusive but
// since the read information table does not store the '$' symbol
// the starting position equals the read length
SAElem elem(read_idx, record.seq.length());

while(1)
{
char b = pBWT->getChar(idx);
idx = pBWT->getPC(b) + pBWT->getOcc(b, idx - 1);
if(b != '$')
{
// Decrease the position of the elem
elem.setPos(elem.getPos() - 1);
}
else
{
// we have hit the beginning of this string
// store the SAElem for the beginning of the read
// in the lexicographic index
if(elem.getPos() != 0)
std::cout << "elem: " << elem << " i: " << read_idx << "\n";

assert(elem.getPos() == 0);
size_t id = elem.getID();
assert(id < MAX_ELEMS);
m_saLexoIndex[idx] = id;
break; // done;
}
}

read_idx += 1;
}
}

// Validate the sampled suffix array values are correct
void SampledSuffixArray::validate(const std::string filename, const BWT* pBWT)
{
Expand Down Expand Up @@ -210,8 +263,11 @@ void SampledSuffixArray::writeLexicoIndex(const std::string& filename)
SAWriter writer(filename);
size_t num_strings = m_saLexoIndex.size();
writer.writeHeader(num_strings, num_strings);
for(size_t i = 0; i < m_saLexoIndex.size(); ++i)
writer.writeElem(m_saLexoIndex[i]);
for(size_t i = 0; i < m_saLexoIndex.size(); ++i)
{
SAElem elem(m_saLexoIndex[i], 0);
writer.writeElem(elem);
}
}


Expand Down
3 changes: 3 additions & 0 deletions src/SuffixTools/SampledSuffixArray.h
Expand Up @@ -40,6 +40,9 @@ class SampledSuffixArray
// Construct the sampled SA using the bwt of a set of reads and their lengths
void build(const BWT* pBWT, const ReadInfoTable* pRIT, int sampleRate = DEFAULT_SA_SAMPLE_RATE);

// Construct the lexicographic index (.sai) from the BWT and the reads in a file
void buildLexicoIndex(const BWT* pBWT, const std::string& filename);

// Validate using the full suffix array for the given set of reads. Very slow.
void validate(std::string readsFile, const BWT* pBWT);
void printInfo() const;
Expand Down
27 changes: 8 additions & 19 deletions src/bin/sga-beetl-index.pl
Expand Up @@ -8,8 +8,8 @@
use File::Basename;

# Program paths
my $SGA_BIN = "~/bin/sga-0.9.14";
my $BEETL_BIN = "/nfs/team118/js18/software/BEETL-tc/BCRext";
my $SGA_BIN = "/nfs/users/nfs_j/js18/work/devel/sga/src/build-lenny/SGA/sga";
my $BEETL_BIN = "/nfs/users/nfs_j/js18/work/devel/BEETL/Beetl";

# Options
my $bHelp = 0;
Expand Down Expand Up @@ -43,35 +43,24 @@
my $final_dir = Cwd::getcwd();
chdir($beetl_dir);

# Currently using the development version of BEETL
# Flatten the input file
my $flat_file = "flattened.txt";
#BCR expects FASTA input, convert if necessary
my $bcr_in_file = $abs_input;
my $bIsFastq = isFastq($abs_input);
my $ret = 0;
if($bIsFastq == 1)
{
runCmd("cat $abs_input | awk 'NR % 2 == 0 && NR % 4 > 0' > $flat_file");
}
elsif($bIsFastq == 0)
{
runCmd("cat $abs_input | awk 'NR % 2 == 0' > $flat_file");
}
else
{
print STDERR "Error: Cannot determine if $abs_input is FASTQ or FASTA\n";

# Set error status so no further commands are run
$ret = 1;
die("TODO");
# runCmd("cat $abs_input | awk 'NR % 2 == 0 && NR % 4 > 0' > $flat_file");
}

# Run BEETL on the flattened input file
my $time_str = `date`;
print "Starting beetl at $time_str\n";
$ret = runCmd("$BEETL_BIN $flat_file > beetl.status") if($ret == 0);
$ret = runCmd("$BEETL_BIN ext -i $bcr_in_file -a > beetl.status") if($ret == 0);

# concatenate the beetl output files
my $beetl_bwt = "beetl.bwt";
$ret = runCmd("cat bwt-B* > $beetl_bwt") if($ret == 0);
$ret = runCmd("cat BCRext-B* > $beetl_bwt") if($ret == 0);

# Run sga convert-beetl
$time_str = `date`;
Expand Down
2 changes: 1 addition & 1 deletion src/bin/sga-mergeDriver.pl
Expand Up @@ -30,7 +30,7 @@
# In optimize time mode, we load the larger of the two into memory
my $MODE_OPT_MEMORY = 0;
my $MODE_OPT_TIME = 1;
my $mode = $MODE_OPT_MEMORY;
my $mode = $MODE_OPT_TIME;

my $finalParam = "";
if($n == 2)
Expand Down

0 comments on commit 15b77af

Please sign in to comment.