Use the new version of BEETL. Rewrote convert-beetl to use far less m…

…emory.
jts · Jun 10, 2012 · 15b77af · 15b77af
1 parent ebf3f6b
commit 15b77af
Show file tree

Hide file tree

Showing 5 changed files with 72 additions and 26 deletions.
diff --git a/src/SGA/convert-beetl.cpp b/src/SGA/convert-beetl.cpp
@@ -105,15 +105,13 @@ int convertBeetlMain(int argc, char** argv)
 
     // Create the suffix array index files using the sampled suffix array machinery
     std::cout << "Generating lexicographic index (.sai)\n";
-    BWT* pBWT = new BWT(outBWTName); 
-    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile);
+    BWT* pBWT = new BWT(outBWTName, 512); 
     SampledSuffixArray* pSSA = new SampledSuffixArray();
 
-    pSSA->build(pBWT, pRIT, 8192);
+    pSSA->buildLexicoIndex(pBWT, opt::readsFile);
     pSSA->writeLexicoIndex(opt::prefix + SAI_EXT);
 
     delete pBWT;
-    delete pRIT;
     delete pSSA;
     return 0;
 }

diff --git a/src/SuffixTools/SampledSuffixArray.cpp b/src/SuffixTools/SampledSuffixArray.cpp
@@ -149,6 +149,59 @@ void SampledSuffixArray::build(const BWT* pBWT, const ReadInfoTable* pRIT, int s
     }
 }
 
+// A streamlined version of the above functions, which does not require the reads to be 
+// stored in a table
+void SampledSuffixArray::buildLexicoIndex(const BWT* pBWT, const std::string& filename)
+{
+    size_t numStrings = pBWT->getNumStrings();
+    m_saLexoIndex.resize(numStrings);
+
+    size_t MAX_ELEMS = std::numeric_limits<SSA_INT_TYPE>::max();
+
+    // Read each sequence and calculate its rank
+    SeqReader reader(filename);
+    SeqRecord record;
+    size_t read_idx = 0;
+    while(reader.get(record)) 
+    {
+        // For each read, start from the end of the read and backtrack through the suffix array/BWT
+        // to calculate its lexicographic rank in the collection
+        size_t idx = read_idx;
+
+        // The position coordinate is inclusive but 
+        // since the read information table does not store the '$' symbol
+        // the starting position equals the read length
+        SAElem elem(read_idx, record.seq.length());
+
+        while(1)
+        {
+            char b = pBWT->getChar(idx);
+            idx = pBWT->getPC(b) + pBWT->getOcc(b, idx - 1);
+            if(b != '$')
+            {
+                // Decrease the position of the elem
+                elem.setPos(elem.getPos() - 1);
+            }
+            else
+            {
+                // we have hit the beginning of this string
+                // store the SAElem for the beginning of the read
+                // in the lexicographic index
+                if(elem.getPos() != 0)
+                    std::cout << "elem: " << elem << " i: " << read_idx << "\n";
+
+                assert(elem.getPos() == 0);
+                size_t id = elem.getID();
+                assert(id < MAX_ELEMS);
+                m_saLexoIndex[idx] = id;
+                break; // done;
+            }
+        }
+
+        read_idx += 1;
+    }
+}
+
 // Validate the sampled suffix array values are correct
 void SampledSuffixArray::validate(const std::string filename, const BWT* pBWT)
 {
@@ -210,8 +263,11 @@ void SampledSuffixArray::writeLexicoIndex(const std::string& filename)
     SAWriter writer(filename);
     size_t num_strings = m_saLexoIndex.size();
     writer.writeHeader(num_strings, num_strings);
-    for(size_t i = 0; i < m_saLexoIndex.size(); ++i)
-        writer.writeElem(m_saLexoIndex[i]);
+    for(size_t i = 0; i < m_saLexoIndex.size(); ++i) 
+    {
+        SAElem elem(m_saLexoIndex[i], 0);
+        writer.writeElem(elem);
+    }
 }
 
 

diff --git a/src/SuffixTools/SampledSuffixArray.h b/src/SuffixTools/SampledSuffixArray.h
@@ -40,6 +40,9 @@ class SampledSuffixArray
         // Construct the sampled SA using the bwt of a set of reads and their lengths
         void build(const BWT* pBWT, const ReadInfoTable* pRIT, int sampleRate = DEFAULT_SA_SAMPLE_RATE);
 
+        // Construct the lexicographic index (.sai) from the BWT and the reads in a file
+        void buildLexicoIndex(const BWT* pBWT, const std::string& filename);
+
         // Validate using the full suffix array for the given set of reads. Very slow.
         void validate(std::string readsFile, const BWT* pBWT);
         void printInfo() const;

diff --git a/src/bin/sga-beetl-index.pl b/src/bin/sga-beetl-index.pl
@@ -8,8 +8,8 @@
 use File::Basename;
 
 # Program paths
-my $SGA_BIN = "~/bin/sga-0.9.14";
-my $BEETL_BIN = "/nfs/team118/js18/software/BEETL-tc/BCRext";
+my $SGA_BIN = "/nfs/users/nfs_j/js18/work/devel/sga/src/build-lenny/SGA/sga";
+my $BEETL_BIN = "/nfs/users/nfs_j/js18/work/devel/BEETL/Beetl";
 
 # Options
 my $bHelp = 0;
@@ -43,35 +43,24 @@
 my $final_dir = Cwd::getcwd();
 chdir($beetl_dir);
 
-# Currently using the development version of BEETL
-# Flatten the input file
-my $flat_file = "flattened.txt";
+#BCR expects FASTA input, convert if necessary
+my $bcr_in_file = $abs_input;
 my $bIsFastq = isFastq($abs_input);
 my $ret = 0;
 if($bIsFastq == 1)
 {
-    runCmd("cat $abs_input | awk 'NR % 2 == 0 && NR % 4 > 0' > $flat_file");
-}
-elsif($bIsFastq == 0)
-{
-    runCmd("cat $abs_input | awk 'NR % 2 == 0' > $flat_file");
-}
-else
-{
-    print STDERR "Error: Cannot determine if $abs_input is FASTQ or FASTA\n"; 
-
-    # Set error status so no further commands are run
-    $ret = 1;
+    die("TODO");
+#    runCmd("cat $abs_input | awk 'NR % 2 == 0 && NR % 4 > 0' > $flat_file");
 }
 
 # Run BEETL on the flattened input file
 my $time_str = `date`;
 print "Starting beetl at $time_str\n";
-$ret = runCmd("$BEETL_BIN $flat_file > beetl.status") if($ret == 0);
+$ret = runCmd("$BEETL_BIN ext -i $bcr_in_file -a > beetl.status") if($ret == 0);
 
 # concatenate the beetl output files
 my $beetl_bwt = "beetl.bwt";
-$ret = runCmd("cat bwt-B* > $beetl_bwt") if($ret == 0);
+$ret = runCmd("cat BCRext-B* > $beetl_bwt") if($ret == 0);
 
 # Run sga convert-beetl
 $time_str = `date`;

diff --git a/src/bin/sga-mergeDriver.pl b/src/bin/sga-mergeDriver.pl
@@ -30,7 +30,7 @@
 # In optimize time mode, we load the larger of the two into memory
 my $MODE_OPT_MEMORY = 0;
 my $MODE_OPT_TIME = 1;
-my $mode = $MODE_OPT_MEMORY;
+my $mode = $MODE_OPT_TIME;
 
 my $finalParam = "";
 if($n == 2)