Skip to content

Commit

Permalink
adding kmer-plot command
Browse files Browse the repository at this point in the history
  • Loading branch information
James Gurtowski committed Dec 9, 2014
1 parent fabc28a commit 47611b6
Show file tree
Hide file tree
Showing 4 changed files with 221 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/SGA/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ sga_SOURCES = sga.cpp \
walk.cpp walk.h \
filter.cpp filter.h \
kmer-count.cpp kmer-count.h \
kmer-plot.cpp kmer-plot.h \
stats.cpp stats.h \
fm-merge.cpp fm-merge.h \
gmap.h gmap.cpp \
Expand Down
208 changes: 208 additions & 0 deletions src/SGA/kmer-plot.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@

#include <kmer-count.h>
#include <iostream>
#include <stack>
#include <memory>
#include <getopt.h>
#include <BWT.h>
#include <BWTInterval.h>
#include <BWTAlgorithms.h>
#include "SeqReader.h"

//
// Getopt
//

#define SUBPROGRAM "kmer-plot"
static const char *KMERPLOT_VERSION_MESSAGE = SUBPROGRAM " Version " PACKAGE_VERSION "\n";
static const char *KMERPLOT_USAGE_MESSAGE =
"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] in.fa test1.bwt [test2.bwt]\n"
"Generate a table of the k-mers in src.bwt, and optionaly count the number of time they appears in testX.bwt.\n"
"Output on stdout the canonical kmers and their counts on forward and reverse strand\n"
"\n"
" --help display this help and exit\n"
" --version display program version\n"
" -k, --kmer-size=N The length of the kmer to use. (default: 27)\n"
" -d, --sample-rate=N use occurrence array sample rate of N in the FM-index. Higher values use significantly\n"
" less memory at the cost of higher runtime. This value must be a power of 2 (default: 128)\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";



namespace opt {
static std::vector<std::string> bwtFiles;
static int sampleRate = BWT::DEFAULT_SAMPLE_RATE_SMALL;
static int kmerLength = 27;
}

static const char* shortopts = "d:k:x:";
enum { OPT_HELP = 1, OPT_VERSION };
static const struct option longopts[] = {
{ "sample-rate", required_argument, NULL, 'd' },
{ "kmer-size", required_argument, NULL, 'k' },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
{ NULL, 0, NULL, 0 }
};


void parseKmerPlotOptions(int argc, char** argv) {

for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
{
std::istringstream arg(optarg != NULL ? optarg : "");
switch (c)
{
case 'd': arg >> opt::sampleRate; break;
case 'k': arg >> opt::kmerLength; break;
case OPT_HELP:
std::cout << KMERPLOT_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
case OPT_VERSION:
std::cout << KMERPLOT_VERSION_MESSAGE;
exit(EXIT_SUCCESS);
}
}

if(opt::kmerLength <= 0 || opt::kmerLength % 2 == 0)
{
std::cerr << SUBPROGRAM ": invalid kmer length: " << opt::kmerLength << ", must be greater than zero and odd\n";
std::cout << "\n" << KMERPLOT_USAGE_MESSAGE;
exit(EXIT_FAILURE);
}

for(;optind<argc;++optind) {
opt::bwtFiles.push_back(argv[optind]);
}

if (opt::bwtFiles.size() < 1)
{
std::cerr << SUBPROGRAM ": missing arguments\n";
std::cout << "\n" << KMERPLOT_USAGE_MESSAGE;
exit(EXIT_FAILURE);
}

}



//
// BWT Traversal algorithm
//

// Stack structure used in the depth first search of kmers
struct stack_elt_t
{
size_t str_sz;
char bp;
BWTInterval range;
stack_elt_t(size_t strsz,char bp):str_sz(strsz),bp(bp){}
stack_elt_t(size_t strsz,char bp,const BWTInterval& range):str_sz(strsz),bp(bp),range(range){}
};



// extract all canonical kmers of a bwt by performing a backward depth-first-search
void traverse_kmer(size_t n, const BWT** pBWTs, unsigned int k)
{
std::stack< stack_elt_t > stack;
std::string str; // string storing the current path

// Intitialize the search with root elements
for(size_t i = 0; i < DNA_ALPHABET_SIZE; ++i)
{
stack_elt_t e(str.size(),ALPHABET[i]);
BWTAlgorithms::initInterval(e.range,e.bp,pBWTs[0]);
if (e.range.isValid()) stack.push(e);
}

// Perform the kmer search
while(!stack.empty())
{
// pop an element from the stack, and update the string path accordingly
stack_elt_t top = stack.top();
stack.pop();
str.resize(top.str_sz);
str.push_back(top.bp);
if (str.length()>=k) {
// we found a kmer, retreive the number of occurence
std::string seq(reverse(str));
int64_t seq_count = top.range.size();

// look for the count of its reverse complement
std::string seq_rc(complement(str));
BWTInterval range_rc = BWTAlgorithms::findInterval(pBWTs[0],seq_rc);
int64_t seq_rc_count = range_rc.isValid()?range_rc.size():0;

// print the current kmer if canonical
if (seq<seq_rc) {
std::cout << seq << '\t' << seq_count << '\t' << seq_rc_count;
for(size_t i=1;i<n;i++) {
std::cout << '\t' << std::max<int64_t>(BWTAlgorithms::findInterval(pBWTs[i],seq).size(),0) << '\t' << std::max<int64_t>(BWTAlgorithms::findInterval(pBWTs[i],seq_rc).size(),0);
}
std::cout << std::endl;
} else if (seq_rc_count<=0) {
// the current kmer is not canonical, but the reverse complement doesn't exists
// so print it now as it will never be traversed by the searching algorithm
std::cout << seq_rc << '\t' << seq_rc_count << '\t' << seq_count;
for(size_t i=1;i<n;i++) {
std::cout << '\t' << std::max<int64_t>(BWTAlgorithms::findInterval(pBWTs[i],seq_rc).size(),0) << '\t' << std::max<int64_t>(BWTAlgorithms::findInterval(pBWTs[i],seq).size(),0);
}
std::cout << std::endl;
}
} else
{
// not yet a suffix of size k, push next candidates
for(size_t i = 0; i < DNA_ALPHABET_SIZE; ++i)
{
stack_elt_t e(str.size(),ALPHABET[i],top.range);
BWTAlgorithms::updateInterval(e.range,e.bp,pBWTs[0]);
if (e.range.isValid()) stack.push(e);
}
}
}
}




//
// Main
//

int kmerPlotMain(int argc, char** argv)
{
// parse command line arguments
parseKmerCountOptions(argc,argv);

// allocate BWT objects
const BWT** pBWTs = new const BWT*[opt::bwtFiles.size()];
for(size_t i=1;i<opt::bwtFiles.size();++i)
{
pBWTs[i] = new BWT(opt::bwtFiles[i],opt::sampleRate);
}

std::string filename = opt::bwtFiles[0];

SeqReader reader(filename, SRF_NO_VALIDATION);
SeqRecord record;
std::cerr << "Processing " << filename << std::endl;
while(reader.get(record)){
std::cout << record.name << record.seq << std::endl;
}

// run kmer search
//traverse_kmer(opt::bwtFiles.size(),pBWTs,opt::kmerLength);


// clean memory
for(size_t i=0;i<opt::bwtFiles.size();++i)
{
delete pBWTs[i];
}
delete[] pBWTs;
return 0;
}



8 changes: 8 additions & 0 deletions src/SGA/kmer-plot.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef KMERPLOT_H
#define KMERPLOT_H
#include "config.h"

int kmerPlotMain(int argc, char** argv);
void parseKmerPlotOptions(int argc, char** argv);

#endif
4 changes: 4 additions & 0 deletions src/SGA/sga.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "graph-concordance.h"
#include "somatic-variant-filters.h"
#include "kmer-count.h"
#include "kmer-plot.h"

#define PROGRAM_BIN "sga"
#define AUTHOR "Jared Simpson"
Expand Down Expand Up @@ -82,6 +83,7 @@ static const char *SGA_USAGE_MESSAGE =
" filterBAM filter out contaminating mate-pair data in a BAM file\n"
" cluster find clusters of reads belonging to the same connected component in an assembly graph\n"
" kmer-count extract all kmers from a BWT file\n"
" kmer-plot plot kmer counts for a given sequence\n"
//" connect resolve the complete sequence of a paired-end fragment\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";

Expand Down Expand Up @@ -166,6 +168,8 @@ int main(int argc, char** argv)
somaticVariantFiltersMain(argc - 1, argv + 1);
else if(command == "kmer-count")
kmerCountMain(argc - 1, argv + 1);
else if(command == "kmer-plot")
kmerPlotMain(argc - 1, argv + 1);
else
{
std::cerr << "Unrecognized command: " << command << "\n";
Expand Down

0 comments on commit 47611b6

Please sign in to comment.