Skip to content

Commit

Permalink
Allow aggregating gene-level estimates by an arbitrary key
Browse files Browse the repository at this point in the history
This commit adds the txpAggregationKey option, to allow the
aggregation of transcripts by a feature of the provided name
(instead of always being based on gene_id).
  • Loading branch information
rob-p committed Sep 9, 2015
1 parent 5172146 commit 9a9dedc
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 2 deletions.
1 change: 1 addition & 0 deletions include/SailfishUtils.hpp
Expand Up @@ -101,6 +101,7 @@ namespace sailfish{
// not exist!
void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
boost::filesystem::path& estDir,
std::string aggKey,
bool haveBiasCorrectedFile = false);

enum class OrphanStatus: uint8_t { LeftOrphan = 0, RightOrphan = 1, Paired = 2 };
Expand Down
8 changes: 7 additions & 1 deletion src/SailfishQuantify.cpp
Expand Up @@ -471,6 +471,7 @@ int mainQuantify(int argc, char* argv[]) {
vector<string> unmatedReadFiles;
vector<string> mate1ReadFiles;
vector<string> mate2ReadFiles;
string txpAggregationKey;

po::options_description generic("\n"
"basic options");
Expand Down Expand Up @@ -509,10 +510,14 @@ int mainQuantify(int argc, char* argv[]) {
("fldMax" , po::value<size_t>(&(sopt.fragLenDistMax))->default_value(800), "The maximum fragment length to consider when building the empirical "
"distribution")
*/
("txpAggregationKey", po::value<std::string>(&txpAggregationKey)->default_value("gene_id"), "When generating the gene-level estimates, "
"use the provided key for aggregating transcripts. The default is the \"gene_id\" field, but other fields (e.g. \"gene_name\") might "
"be useful depending on the specifics of the annotation being used. Note: this option only affects aggregation when using a "
"GTF annotation; not an annotation in \"simple\" format.")
("fldMean", po::value<size_t>(&(sopt.fragLenDistPriorMean))->default_value(200),
"If single end reads are being used for quantification, or there are an insufficient "
"number of uniquely mapping reads when performing paired-end quantification to estimate "
"the empirical fragment length distribution, then use this value to calculate effective lengths")
"the empirical fragment length distribution, then use this value to calculate effective lengths.")
("fldSD" , po::value<size_t>(&(sopt.fragLenDistPriorSD))->default_value(80),
"The standard deviation used in the fragment length distribution for single-end quantification or "
"when an empirical distribution cannot be learned.")
Expand Down Expand Up @@ -710,6 +715,7 @@ int mainQuantify(int argc, char* argv[]) {
try {
sailfish::utils::generateGeneLevelEstimates(geneMapPath,
outputDirectory,
txpAggregationKey,
biasCorrect);
} catch (std::invalid_argument& e) {
fmt::print(stderr, "Error: [{}] when trying to compute gene-level "\
Expand Down
3 changes: 2 additions & 1 deletion src/SailfishUtils.cpp
Expand Up @@ -586,6 +586,7 @@ namespace sailfish {

void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
boost::filesystem::path& estDir,
std::string aggKey,
bool haveBiasCorrectedFile) {
namespace bfs = boost::filesystem;
std::cerr << "Computing gene-level abundance estimates\n";
Expand All @@ -596,7 +597,7 @@ namespace sailfish {
// parse the map as a GTF file
if (extension == gtfExtension) {
// Using libgff
tranGeneMap = sailfish::utils::transcriptGeneMapFromGTF(geneMapPath.string(), "gene_id");
tranGeneMap = sailfish::utils::transcriptGeneMapFromGTF(geneMapPath.string(), aggKey);
} else { // parse the map as a simple format files
std::ifstream tgfile(geneMapPath.string());
tranGeneMap = sailfish::utils::readTranscriptToGeneMap(tgfile);
Expand Down

0 comments on commit 9a9dedc

Please sign in to comment.