diff --git a/src/nanopolish_polya_estimator.cpp b/src/nanopolish_polya_estimator.cpp index 7f76eaf6..3bb1e763 100644 --- a/src/nanopolish_polya_estimator.cpp +++ b/src/nanopolish_polya_estimator.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +63,8 @@ static const char *POLYA_USAGE_MESSAGE = " --version display version\n" " --help display this help and exit\n" " -w, --window=STR only compute the poly-A lengths for reads in window STR (format: ctg:start_id-end_id)\n" +" -c, --ci whether to calculate confidence intervals for read rate and poly-A length (off by default)\n" +" -n, --nboots=INT the number of bootstraps to use for computing 95% confidence intervals\n" " -r, --reads=FILE the 1D ONT direct RNA reads are in fasta FILE\n" " -b, --bam=FILE the reads aligned to the genome assembly are in bam FILE\n" " -g, --genome=FILE the reference genome assembly for the reads is in FILE\n" @@ -78,9 +81,11 @@ namespace opt static int progress = 0; static int num_threads = 1; static int batch_size = 128; + static unsigned int conf_invs; + static int n_boots = 100; } -static const char* shortopts = "r:b:g:t:w:v"; +static const char* shortopts = "r:b:g:w:cn:t:v"; enum { OPT_HELP = 1, OPT_VERSION }; @@ -90,6 +95,8 @@ static const struct option longopts[] = { { "bam", required_argument, NULL, 'b' }, { "genome", required_argument, NULL, 'g' }, { "window", required_argument, NULL, 'w' }, + { "ci", no_argument, NULL, 'c' }, + { "nboots", required_argument, NULL, 'n' }, { "threads", required_argument, NULL, 't' }, { "help", no_argument, NULL, OPT_HELP }, { "version", no_argument, NULL, OPT_VERSION }, @@ -109,6 +116,8 @@ void parse_polya_options(int argc, char** argv) case 't': arg >> opt::num_threads; break; case 'v': opt::verbose++; break; case 'w': arg >> opt::region; break; + case 'c': opt::conf_invs++; break; + case 'n': arg >> opt::n_boots; break; case OPT_HELP: std::cout << POLYA_USAGE_MESSAGE; exit(EXIT_SUCCESS); @@ -147,6 +156,11 @@ void parse_polya_options(int argc, char** argv) die = true; } + if((opt::n_boots > 10000)||(opt::n_boots < 1)) { + std::cerr << SUBPROGRAM ": --nboots should be between 1 and 10000\n"; + die = true; + } + if (die) { std::cout << "\n" << POLYA_USAGE_MESSAGE; @@ -201,6 +215,19 @@ struct ViterbiOutputs { std::vector labels; }; +// struct DurationRange containing 95% ci and median read rate +struct DurationRange { + double read_rate; // median read rate + double read_rate_lower; // lower bound on read rate + double read_rate_upper; // upper bound on read rate +}; + +struct PolyALengthRange { + double polya_length; + double polya_length_lower; + double polya_length_upper; +}; + class SegmentationHMM { private: // ----- state space parameters: @@ -560,13 +587,41 @@ double estimate_eventalign_duration_profile(SquiggleRead& sr, return read_rate; } + +std::vector get_bootstrapped_median_durations(std::vector &durations_per_kmer, + size_t n_boots) +{ + std::vector bootstrapped_medians; + bootstrapped_medians.reserve(n_boots); + + std::vector sample; + sample.resize(durations_per_kmer.size(), 0.0); + + std::mt19937 rng(durations_per_kmer.size()); + std::uniform_int_distribution gen(0, durations_per_kmer.size() - 1); + + for (size_t i=0; ik; @@ -587,15 +642,29 @@ double estimate_unaligned_duration_profile(const SquiggleRead& sr, } } - std::sort(durations_per_kmer.begin(), durations_per_kmer.end()); - assert(durations_per_kmer.size() > 0); - double median_duration = durations_per_kmer[durations_per_kmer.size() / 2]; + double median_duration, lower_duration, upper_duration; + if(conf_invs == 0) { + std::sort(durations_per_kmer.begin(), durations_per_kmer.end()); + median_duration = durations_per_kmer[durations_per_kmer.size() / 2]; + lower_duration = median_duration; + upper_duration = median_duration; + } else { + std::vector bootstrapped_durations = get_bootstrapped_median_durations(durations_per_kmer, n_boots); + std::sort(bootstrapped_durations.begin(), bootstrapped_durations.end()); + + median_duration = bootstrapped_durations[bootstrapped_durations.size() / 2]; + lower_duration = bootstrapped_durations[std::floor(bootstrapped_durations.size() * 0.025)]; + upper_duration = bootstrapped_durations[std::floor(bootstrapped_durations.size() * 0.975)]; + } // this is our estimator of read rate, currently we use the median duration // per k-mer as its more robust to outliers caused by stalls double read_rate = 1.0 / median_duration; + double read_rate_lower = 1.0 / upper_duration; + double read_rate_upper = 1.0 / lower_duration; - return read_rate; + DurationRange read_rate_range = { read_rate, read_rate_lower, read_rate_upper }; + return read_rate_range; } // fetch the raw event durations for a given read: @@ -635,7 +704,7 @@ std::vector fetch_event_durations(const SquiggleRead& sr, // * estimate_polya_length : return an estimate of the read rate for this read. // ================================================================================ // Compute an estimate of the number of nucleotides in the poly-A tail -double estimate_polya_length(const SquiggleRead& sr, const Segmentation& region_indices, const double read_rate) +PolyALengthRange estimate_polya_length(const SquiggleRead& sr, const Segmentation& region_indices, const DurationRange& read_rate_range) { // start and end times (sample indices) of the poly(A) tail, in original 3'->5' time-direction: // (n.b.: everything in 5'->3' order due to inversion in SquiggleRead constructor, but our @@ -653,12 +722,17 @@ double estimate_polya_length(const SquiggleRead& sr, const Segmentation& region_ double estimation_error_offset = -5; // length of the poly(A) tail, in nucleotides: - double polya_length = polya_duration * read_rate + estimation_error_offset; + double polya_length = polya_duration * read_rate_range.read_rate + estimation_error_offset; + double polya_length_lower = polya_duration * read_rate_range.read_rate_lower + estimation_error_offset; + double polya_length_upper = polya_duration * read_rate_range.read_rate_upper + estimation_error_offset; // ensure estimated length is non-negative: polya_length = std::max(0.0, polya_length); + polya_length_lower = std::max(0.0, polya_length_lower); + polya_length_upper = std::max(0.0, polya_length_upper); - return polya_length; + PolyALengthRange polya_length_range = { polya_length, polya_length_lower, polya_length_upper }; + return polya_length_range; } // ================================================================================ @@ -733,6 +807,8 @@ std::string post_estimation_qc(const Segmentation& region_indices, const Squiggl void estimate_polya_for_single_read(const ReadDB& read_db, const faidx_t* fai, FILE* out_fp, + size_t conf_invs, + size_t n_boots, const bam_hdr_t* hdr, const bam1_t* record, size_t read_idx, @@ -761,12 +837,17 @@ void estimate_polya_for_single_read(const ReadDB& read_db, SquiggleRead sr(read_name, read_db, SRF_LOAD_RAW_SAMPLES); if (sr.fast5_path == "" || sr.events[0].empty()) { #pragma omp critical - { - fprintf(out_fp, "%s\t%s\t%zu\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\tREAD_FAILED_LOAD\n", - read_name.c_str(), ref_name.c_str(), record->core.pos); + { + if (conf_invs == 0) { + fprintf(out_fp, "%s\t%s\t%zu\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\tREAD_FAILED_LOAD\n", + read_name.c_str(), ref_name.c_str(), record->core.pos); + } else { + fprintf(out_fp, "%s\t%s\t%zu\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\t-1.00\t-1.00\t-1.00\t-1.00\tREAD_FAILED_LOAD\n", + read_name.c_str(), ref_name.c_str(), record->core.pos); + } if (opt::verbose == 1) { fprintf(out_fp, - "polya-samples\t%s\t%s\t-1\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\tREAD_FAILED_LOAD\n", + "polya-samples\t%s\t%s\t-1\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\t-1.00\t-1.00\tREAD_FAILED_LOAD\n", read_name.substr(0,6).c_str(), ref_name.c_str()); } if (opt::verbose == 2) { @@ -794,11 +875,13 @@ void estimate_polya_for_single_read(const ReadDB& read_db, std::string post_segmentation_qc_flag = post_segmentation_qc(region_indices, sr); //----- compute duration profile for the read: - double read_rate = estimate_unaligned_duration_profile(sr, fai, hdr, record, read_idx, strand_idx); + DurationRange read_rate_range = estimate_unaligned_duration_profile(sr, fai, hdr, + record, read_idx, strand_idx, + conf_invs, n_boots); //----- estimate number of nucleotides in poly-A tail & post-estimation QC: - double polya_length = estimate_polya_length(sr, region_indices, read_rate); - std::string post_estimation_qc_flag = post_estimation_qc(region_indices, sr, read_rate, polya_length); + PolyALengthRange polya_length_range = estimate_polya_length(sr, region_indices, read_rate_range); + std::string post_estimation_qc_flag = post_estimation_qc(region_indices, sr, read_rate_range.read_rate, polya_length_range.polya_length); //----- Resolve QC flag based on priority: std::string qc_tag; @@ -818,12 +901,23 @@ void estimate_polya_for_single_read(const ReadDB& read_db, double polya_sample_start = region_indices.adapter+1; double polya_sample_end = region_indices.polya; double transcr_sample_start = region_indices.polya+1; + #pragma omp critical { - fprintf(out_fp, "%s\t%s\t%zu\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.2lf\t%.2lf\t%s\n", - read_name.c_str(), ref_name.c_str(), record->core.pos, - leader_sample_start, adapter_sample_start, polya_sample_start, - transcr_sample_start, read_rate, polya_length, qc_tag.c_str()); + if (conf_invs == 0) { + fprintf(out_fp, "%s\t%s\t%zu\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.2lf\t%.2lf\t%s\n", + read_name.c_str(), ref_name.c_str(), record->core.pos, + leader_sample_start, adapter_sample_start, polya_sample_start, + transcr_sample_start, read_rate_range.read_rate, polya_length_range.polya_length, qc_tag.c_str()); + } else { + fprintf(out_fp, "%s\t%s\t%zu\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%s\n", + read_name.c_str(), ref_name.c_str(), record->core.pos, + leader_sample_start, adapter_sample_start, polya_sample_start, transcr_sample_start, + read_rate_range.read_rate, read_rate_range.read_rate_lower, read_rate_range.read_rate_upper, + polya_length_range.polya_length, + polya_length_range.polya_length_lower, polya_length_range.polya_length_upper, + qc_tag.c_str()); + } // if `verbose == 1`, print the samples (picoAmps) of the read, // up to the first 1000 samples of transcript region: if (opt::verbose == 1) { @@ -874,12 +968,16 @@ int polya_main(int argc, char** argv) faidx_t *fai = fai_load(opt::genome_file.c_str()); // print header line: - fprintf(stdout, "readname\tcontig\tposition\tleader_start\tadapter_start\tpolya_start\ttranscript_start\tread_rate\tpolya_length\tqc_tag\n"); + if (opt::conf_invs == 0) { + fprintf(stdout, "readname\tcontig\tposition\tleader_start\tadapter_start\tpolya_start\ttranscript_start\tread_rate\tpolya_length\tqc_tag\n"); + } else { + fprintf(stdout, "readname\tcontig\tposition\tleader_start\tadapter_start\tpolya_start\ttranscript_start\tread_rate\tread_rate_ci_lower\tread_rate_ci_upper\tpolya_length\tpolya_length_ci_lower\tpolya_length_ci_upper\tqc_tag\n"); + } // the BamProcessor framework calls the input function with the // bam record, read index, etc passed as parameters // bind the other parameters the worker function needs here - auto f = std::bind(estimate_polya_for_single_read, std::ref(read_db), std::ref(fai), stdout, _1, _2, _3, _4, _5); + auto f = std::bind(estimate_polya_for_single_read, std::ref(read_db), std::ref(fai), stdout, opt::conf_invs, opt::n_boots, _1, _2, _3, _4, _5); BamProcessor processor(opt::bam_file, opt::region, opt::num_threads); processor.parallel_run(f);