Add confidence intervals to polyA estimator #802

Open · wants to merge 7 commits into master
src/nanopolish_polya_estimator.cpp: 152 changes (125 additions, 27 deletions)
@@ -14,6 +14,7 @@
#include <inttypes.h>
#include <assert.h>
#include <math.h>
#include <random>
#include <sys/time.h>
#include <algorithm>
#include <sstream>
@@ -62,6 +63,8 @@ static const char *POLYA_USAGE_MESSAGE =
" --version display version\n"
" --help display this help and exit\n"
" -w, --window=STR only compute the poly-A lengths for reads in window STR (format: ctg:start_id-end_id)\n"
" -c, --ci whether to calculate confidence intervals for read rate and poly-A length (off by default)\n"
" -n, --nboots=INT the number of bootstraps to use for computing 95% confidence intervals\n"
" -r, --reads=FILE the 1D ONT direct RNA reads are in fasta FILE\n"
" -b, --bam=FILE the reads aligned to the genome assembly are in bam FILE\n"
" -g, --genome=FILE the reference genome assembly for the reads is in FILE\n"
@@ -78,9 +81,11 @@ namespace opt
static int progress = 0;
static int num_threads = 1;
static int batch_size = 128;
static unsigned int conf_invs;
static int n_boots = 100;
}

static const char* shortopts = "r:b:g:t:w:v";
static const char* shortopts = "r:b:g:w:cn:t:v";

enum { OPT_HELP = 1, OPT_VERSION };

@@ -90,6 +95,8 @@ static const struct option longopts[] = {
{ "bam", required_argument, NULL, 'b' },
{ "genome", required_argument, NULL, 'g' },
{ "window", required_argument, NULL, 'w' },
{ "ci", no_argument, NULL, 'c' },
{ "nboots", required_argument, NULL, 'n' },
{ "threads", required_argument, NULL, 't' },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
@@ -109,6 +116,8 @@ void parse_polya_options(int argc, char** argv)
case 't': arg >> opt::num_threads; break;
case 'v': opt::verbose++; break;
case 'w': arg >> opt::region; break;
case 'c': opt::conf_invs++; break;
case 'n': arg >> opt::n_boots; break;
case OPT_HELP:
std::cout << POLYA_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
@@ -147,6 +156,11 @@
die = true;
}

if((opt::n_boots > 10000)||(opt::n_boots < 1)) {
std::cerr << SUBPROGRAM ": --nboots should be between 1 and 10000\n";
die = true;
}

if (die)
{
std::cout << "\n" << POLYA_USAGE_MESSAGE;
@@ -201,6 +215,19 @@ struct ViterbiOutputs {
std::vector<HMMState> labels;
};

// DurationRange: median read rate plus the lower/upper bounds of its 95% confidence interval
struct DurationRange {
double read_rate; // median read rate
double read_rate_lower; // lower bound on read rate
double read_rate_upper; // upper bound on read rate
};

struct PolyALengthRange {
double polya_length; // estimated poly-A tail length (nucleotides)
double polya_length_lower; // lower bound of the 95% CI
double polya_length_upper; // upper bound of the 95% CI
};

class SegmentationHMM {
private:
// ----- state space parameters:
@@ -560,13 +587,41 @@ double estimate_eventalign_duration_profile(SquiggleRead& sr,
return read_rate;
}


std::vector<double> get_bootstrapped_median_durations(std::vector<double> &durations_per_kmer,
size_t n_boots)
{
std::vector<double> bootstrapped_medians;
bootstrapped_medians.reserve(n_boots);

std::vector<double> sample;
sample.resize(durations_per_kmer.size(), 0.0);

std::mt19937 rng(durations_per_kmer.size()); // deterministic seed (the kmer count), so bootstrap CIs are reproducible for a given read
std::uniform_int_distribution<int> gen(0, durations_per_kmer.size() - 1);

for (size_t i=0; i<n_boots; i++) {
for (size_t j=0; j<durations_per_kmer.size(); j++) {
sample[j] = durations_per_kmer[gen(rng)];
}
std::sort(sample.begin(), sample.end());
bootstrapped_medians.push_back(
sample[sample.size() / 2]
);
}
return bootstrapped_medians;
}


// compute a read-rate based on kmer-to-event mapping, collapsed by consecutive 5mer identity:
double estimate_unaligned_duration_profile(const SquiggleRead& sr,
const faidx_t* fai,
const bam_hdr_t* hdr,
const bam1_t* record,
const size_t read_idx,
const size_t strand_idx)
DurationRange estimate_unaligned_duration_profile(const SquiggleRead& sr,
const faidx_t* fai,
const bam_hdr_t* hdr,
const bam1_t* record,
const size_t read_idx,
const size_t strand_idx,
const size_t conf_invs,
const size_t n_boots)
{
// get kmer stats:
size_t basecalled_k = sr.get_base_model(strand_idx)->k;
@@ -587,15 +642,29 @@ double estimate_unaligned_duration_profile(const SquiggleRead& sr,
}
}

std::sort(durations_per_kmer.begin(), durations_per_kmer.end());
assert(durations_per_kmer.size() > 0);
double median_duration = durations_per_kmer[durations_per_kmer.size() / 2];
double median_duration, lower_duration, upper_duration;
if(conf_invs == 0) {
std::sort(durations_per_kmer.begin(), durations_per_kmer.end());
median_duration = durations_per_kmer[durations_per_kmer.size() / 2];
lower_duration = median_duration;
upper_duration = median_duration;
} else {
std::vector<double> bootstrapped_durations = get_bootstrapped_median_durations(durations_per_kmer, n_boots);
std::sort(bootstrapped_durations.begin(), bootstrapped_durations.end());

median_duration = bootstrapped_durations[bootstrapped_durations.size() / 2];
lower_duration = bootstrapped_durations[std::floor(bootstrapped_durations.size() * 0.025)];
upper_duration = bootstrapped_durations[std::floor(bootstrapped_durations.size() * 0.975)];
}

// this is our estimator of read rate; currently we use the median duration
// per k-mer as it's more robust to outliers caused by stalls
double read_rate = 1.0 / median_duration;
double read_rate_lower = 1.0 / upper_duration;
double read_rate_upper = 1.0 / lower_duration;

return read_rate;
DurationRange read_rate_range = { read_rate, read_rate_lower, read_rate_upper };
return read_rate_range;
}
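
The interval computed above is a percentile bootstrap: the per-kmer durations are resampled with replacement, the median of each resample is recorded, and the 2.5th and 97.5th percentiles of those bootstrapped medians give the 95% CI on the median duration. A minimal standalone sketch of the same idea follows; it is not part of the patch, and the names are illustrative.

#include <algorithm>
#include <cmath>
#include <random>
#include <utility>
#include <vector>

// Percentile-bootstrap 95% CI for the median of `data` (illustrative helper only;
// mirrors the resample-then-take-median loop in get_bootstrapped_median_durations).
std::pair<double, double> median_ci95(const std::vector<double>& data,
                                      size_t n_boots, unsigned seed)
{
    std::mt19937 rng(seed);
    std::uniform_int_distribution<size_t> pick(0, data.size() - 1);

    std::vector<double> medians;
    medians.reserve(n_boots);
    std::vector<double> sample(data.size());

    for (size_t i = 0; i < n_boots; ++i) {
        for (size_t j = 0; j < sample.size(); ++j) {
            sample[j] = data[pick(rng)];               // draw with replacement
        }
        std::sort(sample.begin(), sample.end());
        medians.push_back(sample[sample.size() / 2]);  // median of this resample
    }
    std::sort(medians.begin(), medians.end());
    return { medians[static_cast<size_t>(std::floor(n_boots * 0.025))],   // 2.5th percentile
             medians[static_cast<size_t>(std::floor(n_boots * 0.975))] }; // 97.5th percentile
}

Because read rate is the reciprocal of duration, the lower read-rate bound comes from the upper duration bound and vice versa, which is why the patch swaps them when building DurationRange.
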

// fetch the raw event durations for a given read:
@@ -635,7 +704,7 @@ std::vector<double> fetch_event_durations(const SquiggleRead& sr,
// * estimate_polya_length : return an estimate of the poly-A tail length for this read.
// ================================================================================
// Compute an estimate of the number of nucleotides in the poly-A tail
double estimate_polya_length(const SquiggleRead& sr, const Segmentation& region_indices, const double read_rate)
PolyALengthRange estimate_polya_length(const SquiggleRead& sr, const Segmentation& region_indices, const DurationRange& read_rate_range)
{
// start and end times (sample indices) of the poly(A) tail, in original 3'->5' time-direction:
// (n.b.: everything in 5'->3' order due to inversion in SquiggleRead constructor, but our
@@ -653,12 +722,17 @@ double estimate_polya_length(const SquiggleRead& sr, const Segmentation& region_
double estimation_error_offset = -5;

// length of the poly(A) tail, in nucleotides:
double polya_length = polya_duration * read_rate + estimation_error_offset;
double polya_length = polya_duration * read_rate_range.read_rate + estimation_error_offset;
double polya_length_lower = polya_duration * read_rate_range.read_rate_lower + estimation_error_offset;
double polya_length_upper = polya_duration * read_rate_range.read_rate_upper + estimation_error_offset;

// ensure estimated length is non-negative:
polya_length = std::max(0.0, polya_length);
polya_length_lower = std::max(0.0, polya_length_lower);
polya_length_upper = std::max(0.0, polya_length_upper);

return polya_length;
PolyALengthRange polya_length_range = { polya_length, polya_length_lower, polya_length_upper };
return polya_length_range;
}

// ================================================================================
@@ -733,6 +807,8 @@ std::string post_estimation_qc(const Segmentation& region_indices, const Squiggl
void estimate_polya_for_single_read(const ReadDB& read_db,
const faidx_t* fai,
FILE* out_fp,
size_t conf_invs,
size_t n_boots,
const bam_hdr_t* hdr,
const bam1_t* record,
size_t read_idx,
@@ -761,12 +837,17 @@ void estimate_polya_for_single_read(const ReadDB& read_db,
SquiggleRead sr(read_name, read_db, SRF_LOAD_RAW_SAMPLES);
if (sr.fast5_path == "" || sr.events[0].empty()) {
#pragma omp critical
{
fprintf(out_fp, "%s\t%s\t%zu\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\tREAD_FAILED_LOAD\n",
read_name.c_str(), ref_name.c_str(), record->core.pos);
{
if (conf_invs == 0) {
fprintf(out_fp, "%s\t%s\t%zu\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\tREAD_FAILED_LOAD\n",
read_name.c_str(), ref_name.c_str(), record->core.pos);
} else {
fprintf(out_fp, "%s\t%s\t%zu\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\t-1.00\t-1.00\t-1.00\t-1.00\tREAD_FAILED_LOAD\n",
read_name.c_str(), ref_name.c_str(), record->core.pos);
}
if (opt::verbose == 1) {
fprintf(out_fp,
"polya-samples\t%s\t%s\t-1\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\tREAD_FAILED_LOAD\n",
"polya-samples\t%s\t%s\t-1\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\t-1.00\t-1.00\tREAD_FAILED_LOAD\n",
read_name.substr(0,6).c_str(), ref_name.c_str());
}
if (opt::verbose == 2) {
@@ -794,11 +875,13 @@ void estimate_polya_for_single_read(const ReadDB& read_db,
std::string post_segmentation_qc_flag = post_segmentation_qc(region_indices, sr);

//----- compute duration profile for the read:
double read_rate = estimate_unaligned_duration_profile(sr, fai, hdr, record, read_idx, strand_idx);
DurationRange read_rate_range = estimate_unaligned_duration_profile(sr, fai, hdr,
record, read_idx, strand_idx,
conf_invs, n_boots);

//----- estimate number of nucleotides in poly-A tail & post-estimation QC:
double polya_length = estimate_polya_length(sr, region_indices, read_rate);
std::string post_estimation_qc_flag = post_estimation_qc(region_indices, sr, read_rate, polya_length);
PolyALengthRange polya_length_range = estimate_polya_length(sr, region_indices, read_rate_range);
std::string post_estimation_qc_flag = post_estimation_qc(region_indices, sr, read_rate_range.read_rate, polya_length_range.polya_length);

//----- Resolve QC flag based on priority:
std::string qc_tag;
@@ -818,12 +901,23 @@ double polya_sample_start = region_indices.adapter+1;
double polya_sample_start = region_indices.adapter+1;
double polya_sample_end = region_indices.polya;
double transcr_sample_start = region_indices.polya+1;

#pragma omp critical
{
fprintf(out_fp, "%s\t%s\t%zu\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.2lf\t%.2lf\t%s\n",
read_name.c_str(), ref_name.c_str(), record->core.pos,
leader_sample_start, adapter_sample_start, polya_sample_start,
transcr_sample_start, read_rate, polya_length, qc_tag.c_str());
if (conf_invs == 0) {
fprintf(out_fp, "%s\t%s\t%zu\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.2lf\t%.2lf\t%s\n",
read_name.c_str(), ref_name.c_str(), record->core.pos,
leader_sample_start, adapter_sample_start, polya_sample_start,
transcr_sample_start, read_rate_range.read_rate, polya_length_range.polya_length, qc_tag.c_str());
} else {
fprintf(out_fp, "%s\t%s\t%zu\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%s\n",
read_name.c_str(), ref_name.c_str(), record->core.pos,
leader_sample_start, adapter_sample_start, polya_sample_start, transcr_sample_start,
read_rate_range.read_rate, read_rate_range.read_rate_lower, read_rate_range.read_rate_upper,
polya_length_range.polya_length,
polya_length_range.polya_length_lower, polya_length_range.polya_length_upper,
qc_tag.c_str());
}
// if `verbose == 1`, print the samples (picoAmps) of the read,
// up to the first 1000 samples of transcript region:
if (opt::verbose == 1) {
@@ -874,12 +968,16 @@ int polya_main(int argc, char** argv)
faidx_t *fai = fai_load(opt::genome_file.c_str());

// print header line:
fprintf(stdout, "readname\tcontig\tposition\tleader_start\tadapter_start\tpolya_start\ttranscript_start\tread_rate\tpolya_length\tqc_tag\n");
if (opt::conf_invs == 0) {
fprintf(stdout, "readname\tcontig\tposition\tleader_start\tadapter_start\tpolya_start\ttranscript_start\tread_rate\tpolya_length\tqc_tag\n");
} else {
fprintf(stdout, "readname\tcontig\tposition\tleader_start\tadapter_start\tpolya_start\ttranscript_start\tread_rate\tread_rate_ci_lower\tread_rate_ci_upper\tpolya_length\tpolya_length_ci_lower\tpolya_length_ci_upper\tqc_tag\n");
}

// the BamProcessor framework calls the input function with the
// bam record, read index, etc passed as parameters
// bind the other parameters the worker function needs here
auto f = std::bind(estimate_polya_for_single_read, std::ref(read_db), std::ref(fai), stdout, _1, _2, _3, _4, _5);
auto f = std::bind(estimate_polya_for_single_read, std::ref(read_db), std::ref(fai), stdout, opt::conf_invs, opt::n_boots, _1, _2, _3, _4, _5);
BamProcessor processor(opt::bam_file, opt::region, opt::num_threads);
processor.parallel_run(f);

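
With the patch applied, a run with confidence intervals enabled would look something like the following (file names are placeholders):

nanopolish polya --ci --nboots 200 --reads reads.fa --bam aligned.sorted.bam --genome transcripts.fa --threads 8 > polya_with_ci.tsv

When --ci is given, the TSV written to stdout carries four extra columns (read_rate_ci_lower, read_rate_ci_upper, polya_length_ci_lower, polya_length_ci_upper) alongside read_rate and polya_length, matching the header printed in polya_main; without --ci the original ten-column output is unchanged.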