Add confidence intervals to polyA estimator #802

Open · wants to merge 7 commits into master
src/nanopolish_polya_estimator.cpp: 152 changes (125 additions, 27 deletions)
@@ -14,6 +14,7 @@
#include <inttypes.h>
#include <assert.h>
#include <math.h>
#include <random>
#include <sys/time.h>
#include <algorithm>
#include <sstream>
@@ -62,6 +63,8 @@ static const char *POLYA_USAGE_MESSAGE =
" --version display version\n"
" --help display this help and exit\n"
" -w, --window=STR only compute the poly-A lengths for reads in window STR (format: ctg:start_id-end_id)\n"
" -c, --ci whether to calculate confidence intervals for read rate and poly-A length (off by default)\n"
" -n, --nboots=INT the number of bootstraps to use for computing 95% confidence intervals\n"
" -r, --reads=FILE the 1D ONT direct RNA reads are in fasta FILE\n"
" -b, --bam=FILE the reads aligned to the genome assembly are in bam FILE\n"
" -g, --genome=FILE the reference genome assembly for the reads is in FILE\n"
@@ -78,9 +81,11 @@ namespace opt
static int progress = 0;
static int num_threads = 1;
static int batch_size = 128;
static unsigned int conf_invs;
static int n_boots = 100;
}

static const char* shortopts = "r:b:g:t:w:v";
static const char* shortopts = "r:b:g:w:cn:t:v";

enum { OPT_HELP = 1, OPT_VERSION };

@@ -90,6 +95,8 @@ static const struct option longopts[] = {
{ "bam", required_argument, NULL, 'b' },
{ "genome", required_argument, NULL, 'g' },
{ "window", required_argument, NULL, 'w' },
{ "ci", no_argument, NULL, 'c' },
{ "nboots", required_argument, NULL, 'n' },
{ "threads", required_argument, NULL, 't' },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
@@ -109,6 +116,8 @@ void parse_polya_options(int argc, char** argv)
case 't': arg >> opt::num_threads; break;
case 'v': opt::verbose++; break;
case 'w': arg >> opt::region; break;
case 'c': opt::conf_invs++; break;
case 'n': arg >> opt::n_boots; break;
case OPT_HELP:
std::cout << POLYA_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
@@ -147,6 +156,11 @@
die = true;
}

if((opt::n_boots > 10000)||(opt::n_boots < 1)) {
std::cerr << SUBPROGRAM ": --nboots should be between 1 and 10000\n";
die = true;
}

if (die)
{
std::cout << "\n" << POLYA_USAGE_MESSAGE;
@@ -201,6 +215,19 @@ struct ViterbiOutputs {
std::vector<HMMState> labels;
};

// DurationRange: median read rate plus the lower/upper bounds of its 95% confidence interval
struct DurationRange {
double read_rate; // median read rate
double read_rate_lower; // lower bound on read rate
double read_rate_upper; // upper bound on read rate
};

struct PolyALengthRange {
double polya_length; // estimated poly-A tail length (nucleotides)
double polya_length_lower; // lower bound of the 95% CI
double polya_length_upper; // upper bound of the 95% CI
};

class SegmentationHMM {
private:
// ----- state space parameters:
@@ -560,13 +587,41 @@ double estimate_eventalign_duration_profile(SquiggleRead& sr,
return read_rate;
}


std::vector<double> get_bootstrapped_median_durations(std::vector<double> &durations_per_kmer,
size_t n_boots)
{
std::vector<double> bootstrapped_medians;
bootstrapped_medians.reserve(n_boots);

std::vector<double> sample;
sample.resize(durations_per_kmer.size(), 0.0);

std::mt19937 rng(durations_per_kmer.size()); // deterministic seed (the kmer count), so bootstrap CIs are reproducible for a given read
std::uniform_int_distribution<int> gen(0, durations_per_kmer.size() - 1);

for (size_t i=0; i<n_boots; i++) {
for (size_t j=0; j<durations_per_kmer.size(); j++) {
sample[j] = durations_per_kmer[gen(rng)];
}
std::sort(sample.begin(), sample.end());
bootstrapped_medians.push_back(
sample[sample.size() / 2]
);
}
return bootstrapped_medians;
}


// compute a read-rate based on kmer-to-event mapping, collapsed by consecutive 5mer identity:
double estimate_unaligned_duration_profile(const SquiggleRead& sr,
const faidx_t* fai,
const bam_hdr_t* hdr,
const bam1_t* record,
const size_t read_idx,
const size_t strand_idx)
DurationRange estimate_unaligned_duration_profile(const SquiggleRead& sr,
const faidx_t* fai,
const bam_hdr_t* hdr,
const bam1_t* record,
const size_t read_idx,
const size_t strand_idx,
const size_t conf_invs,
const size_t n_boots)
{
// get kmer stats:
size_t basecalled_k = sr.get_base_model(strand_idx)->k;
@@ -587,15 +642,29 @@ double estimate_unaligned_duration_profile(const SquiggleRead& sr,
}
}

std::sort(durations_per_kmer.begin(), durations_per_kmer.end());
assert(durations_per_kmer.size() > 0);
double median_duration = durations_per_kmer[durations_per_kmer.size() / 2];
double median_duration, lower_duration, upper_duration;
if(conf_invs == 0) {
std::sort(durations_per_kmer.begin(), durations_per_kmer.end());
median_duration = durations_per_kmer[durations_per_kmer.size() / 2];
lower_duration = median_duration;
upper_duration = median_duration;
} else {
std::vector<double> bootstrapped_durations = get_bootstrapped_median_durations(durations_per_kmer, n_boots);
std::sort(bootstrapped_durations.begin(), bootstrapped_durations.end());

median_duration = bootstrapped_durations[bootstrapped_durations.size() / 2];
lower_duration = bootstrapped_durations[std::floor(bootstrapped_durations.size() * 0.025)];
upper_duration = bootstrapped_durations[std::floor(bootstrapped_durations.size() * 0.975)];
}

// this is our estimator of read rate; currently we use the median duration
// per k-mer as it's more robust to outliers caused by stalls
double read_rate = 1.0 / median_duration;
double read_rate_lower = 1.0 / upper_duration;
double read_rate_upper = 1.0 / lower_duration;

return read_rate;
DurationRange read_rate_range = { read_rate, read_rate_lower, read_rate_upper };
return read_rate_range;
}
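
The interval computed above is a percentile bootstrap: the per-kmer durations are resampled with replacement, the median of each resample is recorded, and the 2.5th and 97.5th percentiles of those bootstrapped medians give the 95% CI on the median duration. A minimal standalone sketch of the same idea follows; it is not part of the patch, and the names are illustrative.

#include <algorithm>
#include <cmath>
#include <random>
#include <utility>
#include <vector>

// Percentile-bootstrap 95% CI for the median of `data` (illustrative helper only;
// mirrors the resample-then-take-median loop in get_bootstrapped_median_durations).
std::pair<double, double> median_ci95(const std::vector<double>& data,
                                      size_t n_boots, unsigned seed)
{
    std::mt19937 rng(seed);
    std::uniform_int_distribution<size_t> pick(0, data.size() - 1);

    std::vector<double> medians;
    medians.reserve(n_boots);
    std::vector<double> sample(data.size());

    for (size_t i = 0; i < n_boots; ++i) {
        for (size_t j = 0; j < sample.size(); ++j) {
            sample[j] = data[pick(rng)];               // draw with replacement
        }
        std::sort(sample.begin(), sample.end());
        medians.push_back(sample[sample.size() / 2]);  // median of this resample
    }
    std::sort(medians.begin(), medians.end());
    return { medians[static_cast<size_t>(std::floor(n_boots * 0.025))],   // 2.5th percentile
             medians[static_cast<size_t>(std::floor(n_boots * 0.975))] }; // 97.5th percentile
}

Because read rate is the reciprocal of duration, the lower read-rate bound comes from the upper duration bound and vice versa, which is why the patch swaps them when building DurationRange.
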

// fetch the raw event durations for a given read:
@@ -635,7 +704,7 @@ std::vector<double> fetch_event_durations(const SquiggleRead& sr,
// * estimate_polya_length : return an estimate of the poly-A tail length for this read.
// ================================================================================
// Compute an estimate of the number of nucleotides in the poly-A tail
double estimate_polya_length(const SquiggleRead& sr, const Segmentation& region_indices, const double read_rate)
PolyALengthRange estimate_polya_length(const SquiggleRead& sr, const Segmentation& region_indices, const DurationRange& read_rate_range)
{
// start and end times (sample indices) of the poly(A) tail, in original 3'->5' time-direction:
// (n.b.: everything in 5'->3' order due to inversion in SquiggleRead constructor, but our
@@ -653,12 +722,17 @@ double estimate_polya_length(const SquiggleRead& sr, const Segmentation& region_
double estimation_error_offset = -5;

// length of the poly(A) tail, in nucleotides:
double polya_length = polya_duration * read_rate + estimation_error_offset;
double polya_length = polya_duration * read_rate_range.read_rate + estimation_error_offset;
double polya_length_lower = polya_duration * read_rate_range.read_rate_lower + estimation_error_offset;
double polya_length_upper = polya_duration * read_rate_range.read_rate_upper + estimation_error_offset;

// ensure estimated length is non-negative:
polya_length = std::max(0.0, polya_length);
polya_length_lower = std::max(0.0, polya_length_lower);
polya_length_upper = std::max(0.0, polya_length_upper);

return polya_length;
PolyALengthRange polya_length_range = { polya_length, polya_length_lower, polya_length_upper };
return polya_length_range;
}

// ================================================================================
@@ -733,6 +807,8 @@ std::string post_estimation_qc(const Segmentation& region_indices, const Squiggl
void estimate_polya_for_single_read(const ReadDB& read_db,
const faidx_t* fai,
FILE* out_fp,
size_t conf_invs,
size_t n_boots,
const bam_hdr_t* hdr,
const bam1_t* record,
size_t read_idx,
@@ -761,12 +837,17 @@ void estimate_polya_for_single_read(const ReadDB& read_db,
SquiggleRead sr(read_name, read_db, SRF_LOAD_RAW_SAMPLES);
if (sr.fast5_path == "" || sr.events[0].empty()) {
#pragma omp critical
{
fprintf(out_fp, "%s\t%s\t%zu\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\tREAD_FAILED_LOAD\n",
read_name.c_str(), ref_name.c_str(), record->core.pos);
{
if (conf_invs == 0) {
fprintf(out_fp, "%s\t%s\t%zu\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\tREAD_FAILED_LOAD\n",
read_name.c_str(), ref_name.c_str(), record->core.pos);
} else {
fprintf(out_fp, "%s\t%s\t%zu\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\t-1.00\t-1.00\t-1.00\t-1.00\tREAD_FAILED_LOAD\n",
read_name.c_str(), ref_name.c_str(), record->core.pos);
}
if (opt::verbose == 1) {
fprintf(out_fp,
"polya-samples\t%s\t%s\t-1\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\tREAD_FAILED_LOAD\n",
"polya-samples\t%s\t%s\t-1\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.0\t-1.00\t-1.00\t-1.00\t-1.00\tREAD_FAILED_LOAD\n",
read_name.substr(0,6).c_str(), ref_name.c_str());
}
if (opt::verbose == 2) {
@@ -794,11 +875,13 @@ void estimate_polya_for_single_read(const ReadDB& read_db,
std::string post_segmentation_qc_flag = post_segmentation_qc(region_indices, sr);

//----- compute duration profile for the read:
double read_rate = estimate_unaligned_duration_profile(sr, fai, hdr, record, read_idx, strand_idx);
DurationRange read_rate_range = estimate_unaligned_duration_profile(sr, fai, hdr,
record, read_idx, strand_idx,
conf_invs, n_boots);

//----- estimate number of nucleotides in poly-A tail & post-estimation QC:
double polya_length = estimate_polya_length(sr, region_indices, read_rate);
std::string post_estimation_qc_flag = post_estimation_qc(region_indices, sr, read_rate, polya_length);
PolyALengthRange polya_length_range = estimate_polya_length(sr, region_indices, read_rate_range);
std::string post_estimation_qc_flag = post_estimation_qc(region_indices, sr, read_rate_range.read_rate, polya_length_range.polya_length);

//----- Resolve QC flag based on priority:
std::string qc_tag;
@@ -818,12 +901,23 @@ double polya_sample_start = region_indices.adapter+1;
double polya_sample_start = region_indices.adapter+1;
double polya_sample_end = region_indices.polya;
double transcr_sample_start = region_indices.polya+1;

#pragma omp critical
{
fprintf(out_fp, "%s\t%s\t%zu\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.2lf\t%.2lf\t%s\n",
read_name.c_str(), ref_name.c_str(), record->core.pos,
leader_sample_start, adapter_sample_start, polya_sample_start,
transcr_sample_start, read_rate, polya_length, qc_tag.c_str());
if (conf_invs == 0) {
fprintf(out_fp, "%s\t%s\t%zu\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.2lf\t%.2lf\t%s\n",
read_name.c_str(), ref_name.c_str(), record->core.pos,
leader_sample_start, adapter_sample_start, polya_sample_start,
transcr_sample_start, read_rate_range.read_rate, polya_length_range.polya_length, qc_tag.c_str());
} else {
fprintf(out_fp, "%s\t%s\t%zu\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%s\n",
read_name.c_str(), ref_name.c_str(), record->core.pos,
leader_sample_start, adapter_sample_start, polya_sample_start, transcr_sample_start,
read_rate_range.read_rate, read_rate_range.read_rate_lower, read_rate_range.read_rate_upper,
polya_length_range.polya_length,
polya_length_range.polya_length_lower, polya_length_range.polya_length_upper,
qc_tag.c_str());
}
// if `verbose == 1`, print the samples (picoAmps) of the read,
// up to the first 1000 samples of transcript region:
if (opt::verbose == 1) {
@@ -874,12 +968,16 @@ int polya_main(int argc, char** argv)
faidx_t *fai = fai_load(opt::genome_file.c_str());

// print header line:
fprintf(stdout, "readname\tcontig\tposition\tleader_start\tadapter_start\tpolya_start\ttranscript_start\tread_rate\tpolya_length\tqc_tag\n");
if (opt::conf_invs == 0) {
fprintf(stdout, "readname\tcontig\tposition\tleader_start\tadapter_start\tpolya_start\ttranscript_start\tread_rate\tpolya_length\tqc_tag\n");
} else {
fprintf(stdout, "readname\tcontig\tposition\tleader_start\tadapter_start\tpolya_start\ttranscript_start\tread_rate\tread_rate_ci_lower\tread_rate_ci_upper\tpolya_length\tpolya_length_ci_lower\tpolya_length_ci_upper\tqc_tag\n");
}

// the BamProcessor framework calls the input function with the
// bam record, read index, etc passed as parameters
// bind the other parameters the worker function needs here
auto f = std::bind(estimate_polya_for_single_read, std::ref(read_db), std::ref(fai), stdout, _1, _2, _3, _4, _5);
auto f = std::bind(estimate_polya_for_single_read, std::ref(read_db), std::ref(fai), stdout, opt::conf_invs, opt::n_boots, _1, _2, _3, _4, _5);
BamProcessor processor(opt::bam_file, opt::region, opt::num_threads);
processor.parallel_run(f);

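
With the patch applied, a run with confidence intervals enabled would look something like the following (file names are placeholders):

nanopolish polya --ci --nboots 200 --reads reads.fa --bam aligned.sorted.bam --genome transcripts.fa --threads 8 > polya_with_ci.tsv

When --ci is given, the TSV written to stdout carries four extra columns (read_rate_ci_lower, read_rate_ci_upper, polya_length_ci_lower, polya_length_ci_upper) alongside read_rate and polya_length, matching the header printed in polya_main; without --ci the original ten-column output is unchanged.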