More testing of seg filter

commit 7accd36efff39a09aa2fff5be320a49f638a2df6
1 parent d940471
@jts authored
Showing with 451 additions and 54 deletions.
  1. +451 −54 src/SGA/haplotype-filter.cpp
@@ -23,6 +23,20 @@
#include "HashMap.h"
// Functions
+// Add the log-scaled values l1 and l2 using a transform to avoid
+// precision errors
+inline double addLogs(const double l1, const double l2)
+{
+ if (l1>l2) {
+ double diff=l2-l1;
+ return l1+log(1.0+exp(diff));
+ } else {
+ double diff=l1-l2;
+ return l2+log(1.0+exp(diff));
+ }
+}
+
+
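For reference, addLogs is the standard log-sum-exp identity log(e^a + e^b) = max(a,b) + log(1 + e^-|a-b|). One caveat when accumulating a series of terms with it: the running total must start at log(0) = -infinity, not at 0.0 (the log_normalization loops later in this diff start at 0.0f, which quietly includes an extra exp(0) = 1 in the sum). A minimal accumulation sketch, assuming only the addLogs above:

    #include <cmath>
    #include <limits>
    #include <vector>

    // Sum a vector of log-scale values via repeated addLogs calls.
    // Seeding with -infinity makes the first call return the first term.
    double logSum(const std::vector<double>& log_terms)
    {
        double acc = -std::numeric_limits<double>::infinity();
        for (size_t i = 0; i < log_terms.size(); ++i)
            acc = addLogs(acc, log_terms[i]);
        return acc;
    }
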
bool applyFilter(const std::string& kmer, const BWTIndexSet& indices);
// Get the number of times the kmer appears in each sample
@@ -33,8 +47,12 @@ std::vector<double> getSampleMeanKmerDepth(size_t k, const BWTIndexSet& indices)
//
double LMHaploid(double d, const std::vector<size_t>& sample_count);
-double LMHaploidNaive(double d, const std::vector<size_t>& sample_count);
-double LMDiploidNew(double d, const std::vector<size_t>& sample_count);
+double LMHaploidNonUniform(const std::vector<double>& depths, const std::vector<size_t>& sample_count);
+double LMDiploid(double d, const std::vector<size_t>& sample_count);
+double LMDiploidNonUniform(const std::vector<double>& depths, const std::vector<size_t>& sample_count);
+
+double _haploidNonUniform(const std::vector<double>& depths, const std::vector<size_t>& sample_count, double M);
+double _diploidNonUniform(const std::vector<double>& depths, const std::vector<size_t>& sample_count, double M);
//
std::vector<double> loadSampleDepthsFromFile(std::string filename);
@@ -44,7 +62,8 @@ std::vector<size_t> simulateCoverageNull(double d, size_t o, size_t N);
std::vector<size_t> simulateCoverageHaploid(double d, size_t o, size_t N);
std::vector<size_t> simulateCoverageDiploid(double d, size_t o, size_t N);
std::vector<size_t> simulateCoverageNullNonUniform(std::vector<double> sample_coverage, size_t o);
-std::vector<size_t> simulateCoverageDiploidNonUniform(std::vector<double> sample_coverage, size_t o);
+std::vector<size_t> simulateCoverageHaploidNonUniform(std::vector<double> sample_coverage, size_t o, double& out_num_carriers);
+std::vector<size_t> simulateCoverageDiploidNonUniform(std::vector<double> sample_coverage, size_t o, double& out_num_carriers);
std::vector<double> simulateSampleKmerDepths(size_t mean_depth, size_t N);
//
@@ -120,7 +139,8 @@ int haplotypeFilterMain(int argc, char** argv)
indices.pQualityTable = new QualityTable();
std::cout << "done\n";
- getSampleMeanKmerDepth(opt::k, indices);
+ std::vector<double> depths = loadSampleDepthsFromFile("depths.txt");
+ //getSampleMeanKmerDepth(opt::k, indices);
std::ofstream outFile(opt::outFile.c_str());
std::ifstream inFile(opt::vcfFile.c_str());
@@ -151,8 +171,9 @@ int haplotypeFilterMain(int argc, char** argv)
for(size_t i = 0; i < sample_coverage.size(); ++i)
total_coverage += sample_coverage[i];
- double d = 4;
- double val = LMDiploidNew(d, sample_coverage);
+ //double d = 4;
+ //double val = LMDiploid(d, sample_coverage);
+ double val = LMDiploidNonUniform(depths, sample_coverage);
std::stringstream lmss;
lmss << fields[7];
lmss << ";LM=" << val << ";";
@@ -203,12 +224,12 @@ int haplotypeFilterMain(int argc, char** argv)
//
void runSimulation()
{
- size_t trials = 2000;
+ size_t trials = 500;
size_t N = 1000;
double d = 3;
size_t min_o = 5;
- size_t max_o = round(N * d);
+ size_t max_o = d * N;
(void)min_o;
(void)max_o;
srand(NULL);
@@ -221,7 +242,8 @@ void runSimulation()
size_t n_fn = 0;
//std::vector<double> depths = simulateSampleKmerDepths(d, N);
- std::vector<double> depths = loadSampleDepthsFromFile("depths.txt");
+ //std::vector<double> depths = loadSampleDepthsFromFile("depths.txt");
+ std::vector<double> depths(N, 3);
if(opt::verbose > 0)
{
@@ -236,20 +258,26 @@ void runSimulation()
size_t o = (rand() % (max_o - min_o)) + min_o;
std::vector<size_t> counts;
bool is_fp = p < fpr;
+ double dbg_num_carriers = 0.0f;
if(is_fp)
counts = simulateCoverageNullNonUniform(depths, o);
else
- counts = simulateCoverageDiploidNonUniform(depths, o);
+ counts = simulateCoverageDiploidNonUniform(depths, o, dbg_num_carriers);
+ //counts = simulateCoverageHaploidNonUniform(depths, o, dbg_num_carriers);
if(opt::verbose > 0)
{
+ std::cout << "Read counts:\n";
std::copy(counts.begin(), counts.end(), std::ostream_iterator<size_t>(std::cout, " "));
std::cout << "\n";
}
- double s_h = LMHaploidNaive(d, counts);
- double s_d = LMDiploidNew(d, counts);
-
+// double s_h = LMHaploid(d, counts);
+ double s_h = 0.0f;
+ double s_d_nu = LMDiploidNonUniform(depths, counts);
+ double s_h_nu = LMHaploidNonUniform(depths, counts);
+ double s_d = LMDiploid(d, counts);
double s = use_diploid ? s_d : s_h;
+
if(is_fp && s < 0)
n_tn += 1;
else if(is_fp && s > 0)
@@ -258,7 +286,7 @@ void runSimulation()
n_fn += 1;
else
n_tp += 1;
- printf("%zu\t%d\t%zu\t%lf\t%lf\n", i, is_fp, o, s_d, s_h);
+ printf("%zu\t%d\t%zu\t%lf\t%lf\t%lf\t%lf\n", i, is_fp, o, s_d, s_h, s_h_nu, s_d_nu);
}
fprintf(stderr, "tp: %zu fp: %zu tn: %zu fn: %zu\n", n_tp, n_fp, n_tn, n_fn);
}
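For readers tracing the counters above: the score s acts as a sign test against the simulated truth, so the four outcomes are

    simulated under the null,  s < 0  ->  true negative
    simulated under the null,  s > 0  ->  false positive
    simulated variant,         s < 0  ->  false negative
    simulated variant,         s > 0  ->  true positive

(a score of exactly 0 falls through to the true-positive branch regardless of the simulated truth).
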
@@ -279,15 +307,16 @@ bool applyFilter(const std::string& kmer, const BWTIndexSet& indices)
double LMHaploid(double d, const std::vector<size_t>& sample_count)
{
size_t N = sample_count.size();
- std::map<size_t, size_t> samples_with_count;
double o = 0;
+ size_t n0 = 0;
for(size_t i = 0; i < N; ++i)
{
+ if(sample_count[i] == 0)
+ n0 += 1;
+
o += sample_count[i];
- samples_with_count[sample_count[i]] += 1;
}
- double n_0 = samples_with_count[0]; // number of samples with no depth
//double M = std::max(round(o/d), N - n_0);
double M = round(o/d);
// Clamp M at the number of samples
@@ -295,83 +324,222 @@ double LMHaploid(double d, const std::vector<size_t>& sample_count)
double p = o / N;
double q = o / M;
- double r = M / N;
- double inv_r = N / M;
-
- double Q_M = 1 - (1 - exp(-q)) * r;
- double log_Q_M = log(Q_M);
- double t1 = n_0 * (p + log_Q_M);
-
- double t2 = 0.0;
- for(size_t k = 1; k < N; ++k)
+ double sum = 0;
+ for(size_t i = 0; i < sample_count.size(); ++i)
{
- double nk = samples_with_count[k];
- double tmp = nk * ((k - 1) * log(inv_r) - (q - p));
- t2 += tmp;
+ size_t oi = sample_count[i];
+ if(oi == 0)
+ {
+ //sum += p - q;
+ double h1_zero = (M/N) * exp(-q) + ((N - M) / N);
+ double log_h0_zero = -p;
+ double log_pzero = log(h1_zero) - log_h0_zero;
+ sum += log_pzero;
+ }
+ else
+ {
+ //sum += (log(M/N) + Stats::logPoisson(oi, q) - Stats::logPoisson(oi, p));
+ sum += log(M/N) + oi * log(q) - oi * log(p) + p - q;
+ }
}
if(opt::verbose > 0)
{
printf("LMHaploid:\n");
- printf("\to: %lf M: %lf\n", o, M);
- printf("\tn0: %lf q: %lf r: %lf log(Q(M)): %lf\n", n_0, q, r, log_Q_M);
- printf("\tT1: %lf T2: %lf LM: %lf\n", t1, t2, t1 + t2);
+ printf("\to: %lf M: %lf N: %zu p: %lf q: %lf n0: %zu\n", o, M, N, p, q, n0);
}
- return t1 + t2;
+ return sum;
}
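Reading the rewritten loop: with p = o/N (expected count per sample under the null) and q = o/M (expected count per carrier), the returned score appears to be a per-sample log likelihood ratio under a model where a fraction M/N of samples carry the haploid variant:

    P(o_i = 0 | H1) = (M/N) e^-q + (N - M)/N        P(o_i = 0 | H0) = e^-p
    P(o_i = k | H1) = (M/N) Pois(k; q)              P(o_i = k | H0) = Pois(k; p),   k > 0

For k > 0 the log k! terms cancel in the ratio, leaving the closed form in the else branch: log(M/N) + k log q - k log p + p - q.
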
-double LMHaploidNaive(double d, const std::vector<size_t>& sample_count)
+double LMHaploidNonUniform(const std::vector<double>& depths, const std::vector<size_t>& sample_count)
{
size_t N = sample_count.size();
double o = 0;
size_t n0 = 0;
+ double sum_depth = 0.0;
for(size_t i = 0; i < N; ++i)
{
if(sample_count[i] == 0)
n0 += 1;
o += sample_count[i];
+ sum_depth += depths[i];
+ }
+
+ double mean_depth = sum_depth / N;
+ double M1 = N - n0;
+ double M2 = o / mean_depth;
+ double M_est = std::max(M1, M2);
+ if(opt::verbose)
+ printf("M1: %lf M2: %lf\n", M1, M2);
+
+ double best_M = -1;
+ double best_LM = -std::numeric_limits<double>::max();
+ double sum_exp = 0.0f;
+ //double lsum = 0.0f;
+
+ // Bayesian integration model
+ //double p = o / N;
+
+ double log_normalization = 0.0f;
+ for(size_t i = N - n0; i < N; ++i)
+ {
+ double log_prob_M = Stats::logPoisson(o, mean_depth*i) - log(i);
+ log_normalization = addLogs(log_normalization, log_prob_M);
+ }
+
+#if 0
+ for(double M = N - n0; M < (double)N; M += 10)
+ {
+ double q = o / M;
+
+ /*
+ double QM = (M/N) * exp(-q) + ((N - M) / N);
+ double t1 = pow(QM / exp(-p), n0);
+ double t2 = pow( (M * exp(-q)) / (N * exp(-p)), N - n0);
+ double t3 = pow(N / M, o);
+ double s1 = t1 * t2 * t3 / M;
+ */
+ double QM = (M/N) * exp(-q) + ((N - M) / N);
+ double t1 = n0 * (log(QM) + p);
+ //double t2 = (N - n0) * ( (log(M) - q) - ( log(N) - p));
+ double t2 = (N - n0) * ( log(M * exp(-q)) - log(N * exp(-p)) );
+ double t3 = o * (log(N) - log(M));
+ //double log_prob_M = log(M);
+
+ double log_prob_M = Stats::logPoisson(o, mean_depth*M) - log_normalization - log(M);
+ double s1 = t1 + t2 + t3 + log_prob_M;
+
+ if(s1 > best_LM)
+ best_LM = s1;
+ sum_exp += exp(s1);
+ lsum = addLogs(lsum, s1);
+ //printf("exp(s1) %lf\n", exp(s1));
+ if(opt::verbose)
+ {
+ printf("M: %lf M2: %lf log_prob_m: %lf p(M) %lf\n", M, M2, log_prob_M, exp(log_prob_M));
+ printf("M %zu\nQM %lf\nt1 %lf\nt2 %lf\nt3 %lf\ns1 %lf\n", (size_t)M, QM, t1, t2, t3, s1);
+ printf("M %zu\ts1 %lf sum %lf\n", (size_t)M, s1, lsum);
+ }
+ }
+#endif
+
+ return _haploidNonUniform(depths, sample_count, M1);
+
+ for(size_t M = N - n0; M < N; M += 10)
+ {
+ double log_prob_M = Stats::logPoisson(o, mean_depth*M) - log_normalization - log(M);
+ double LM = _haploidNonUniform(depths, sample_count, M_est);
+ LM += log_prob_M;
+
+ //printf("M: %zu LM: %lf\n", i, LM);
+
+ if(LM > best_LM)
+ {
+ best_LM = LM;
+ best_M = M;
+ }
+ }
+
+
+ double sum = log(sum_exp);
+ if(isinf(sum))
+ sum = 1000;
+
+// if(opt::verbose)
+// printf("Optimal %lf Sum: %lf is_inf %d\n", best_LM, sum, isinf(sum));
+ return best_LM;
+}
+
+//
+double _haploidNonUniform(const std::vector<double>& depths, const std::vector<size_t>& sample_count, double M)
+{
+ size_t N = sample_count.size();
+ assert(depths.size() == N);
+
+ double o = 0;
+ size_t n0 = 0;
+
+ std::vector<double> mean_p(N);
+ std::vector<double> mean_q(N);
+ double sum_p = 0.0f;
+ for(size_t i = 0; i < N; ++i)
+ {
+ if(sample_count[i] == 0)
+ n0 += 1;
+
+ o += sample_count[i];
+ mean_p[i] = depths[i];
+ mean_q[i] = depths[i];
+ sum_p += mean_p[i];
+ }
+
+ // Compute the expected mean depth under the NULL and H1
+ for(size_t i = 0; i < N; ++i)
+ {
+ mean_p[i] = (mean_p[i] / sum_p) * o;
+ mean_q[i] = (mean_q[i] / sum_p) * o * N / M;
+ }
+
+ if(opt::verbose > 0)
+ {
+ printf("HAP_NU Parameters -- mean_p %lf\n", mean_p[0]);
+ printf("HAP_NU Parameters -- mean_q %lf\n", mean_q[0]);
+ printf("HAP_NU Parameters -- M %lf\n", M);
}
//double M = std::max(round(o/d), N - n_0);
- double M = round(o/d);
+ //M = round(o / mean_depth);
+
// Clamp M at the number of samples
- M = std::min((double)N, M);
+ //M = std::min((double)N, M);
- double p = o / N;
- double q = o / M;
-
double sum = 0;
+ double sum_alt = 0.0f;
+ double sum_null = 0.0f;
+
for(size_t i = 0; i < sample_count.size(); ++i)
{
+ double L_NULL = 0.0f;
+ double L_ALT = 0.0f;
+
+ double p = mean_p[i];
+ double q = mean_q[i];
size_t oi = sample_count[i];
if(oi == 0)
{
- //sum += p - q;
double h1_zero = (M/N) * exp(-q) + ((N - M) / N);
double log_h0_zero = -p;
double log_pzero = log(h1_zero) - log_h0_zero;
sum += log_pzero;
+
+ L_NULL = log_h0_zero;
+ L_ALT = log(h1_zero);
}
else
{
//sum += (log(M/N) + Stats::logPoisson(oi, q) - Stats::logPoisson(oi, p));
- sum += log(M/N) + oi * log(q) - oi * log(p) + p - q;
+ //sum += log(M/N) + oi * log(q) - oi * log(p) + p - q;
+ L_NULL = Stats::logPoisson(oi, p);
+ L_ALT = (log(M/N) + Stats::logPoisson(oi, q));
+ sum += L_ALT - L_NULL;
}
- }
- if(opt::verbose > 0)
- {
- printf("LMHaploidNaive:\n");
- printf("\to: %lf M: %lf N: %zu p: %lf q: %lf n0: %zu\n", o, M, N, p, q, n0);
- }
+ sum_alt += L_ALT;
+ sum_null += L_NULL;
+ if(opt::verbose > 1)
+ printf("HAP_NU o: %zu LA: %lf LN: %lf S: %lf\n", oi, L_ALT, L_NULL, sum);
+ }
+
+ if(opt::verbose > 0)
+ printf("FINAL_HAP_NU M: %lf O: %lf L_A: %lf L_N: %lf S: %lf\n", M, o, sum_alt, sum_null, sum);
return sum;
}
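Stats::logPoisson and Stats::logFactorial from the SGA utility code carry the likelihood computations throughout the new scoring functions. For readers without that source handy, a plausible stand-in for the former (an assumption about its behavior, not a copy of the SGA implementation):

    #include <cmath>

    // log P(K = k) for K ~ Poisson(mean); lgamma(k + 1) supplies
    // log(k!) without overflowing for large k.
    double logPoissonSketch(size_t k, double mean)
    {
        return (double)k * log(mean) - mean - lgamma((double)k + 1.0);
    }
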
-double LMDiploidNew(double d, const std::vector<size_t>& sample_count)
+double LMDiploid(double d, const std::vector<size_t>& sample_count)
{
size_t N = sample_count.size();
double o = 0;
@@ -416,13 +584,163 @@ double LMDiploidNew(double d, const std::vector<size_t>& sample_count)
if(opt::verbose > 0)
{
- printf("LMDiploidNew:\n");
+ printf("LMDiploid:\n");
printf("\to: %lf M: %lf N: %zu p: %lf q: %lf n0: %zu f: %lf\n", o, M, N, p, q, n0, f);
}
return sum;
}
+//
+double LMDiploidNonUniform(const std::vector<double>& depths, const std::vector<size_t>& sample_count)
+{
+ size_t N = sample_count.size();
+ double o = 0;
+ size_t n0 = 0;
+ double sum_depth = 0.0;
+ for(size_t i = 0; i < N; ++i)
+ {
+ if(sample_count[i] == 0)
+ n0 += 1;
+
+ o += sample_count[i];
+ sum_depth += depths[i];
+ }
+
+ double mean_depth = sum_depth / (2 * N);
+ double M1 = N - n0;
+ double M2 = o / mean_depth;
+ double M_est = std::max(M1, M2);
+ if(opt::verbose)
+ printf("n0: %zu M1: %lf M2: %lf\n", n0, M1, M2);
+
+ double best_M = -1;
+ double best_LM = -std::numeric_limits<double>::max();
+ return _diploidNonUniform(depths, sample_count, M_est);
+
+ double log_normalization = 0.0f;
+ size_t min_alleles = 0; //(size_t)std::max((int)M2 - 50, 0);
+ size_t max_alleles = 2*N; //std::min((size_t)M2 + 50, 2 * N);
+ size_t stride = 2;
+
+ for(size_t M = min_alleles; M < max_alleles; M += stride)
+ {
+ double log_prob_M = Stats::logPoisson(o, mean_depth*M) - log(M);
+ log_normalization = addLogs(log_normalization, log_prob_M);
+ }
+
+ //
+ for(size_t M = min_alleles; M < max_alleles; M += stride)
+ {
+ double log_prob_M = Stats::logPoisson(o, mean_depth*M) - log_normalization - log(M);
+ double LM = _diploidNonUniform(depths, sample_count, M);
+ if(opt::verbose)
+ printf("\tM: %zu log_prob_M %lf LM %lf\n", M, log_prob_M, LM);
+
+ LM += log_prob_M;
+
+ if(LM > best_LM)
+ {
+ best_LM = LM;
+ best_M = M;
+ }
+ }
+
+// if(opt::verbose)
+// printf("Optimal %lf Sum: %lf is_inf %d\n", best_LM, sum, isinf(sum));
+ return best_LM;
+}
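Both NonUniform scoring functions carry a grid search over the unknown allele count M, weighting each candidate by a Poisson term on the total read count o and normalizing with addLogs; in this commit each wrapper returns a point estimate (M1 in the haploid case, M_est here) before the loop, so the search code below the return is unreachable. The pattern being tried is roughly the following sketch (both function-pointer parameters are hypothetical stand-ins for _diploidNonUniform and the normalized log-prior):

    #include <limits>

    // Score the model at each candidate M, weight by a log-prior,
    // and keep the best-scoring candidate.
    double bestScoreOverM(size_t min_M, size_t max_M, size_t stride,
                          double (*logPrior)(size_t),
                          double (*scoreAt)(size_t))
    {
        double best = -std::numeric_limits<double>::max();
        for (size_t M = min_M; M < max_M; M += stride)
        {
            double LM = scoreAt(M) + logPrior(M);
            if (LM > best)
                best = LM;
        }
        return best;
    }
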
+
+//
+double _diploidNonUniform(const std::vector<double>& depths, const std::vector<size_t>& sample_count, double M)
+{
+ size_t N = sample_count.size();
+ assert(depths.size() == N);
+
+ double f = M / (2 * N);
+ double hom_alt_prop = pow(f, 2.0f);
+ double het_prop = 2 * f * (1.0f - f);
+ double hom_ref_prop = pow(1.0 - f, 2.0f);
+
+ double o = 0;
+ double sum_p = 0.0f;
+ std::vector<double> mean_p(N);
+ std::vector<double> mean_q(N);
+
+ for(size_t i = 0; i < N; ++i)
+ {
+ o += sample_count[i];
+ mean_p[i] = depths[i];
+ mean_q[i] = depths[i];
+ sum_p += mean_p[i];
+ }
+ double mean_depth = sum_p / (2 * N); // note: needs parentheses; "sum_p / 2 * N" parses as (sum_p / 2) * N
+ (void)mean_depth;
+ // Compute the expected mean depth under the NULL and H1
+ for(size_t i = 0; i < N; ++i)
+ {
+ //mean_p[i] = (mean_p[i] / sum_p) * o;
+ //mean_q[i] = (mean_q[i] / sum_p) * o * 2 * N / M;
+ //mean_q[i] = (o / M) * depths[i] / mean_depth;
+ mean_p[i] = o / N;
+ mean_q[i] = o / M;
+ }
+
+ if(opt::verbose)
+ {
+ printf("DIP_NU Parameters -- M: %lf\n", M);
+ printf("DIP_NU Parameters -- mean_p: %lf\n", o/N);
+ printf("DIP_NU Parameters -- mean_q: %lf\n", o/M);
+ printf("DIP_NU Parameters -- f: %lf\n", f);
+ printf("DIP_NU Parameters -- hom_alt_prop: %lf\n", hom_alt_prop);
+ printf("DIP_NU Parameters -- het_prop: %lf\n", het_prop);
+ printf("DIP_NU Parameters -- hom_ref: %lf\n", hom_ref_prop);
+ }
+
+ double sum = 0;
+ double sum_alt = 0.0f;
+ double sum_null = 0.0f;
+
+ for(size_t i = 0; i < sample_count.size(); ++i)
+ {
+ double p = mean_p[i];
+ double q = mean_q[i];
+ size_t oi = sample_count[i];
+ double L_ALT = 0.0f;
+ double L_NULL = 0.0f;
+ if(oi == 0)
+ {
+ double h1_zero = hom_ref_prop + het_prop * exp(-q) + hom_alt_prop * exp(-2*q);
+ double log_h0_zero = -p;
+ double log_pzero = log(h1_zero) - log_h0_zero;
+ sum += log_pzero;
+ L_NULL = log_h0_zero;
+ L_ALT = log(h1_zero);
+ }
+ else
+ {
+ double log_null = Stats::logPoisson(oi, p);
+ double t1 = het_prop * pow(q, oi) * exp(-q) + hom_alt_prop * pow(2 * q, oi) * exp(-2 * q);
+ double t2 = Stats::logFactorial(oi);
+ double t3 = log(t1) - t2;
+ sum += t3 - log_null;
+ L_NULL = log_null;
+ L_ALT = t3;
+ }
+
+ sum_alt += L_ALT;
+ sum_null += L_NULL;
+
+ if(opt::verbose > 1)
+ printf("DIP_NU o: %zu LA: %lf LN: %lf S: %lf\n", oi, L_ALT, L_NULL, sum);
+ }
+
+ if(opt::verbose > 0)
+ printf("FINAL_DIP_NU M: %lf O: %lf f: %lf L_A: %lf L_N: %lf S: %lf\n", M, o, f, sum_alt, sum_null, sum);
+
+ return sum;
+}
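The new diploid terms follow Hardy-Weinberg proportions at allele frequency f = M/(2N): hom-ref samples with weight (1-f)^2 contribute no reads, hets with weight 2f(1-f) have Poisson mean q, and hom-alts with weight f^2 have mean 2q. In the ratio against the Poisson(p) null this gives

    P(o_i = 0 | H1) = (1-f)^2 + 2f(1-f) e^-q + f^2 e^-2q
    P(o_i = k | H1) = [ 2f(1-f) q^k e^-q + f^2 (2q)^k e^-2q ] / k!,   k > 0

which is exactly h1_zero and the t1/t2/t3 computation above; the hom-ref genotype contributes only to the zero-count case.
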
+
std::vector<size_t> simulateCoverageNull(double /*d*/, size_t o, size_t N)
{
// Distribute o reads over the N individuals
@@ -510,6 +828,9 @@ std::vector<size_t> simulateCoverageDiploid(double d, size_t o, size_t N)
std::vector<size_t> simulateCoverageNullNonUniform(std::vector<double> sample_coverage, size_t o)
{
size_t N = sample_coverage.size();
+
+ if(opt::verbose > 0)
+ printf("SimulateNullNonUniform %zu reads\n", o);
// Make a discrete distribution using the per-sample coverage depths
std::vector<double> distribution(N);
@@ -539,11 +860,11 @@ std::vector<size_t> simulateCoverageNullNonUniform(std::vector<double> sample_co
}
counts[j]++;
}
-
return counts;
}
+
//
-std::vector<size_t> simulateCoverageDiploidNonUniform(std::vector<double> sample_coverage, size_t o)
+std::vector<size_t> simulateCoverageHaploidNonUniform(std::vector<double> sample_coverage, size_t o, double& out_M)
{
size_t N = sample_coverage.size();
double mean_coverage = 0.0f;
@@ -551,11 +872,73 @@ std::vector<size_t> simulateCoverageDiploidNonUniform(std::vector<double> sample
mean_coverage += sample_coverage[i];
mean_coverage /= N;
- // Here M is the number of non-reference alleles
+ // Here M is the number of carriers
double M = round(o / mean_coverage);
M = std::min(M, (double)N);
assert(M <= N);
+ if(opt::verbose > 0)
+ printf("SimulateNonUniform haploid %zu reads across %lf carriers\n", o, M);
+
+ // Select indices at random to become carriers of the variant
+ std::vector<size_t> initial_indices(N);
+ for(size_t i = 0; i < N; ++i)
+ initial_indices[i] = i;
+ std::random_shuffle(initial_indices.begin(), initial_indices.end());
+
+ // The first num_hom_alt entries will carry two alleles
+ std::vector<size_t> allele_indices(M);
+ for(size_t i = 0; i < M; ++i)
+ allele_indices[i] = initial_indices[i];
+
+ // Make a discrete distribution using the per-sample coverage depths
+ std::vector<double> distribution(allele_indices.size());
+ double sum = 0.f;
+ for(size_t i = 0; i < allele_indices.size(); ++i)
+ {
+ double p = sample_coverage[allele_indices[i]] / 2;
+ distribution[i] = sum + p;
+ sum = distribution[i];
+ }
+
+ // Normalize
+ for(size_t i = 0; i < allele_indices.size(); ++i)
+ distribution[i] /= sum;
+
+ // Distribute o reads over the individuals according to the read depth
+ std::vector<size_t> counts(N);
+ for(size_t i = 0; i < o; ++i)
+ {
+ double p = (double)rand() / RAND_MAX;
+ size_t j = 0;
+ if(p > distribution[0])
+ {
+ while(p > distribution[j])
+ j += 1;
+ j -= 1;
+ }
+
+ size_t idx = allele_indices[j];
+ counts[idx]++;
+ }
+ out_M = M;
+ return counts;
+}
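Read placement in these simulators draws from the cumulative array with a linear scan; note that the trailing j -= 1 returns the last bin whose cumulative value is below p, which looks off by one against the usual inverse-CDF rule (smallest j with cdf[j] >= p). For comparison, a standard binary-search draw (a sketch, not a proposed patch):

    #include <algorithm>
    #include <cstdlib>
    #include <vector>

    // Standard inverse-CDF draw from a normalized cumulative array:
    // the smallest index j with cdf[j] >= p.
    size_t drawFromCDF(const std::vector<double>& cdf)
    {
        double p = (double)rand() / RAND_MAX;
        size_t j = (size_t)(std::lower_bound(cdf.begin(), cdf.end(), p) - cdf.begin());
        return j < cdf.size() ? j : cdf.size() - 1; // guard rounding when p is ~1
    }
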
+
+
+//
+std::vector<size_t> simulateCoverageDiploidNonUniform(std::vector<double> sample_coverage, size_t o, double& out_M)
+{
+ size_t N = sample_coverage.size();
+ double mean_coverage_per_allele = 0.0f;
+ for(size_t i = 0; i < N; ++i)
+ mean_coverage_per_allele += sample_coverage[i];
+ mean_coverage_per_allele = mean_coverage_per_allele / (2 * N);
+
+ // Here M is the number of non-reference alleles
+ double M = round(o / mean_coverage_per_allele);
+ M = std::min(M, (double)2*N);
+
// Proportion of non-reference alleles in the population
double q = M / (2 * N);
size_t num_hom_alt = round(pow(q, 2) * N);
@@ -563,7 +946,7 @@ std::vector<size_t> simulateCoverageDiploidNonUniform(std::vector<double> sample
if(opt::verbose > 0)
{
- printf("SimulateNonUniform diploid %zu reads across %lf alleles\n", o, M);
+ printf("SimulateNonUniformDiploid %zu reads across %lf alleles\n", o, M);
printf(" q: %lf num_hom_alt: %zu num_het: %zu\n", q, num_hom_alt, num_het);
}
@@ -573,17 +956,31 @@ std::vector<size_t> simulateCoverageDiploidNonUniform(std::vector<double> sample
initial_indices[i] = i;
std::random_shuffle(initial_indices.begin(), initial_indices.end());
+ std::vector<size_t> allele_count(N);
+
// The first num_hom_alt entries will carry two alleles
std::vector<size_t> allele_indices(M);
for(size_t i = 0; i < num_hom_alt; ++i)
{
allele_indices[2*i] = initial_indices[i];
allele_indices[2*i + 1] = initial_indices[i];
+
+ allele_count[initial_indices[i]] = 2;
}
// The next num_het entries carry one allele
for(size_t i = 0; i < num_het; ++i)
+ {
allele_indices[2*num_hom_alt + i] = initial_indices[num_hom_alt + i];
+ allele_count[initial_indices[num_hom_alt + i]] = 1;
+ }
+
+ if(opt::verbose > 0)
+ {
+ std::cout << "Allele Counts:\n";
+ std::copy(allele_count.begin(), allele_count.end(), std::ostream_iterator<size_t>(std::cout, " "));
+ std::cout << "\n";
+ }
// Make a discrete distribution using the per-sample coverage depths
std::vector<double> distribution(allele_indices.size());
@@ -615,7 +1012,7 @@ std::vector<size_t> simulateCoverageDiploidNonUniform(std::vector<double> sample
size_t idx = allele_indices[j];
counts[idx]++;
}
-
+ out_M = num_het + num_hom_alt;
return counts;
}
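As a concrete check of the genotype bookkeeping (illustrative numbers, and assuming num_het = round(2q(1-q) * N) on the elided line, as the proportions printed above suggest): with N = 1000 samples and M = 200 alt alleles, q = 0.1, so num_hom_alt = round(0.01 * 1000) = 10 and num_het = 180. The alleles placed are 2*10 + 180 = 200 = M, while the out_M written back is num_het + num_hom_alt = 190, i.e. the number of carriers, matching the dbg_num_carriers name at the call site rather than the allele count.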