Skip to content

Commit

Permalink
Adjust min/max qual for SNP caller.
Browse files Browse the repository at this point in the history
There is now a maximum quality value, defaulting to 60 so above
Illumina values.  This is useful or PacBio CCS which produces
abnormally high qualities leading to over-certainty in calls,
especially for incorrect genotype assignment.

SNP base quality is now the minimum of this base qual plus the base
either side.  CCS data commonly has a neighbouring low qual value
adjacent to in incorrect but high quality substitution.  However it
turns out this is also beneficial to the HG002 Illumina data.

An example of the effect for Illumina 60x data (with adjustment on min
QUAL to get the same FN rate as the scores are slightly suppressed
now):

Before:
SNP          Q>0 /   Q>=20 / Filtered
SNP   TP  262541 /  262287 /  262283
SNP   FP    2313 /    1442 /    1405
SNP   GT     287 /     269 /     269
SNP   FN    1769 /    2023 /    2027

After:
SNP          Q>0 /   Q>=15 / Filtered
SNP   TP  262503 /  262298 /  262294
SNP   FP    1787 /    1349 /    1312   -6.6%
SNP   GT     283 /     268 /     268   ~=
SNP   FN    1807 /    2012 /    2016   -0.5%

For 31x PacBio CCS, the quality assignment suppression is more
pronounced due to the excessively high QUALs in the bam records, but
the FN/FP tradeoff is the same (again tuning to try and compare with
~= FN):

Before:
SNP          Q>0 /   Q>=47 / Filtered
SNP   TP  263847 /  263693 /  263693
SNP   FP    4908 /    3089 /    3089
SNP   GT     804 /     793 /     793
SNP   FN     463 /     617 /     617

After:
SNP          Q>0 /   Q>=15 / Filtered
SNP   TP  263779 /  263692 /  263692
SNP   FP    3493 /    2973 /    2973   -10%
SNP   GT     509 /     503 /     503   -37%
SNP   FN     531 /     618 /     618   ~=

Finally, added -X,--config STR option to specify sets of parameters
tuned at specific platforms. Earlier experiments showed the indel
caller on PacBio CCS data needs very different gap-open and gap-extend
parameters.  It also benefits a bit from raising minimum qual to 5 and
because this is a new option we also enable partial realignment by
default.
  • Loading branch information
jkbonfield committed Mar 24, 2021
1 parent 98bbd48 commit f4e8f72
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 8 deletions.
15 changes: 13 additions & 2 deletions bam2bcf.c
Expand Up @@ -40,14 +40,15 @@ extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);

#define CAP_DIST 25

bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ)
{
bcf_callaux_t *bca;
if (theta <= 0.) theta = CALL_DEFTHETA;
bca = (bcf_callaux_t*) calloc(1, sizeof(bcf_callaux_t));
bca->capQ = 60;
bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
bca->min_baseQ = min_baseQ;
bca->max_baseQ = max_baseQ;
bca->e = errmod_init(1. - theta);
bca->min_frac = 0.002;
bca->min_support = 1;
Expand Down Expand Up @@ -225,8 +226,18 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
{
b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
baseQ = q = (int)bam_get_qual(p->b)[p->qpos];

// Lowest of this and neighbour quality values
uint8_t *qual = bam_get_qual(p->b);
q = qual[p->qpos];
if (p->qpos > 0 && q > qual[p->qpos-1])
q = qual[p->qpos-1];
if (p->qpos+1 < p->b->core.l_qseq && q > qual[p->qpos+1])
q = qual[p->qpos+1];

if (q < bca->min_baseQ) continue;
if (q > bca->max_baseQ) q = bca->max_baseQ;
baseQ = q;
seqQ = 99;
is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
}
Expand Down
4 changes: 2 additions & 2 deletions bam2bcf.h
Expand Up @@ -84,7 +84,7 @@ DEALINGS IN THE SOFTWARE. */

typedef struct __bcf_callaux_t {
int fmt_flag;
int capQ, min_baseQ;
int capQ, min_baseQ, max_baseQ;
int openQ, extQ, tandemQ; // for indels
uint32_t min_support, max_support; // for collecting indel candidates
double min_frac; // for collecting indel candidates
Expand Down Expand Up @@ -153,7 +153,7 @@ typedef struct {
extern "C" {
#endif

bcf_callaux_t *bcf_call_init(double theta, int min_baseQ);
bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ);
void bcf_call_destroy(bcf_callaux_t *bca);
int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r);
int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call);
Expand Down
44 changes: 40 additions & 4 deletions mpileup.c
Expand Up @@ -66,8 +66,8 @@ typedef struct _mplp_pileup_t mplp_pileup_t;

// Data shared by all bam files
typedef struct {
int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth,
max_read_len, fmt_flag;
int min_mq, flag, min_baseQ, max_baseQ, capQ_thres, max_depth,
max_indel_depth, max_read_len, fmt_flag;
int rflag_require, rflag_filter, output_type;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
Expand Down Expand Up @@ -853,7 +853,7 @@ static int mpileup(mplp_conf_t *conf)
bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");

conf->bca = bcf_call_init(-1., conf->min_baseQ);
conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ);
conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
conf->bca->indel_bias = conf->indel_bias;
Expand Down Expand Up @@ -1132,6 +1132,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
fprintf(fp,
" -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
fprintf(fp,
" --max-BQ INT limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ);
fprintf(fp,
" -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
" -R, --regions-file FILE restrict to regions listed in a file\n"
" --ignore-RG ignore RG tags (one BAM = one sample)\n"
Expand All @@ -1157,6 +1159,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" --threads INT use multithreading with INT worker threads [0]\n"
"\n"
"SNP/INDEL genotype likelihoods options:\n"
" -X, --config STR Specify platform specific configuration profiles\n"
" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
fprintf(fp,
" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
Expand All @@ -1179,6 +1182,10 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
"\n", mplp->indel_bias);
fprintf(fp,
"Notes: Assuming diploid individuals.\n"
"--config STR values are equivalent to the following option combinations:\n"
" 1.12: -Q13 -h100 -m1\n"
" illumina: -D\n"
" pacbio-ccs: -F0.1 -o25 -e1 -Q5 --max-BQ 50 -D -M99999\n"
"\n"
"Example:\n"
" # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
Expand All @@ -1198,6 +1205,7 @@ int main_mpileup(int argc, char *argv[])
mplp_conf_t mplp;
memset(&mplp, 0, sizeof(mplp_conf_t));
mplp.min_baseQ = 1;
mplp.max_baseQ = 60;
mplp.capQ_thres = 0;
mplp.max_depth = 250; mplp.max_indel_depth = 250;
mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500;
Expand Down Expand Up @@ -1251,6 +1259,8 @@ int main_mpileup(int argc, char *argv[])
{"min-mq", required_argument, NULL, 'q'},
{"min-BQ", required_argument, NULL, 'Q'},
{"min-bq", required_argument, NULL, 'Q'},
{"max-bq", required_argument, NULL, 11},
{"max-BQ", required_argument, NULL, 11},
{"ignore-overlaps", no_argument, NULL, 'x'},
{"output-type", required_argument, NULL, 'O'},
{"samples", required_argument, NULL, 's'},
Expand All @@ -1267,9 +1277,10 @@ int main_mpileup(int argc, char *argv[])
{"per-sample-mf", no_argument, NULL, 'p'},
{"platforms", required_argument, NULL, 'P'},
{"max-read-len", required_argument, NULL, 'M'},
{"config", required_argument, NULL, 'X'},
{NULL, 0, NULL, 0}
};
while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:",lopts,NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
Expand Down Expand Up @@ -1339,6 +1350,7 @@ int main_mpileup(int argc, char *argv[])
case 'C': mplp.capQ_thres = atoi(optarg); break;
case 'q': mplp.min_mq = atoi(optarg); break;
case 'Q': mplp.min_baseQ = atoi(optarg); break;
case 11: mplp.max_baseQ = atoi(optarg); break;
case 'b': file_list = optarg; break;
case 'o': {
char *end;
Expand Down Expand Up @@ -1369,6 +1381,30 @@ int main_mpileup(int argc, char *argv[])
mplp.fmt_flag |= parse_format_flag(optarg);
break;
case 'M': mplp.max_read_len = atoi(optarg); break;
case 'X':
if (strcasecmp(optarg, "pacbio-ccs") == 0) {
mplp.min_frac = 0.1;
mplp.min_baseQ = 5;
mplp.max_baseQ = 50;
mplp.openQ = 25;
mplp.extQ = 1;
mplp.flag |= MPLP_REALN_PARTIAL;
mplp.max_read_len = 99999;
} else if (strcasecmp(optarg, "1.12") == 0) {
// 1.12 and earlier
mplp.min_frac = 0.05;
mplp.min_support = 1;
mplp.min_baseQ = 13;
mplp.tandemQ = 100;
} else if (strcasecmp(optarg, "illumina") == 0) {
mplp.flag |= MPLP_REALN_PARTIAL;
} else {
fprintf(stderr, "Unknown configuration name '%s'\n"
"Please choose from 1.12, illumina or pacbio-ccs\n",
optarg);
return 1;
}
break;
default:
fprintf(stderr,"Invalid option: '%c'\n", c);
return 1;
Expand Down

0 comments on commit f4e8f72

Please sign in to comment.