Skip to content

Commit

Permalink
Use SAM headers from index directory's <prefix>.hdr if present
Browse files Browse the repository at this point in the history
Introduce a <prefix>.hdr file alongside the other index files, useful for
setting up detailed `@SQ` headers etc to be used by default when mapping.

Output SAM headers from: -H options; .hdr file stored with the index files;
the basic hard-coded `@HD`/`@SQ` default headers. An `@HD` header is always
printed, with -H overriding one from <prefix>.hdr (if any), which overrides
the default one. A set of `@SQ` headers is always printed, with -H options
overriding <prefix>.hdr overriding the default, similarly. Other headers
specified in either -H or <prefix>.hdr are all output.

If -H includes `@SQ` headers, we consider that the user has carefully
set up all headers to be output and ignore <prefix>.hdr entirely.

Add descriptions of <prefix>.alt (brief) and <prefix>.hdr (complete)
to the manual page. This <prefix>.hdr file is implemented for mem, bwase,
and bwape. It is not implemented for bwasw, which is deprecated.
  • Loading branch information
jmarshall committed Mar 17, 2022
1 parent 2d4272b commit f53a9fe
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 21 deletions.
34 changes: 33 additions & 1 deletion bwa.1
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,33 @@ RAM and does not work for databases with total length longer than 2GB. The
second algorithm is adapted from the BWT-SW source code. It in theory works
with database with trillions of bases. When this option is not specified, the
appropriate algorithm will be chosen automatically.

.PP
.B ADDITIONAL FILES:

In addition to the files generated by the
.B index
command, several other files may be used by the mapping sub-commands if they
are placed alongside the generated index files:
.TP 4
.IR prefix .alt
Identifies reference sequences that are alternative alleles, and provides
alignment information that can be used to lift ALT hits over to corresponding
positions within the primary assembly.
.TP
.IR prefix .hdr
Provides SAM headers to be used as the mapping output's headers, instead of
the default simple `@HD' and `@SQ' headers. This can be used to add descriptive
fields to the `@SQ' headers output: species, assembly, alternative names (which
enables querying for e.g. `chr1' or `1' interchangeably), alternate loci (ALT
contigs listed in
.IR prefix .alt
should have `AH:*' or similar), etc. If the file contains an `@HD' header,
it should include `SO:unsorted GO:query'.

The output from
.B samtools dict
is a suitable starting point for creating this file.
.RE

.TP
Expand Down Expand Up @@ -293,7 +320,12 @@ attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
.BI -H \ ARG
If ARG starts with @, it is interpreted as a string and gets inserted into the
output SAM header; otherwise, ARG is interpreted as a file with all lines
starting with @ in the file inserted into the SAM header. [null]
starting with @ in the file inserted into the SAM header. If the header lines
added via
.B -H
include any `@SQ' headers, the
.IR db.prefix .hdr
file (if it exists) will be ignored. [null]
.TP
.BI -o \ FILE
Write the output SAM file to
Expand Down
107 changes: 90 additions & 17 deletions bwa.c
Original file line number Diff line number Diff line change
Expand Up @@ -404,39 +404,112 @@ int bwa_idx2mem(bwaidx_t *idx)
* SAM header routines *
***********************/

void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
// Remove the first line from lines, possibly printing it
// (lines is newline-separated but NOT newline-terminated)
static const char *remove_line(const char *lines, int print)
{
const char *nl = strchr(lines, '\n');
if (nl) {
if (print) err_printf("%.*s\n", (int)(nl - lines), lines);
return nl+1;
}
else {
if (print) err_printf("%s\n", lines);
return NULL;
}
}

// Return 1 iff there are any @SQ header lines
static int has_SQ(const char *lines)
{
if (lines == NULL) return 0;
return strncmp(lines, "@SQ\t", 4) == 0 || strstr(lines, "\n@SQ\t");
}

// Count the number of @SQ header lines
static int count_SQ(const char *lines)
{
int i, n_HD = 0, n_SQ = 0;
extern char *bwa_pg;

if (hdr_line) {
// check for HD line
const char *p = hdr_line;
if ((p = strstr(p, "@HD")) != 0) {
++n_HD;
}
int n_SQ = 0;
if (lines) {
const char *p = lines;
// check for SQ lines
p = hdr_line;
while ((p = strstr(p, "@SQ\t")) != 0) {
if (p == hdr_line || *(p-1) == '\n') ++n_SQ;
if (p == lines || *(p-1) == '\n') ++n_SQ;
p += 4;
}
}
if (n_SQ == 0) {
return n_SQ;
}

static void print_sam_hdr(const bntseq_t *bns, const char *bns_hdr, const char *hdr_line)
{
int i, n_HD = 0, n_SQ = count_SQ(hdr_line);

// Print the user's @HD line (if present), or the index's, or a default
// (As a byproduct, remove any @HD lines from both user and index headers)
if (hdr_line && strncmp(hdr_line, "@HD\t", 4) == 0) {
hdr_line = remove_line(hdr_line, n_HD == 0);
n_HD++;
}
if (bns_hdr && strncmp(bns_hdr, "@HD\t", 4) == 0) {
bns_hdr = remove_line(bns_hdr, n_HD == 0);
n_HD++;
}
if (n_HD == 0) err_printf("@HD\tVN:1.5\tSO:unsorted\tGO:query\n");

// If neither the user's nor the index's headers provide @SQ lines, generate them
if (n_SQ == 0 && !has_SQ(bns_hdr)) {
for (i = 0; i < bns->n_seqs; ++i) {
err_printf("@SQ\tSN:%s\tLN:%d", bns->anns[i].name, bns->anns[i].len);
if (bns->anns[i].is_alt) err_printf("\tAH:*\n");
else err_fputc('\n', stdout);
}
} else if (n_SQ != bns->n_seqs && bwa_verbose >= 2)
fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs);
if (n_HD == 0) {
err_printf("@HD\tVN:1.5\tSO:unsorted\tGO:query\n");
}

if (n_SQ != 0 && n_SQ != bns->n_seqs && bwa_verbose >= 2)
fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs);

if (bns_hdr) err_printf("%s\n", bns_hdr);
if (hdr_line) err_printf("%s\n", hdr_line);
if (bwa_pg) err_printf("%s\n", bwa_pg);
}

void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
{
print_sam_hdr(bns, NULL, hdr_line);
}

void bwa_print_sam_hdr2(const bntseq_t *bns, const char *prefix, const char *hdr_line)
{
kstring_t str = { 0, 0, NULL };
char *bns_hdr = NULL;

// Ignore index .hdr file entirely if the user's headers provide @SQ lines
if (!has_SQ(hdr_line)) {
FILE *fp;
// Otherwise read the .hdr file if present
ksprintf(&str, "%s.hdr", prefix);
if ((fp = fopen(str.s, "r")) != NULL) {
int c, n_SQ;
str.l = 0;
while ((c = getc(fp)) != EOF)
if (c != '\r') kputc(c, &str);
fclose(fp);

while (str.l > 0 && str.s[str.l-1] == '\n') str.l--;
str.s[str.l] = '\0';
if (str.l > 0) bns_hdr = str.s;

n_SQ = count_SQ(str.s);
if (n_SQ != 0 && n_SQ != bns->n_seqs && bwa_verbose >= 2)
fprintf(stderr, "[W::%s] %d @SQ lines provided in \"%s.hdr\"; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, prefix, bns->n_seqs);
}
}

print_sam_hdr(bns, bns_hdr, hdr_line);
free(str.s);
}

static char *bwa_escape(char *s)
{
char *p, *q;
Expand Down
1 change: 1 addition & 0 deletions bwa.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ extern "C" {
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);

void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line);
void bwa_print_sam_hdr2(const bntseq_t *bns, const char *prefix, const char *hdr_line);
char *bwa_set_rg(const char *s);
char *bwa_insert_header(const char *s, char *hdr);

Expand Down
2 changes: 1 addition & 1 deletion bwape.c
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
}

// core loop
bwa_print_sam_hdr(bns, rg_line);
bwa_print_sam_hdr2(bns, prefix, rg_line);
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
int cnt_chg;
isize_info_t ii;
Expand Down
2 changes: 1 addition & 1 deletion bwase.c
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
exit(1);
}
err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa);
bwa_print_sam_hdr(bns, rg_line);
bwa_print_sam_hdr2(bns, prefix, rg_line);
// set ks
ks = bwa_open_reads(opt.mode, fn_fa);
// core loop
Expand Down
2 changes: 1 addition & 1 deletion fastmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ int main_mem(int argc, char *argv[])
opt->flag |= MEM_F_PE;
}
}
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
bwa_print_sam_hdr2(aux.idx->bns, argv[optind], hdr_line);
aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
free(hdr_line);
Expand Down

0 comments on commit f53a9fe

Please sign in to comment.