Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use SAM headers from index directory's <prefix>.hdr if it exists #348

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
36 changes: 35 additions & 1 deletion bwa.1
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,33 @@ RAM and does not work for databases with total length longer than 2GB. The
second algorithm is adapted from the BWT-SW source code. It in theory works
with database with trillions of bases. When this option is not specified, the
appropriate algorithm will be chosen automatically.

.PP
.B ADDITIONAL FILES:

In addition to the files generated by the
.B index
command, several other files may be used by the mapping sub-commands if they
are placed alongside the generated index files:
.TP 4
.IR prefix .alt
Identifies reference sequences that are alternative alleles, and provides
alignment information that can be used to lift ALT hits over to corresponding
positions within the primary assembly.
.TP
.IR prefix ".hdr or " baseprefix .dict
Provides SAM headers to be used as the mapping output's headers, instead of
the default simple `@HD' and `@SQ' headers. This can be used to add descriptive
fields to the `@SQ' headers output: species, assembly, alternative names (which
enables querying for e.g. `chr1' or `1' interchangeably), alternate loci (ALT
contigs listed in
.IR prefix .alt
should have `AH:*' or similar), etc. If the file contains an `@HD' header,
it should include `SO:unsorted GO:query'.

The output from
.B samtools dict
is a suitable starting point for creating this file.
.RE

.TP
Expand Down Expand Up @@ -293,7 +320,14 @@ attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
.BI -H \ ARG
If ARG starts with @, it is interpreted as a string and gets inserted into the
output SAM header; otherwise, ARG is interpreted as a file with all lines
starting with @ in the file inserted into the SAM header. [null]
starting with @ in the file inserted into the SAM header. If the header lines
added via
.B -H
include any `@SQ' headers, the
.IR db.prefix .hdr
and
.IR db.baseprefix .dict
files (if either exist) will be ignored. [null]
.TP
.BI -o \ FILE
Write the output SAM file to
Expand Down
117 changes: 100 additions & 17 deletions bwa.c
Original file line number Diff line number Diff line change
Expand Up @@ -404,39 +404,122 @@ int bwa_idx2mem(bwaidx_t *idx)
* SAM header routines *
***********************/

void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
// Remove the first line from lines, possibly printing it
// (lines is newline-separated but NOT newline-terminated)
static const char *remove_line(const char *lines, int print)
{
const char *nl = strchr(lines, '\n');
if (nl) {
if (print) err_printf("%.*s\n", (int)(nl - lines), lines);
return nl+1;
}
else {
if (print) err_printf("%s\n", lines);
return NULL;
}
}

// Return 1 iff there are any @SQ header lines
static int has_SQ(const char *lines)
{
if (lines == NULL) return 0;
return strncmp(lines, "@SQ\t", 4) == 0 || strstr(lines, "\n@SQ\t");
}

// Count the number of @SQ header lines
static int count_SQ(const char *lines)
{
int i, n_HD = 0, n_SQ = 0;
extern char *bwa_pg;

if (hdr_line) {
// check for HD line
const char *p = hdr_line;
if ((p = strstr(p, "@HD")) != 0) {
++n_HD;
}
int n_SQ = 0;
if (lines) {
const char *p = lines;
// check for SQ lines
p = hdr_line;
while ((p = strstr(p, "@SQ\t")) != 0) {
if (p == hdr_line || *(p-1) == '\n') ++n_SQ;
if (p == lines || *(p-1) == '\n') ++n_SQ;
p += 4;
}
}
if (n_SQ == 0) {
return n_SQ;
}

static void print_sam_hdr(const bntseq_t *bns, const char *bns_hdr, const char *hdr_line)
{
int i, n_HD = 0, n_SQ = count_SQ(hdr_line);

// Print the user's @HD line (if present), or the index's, or a default
// (As a byproduct, remove any @HD lines from both user and index headers)
if (hdr_line && strncmp(hdr_line, "@HD\t", 4) == 0) {
hdr_line = remove_line(hdr_line, n_HD == 0);
n_HD++;
}
if (bns_hdr && strncmp(bns_hdr, "@HD\t", 4) == 0) {
bns_hdr = remove_line(bns_hdr, n_HD == 0);
n_HD++;
}
if (n_HD == 0) err_printf("@HD\tVN:1.5\tSO:unsorted\tGO:query\n");

// If neither the user's nor the index's headers provide @SQ lines, generate them
if (n_SQ == 0 && !has_SQ(bns_hdr)) {
for (i = 0; i < bns->n_seqs; ++i) {
err_printf("@SQ\tSN:%s\tLN:%d", bns->anns[i].name, bns->anns[i].len);
if (bns->anns[i].is_alt) err_printf("\tAH:*\n");
else err_fputc('\n', stdout);
}
} else if (n_SQ != bns->n_seqs && bwa_verbose >= 2)
fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs);
if (n_HD == 0) {
err_printf("@HD\tVN:1.5\tSO:unsorted\tGO:query\n");
}

if (n_SQ != 0 && n_SQ != bns->n_seqs && bwa_verbose >= 2)
fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs);

if (bns_hdr) err_printf("%s\n", bns_hdr);
if (hdr_line) err_printf("%s\n", hdr_line);
if (bwa_pg) err_printf("%s\n", bwa_pg);
}

void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
{
print_sam_hdr(bns, NULL, hdr_line);
}

void bwa_print_sam_hdr2(const bntseq_t *bns, const char *prefix, const char *hdr_line)
{
kstring_t str = { 0, 0, NULL };
char *bns_hdr = NULL;

// Ignore index .hdr/.dict file entirely if the user's headers provide @SQ lines
if (!has_SQ(hdr_line)) {
FILE *fp;
// Otherwise read the .hdr file if present
ksprintf(&str, "%s.hdr", prefix);
fp = fopen(str.s, "r");
if (!fp) { // Otherwise also try .dict if present
size_t l;
str.l -= 4;
if (str.l >= 3 && strncmp(&str.s[str.l-3], ".gz", 3) == 0) str.l -= 3;
for (l = str.l; l > 0 && str.s[l-1] != '/'; l--)
if (str.s[l-1] == '.') { str.l = l-1; break; }
ksprintf(&str, ".dict");
fp = fopen(str.s, "r");
}
if (fp) {
int c, n_SQ;
str.l = 0;
while ((c = getc(fp)) != EOF)
if (c != '\r') kputc(c, &str);
fclose(fp);

while (str.l > 0 && str.s[str.l-1] == '\n') str.l--;
str.s[str.l] = '\0';
if (str.l > 0) bns_hdr = str.s;

n_SQ = count_SQ(str.s);
if (n_SQ != 0 && n_SQ != bns->n_seqs && bwa_verbose >= 2)
fprintf(stderr, "[W::%s] %d @SQ lines provided in \"%s.hdr\"; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, prefix, bns->n_seqs);
}
}

print_sam_hdr(bns, bns_hdr, hdr_line);
free(str.s);
}

static char *bwa_escape(char *s)
{
char *p, *q;
Expand Down
1 change: 1 addition & 0 deletions bwa.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ extern "C" {
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);

void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line);
void bwa_print_sam_hdr2(const bntseq_t *bns, const char *prefix, const char *hdr_line);
char *bwa_set_rg(const char *s);
char *bwa_insert_header(const char *s, char *hdr);

Expand Down
2 changes: 1 addition & 1 deletion bwape.c
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
}

// core loop
bwa_print_sam_hdr(bns, rg_line);
bwa_print_sam_hdr2(bns, prefix, rg_line);
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
int cnt_chg;
isize_info_t ii;
Expand Down
2 changes: 1 addition & 1 deletion bwase.c
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
exit(1);
}
err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa);
bwa_print_sam_hdr(bns, rg_line);
bwa_print_sam_hdr2(bns, prefix, rg_line);
// set ks
ks = bwa_open_reads(opt.mode, fn_fa);
// core loop
Expand Down
2 changes: 1 addition & 1 deletion fastmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ int main_mem(int argc, char *argv[])
opt->flag |= MEM_F_PE;
}
}
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
bwa_print_sam_hdr2(aux.idx->bns, argv[optind], hdr_line);
aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
free(hdr_line);
Expand Down