Skip to content
Browse files

prepare to replace the SAM printing code

This move is dangerous as SAM printing is very complex, but it will benefit in
the long run. The planned change will reduce the redundancy, improves clarity
and most importantly makes it much easier to output multiple primary hits in an
optional tag.
  • Loading branch information...
1 parent 5581cb9 commit 8f0d43991356bd26bcb8fc9fcea4bdbba566373e @lh3 committed
Showing with 124 additions and 7 deletions.
  1. +101 −5 bwamem.c
  2. +6 −2 bwamem.h
  3. +17 −0 kstring.h
View
106 bwamem.c
@@ -714,6 +714,95 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
#undef is_mapped
}
+void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m)
+{
+ int i, copy_mate = 0;
+ mem_aln_t ptmp = list[which], *p = &ptmp; // make a copy of the alignment to convert
+
+ // set flag
+ p->flag |= m? 0x1 : 0; // is paired in sequencing
+ p->flag |= p->rid < 0? 4 : 0; // is mapped
+ p->flag |= m && m->rid < 0? 8 : 0; // is mate mapped
+ if (p->rid < 0 && m && m->rid >= 0)
+ p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0, copy_mate = 1;
+ p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand
+ p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand
+ if (p->rid >= 0 && m && m->rid <= 0 && p->is_rev) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read
+
+ // print up to CIGAR
+ kputs(s->name, str); kputc('\t', str); // QNAME
+ kputw(p->flag, str); kputc('\t', str); // FLAG
+ if (p->rid >= 0) { // with coordinate
+ kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME
+ kputl(p->pos + 1, str); kputc('\t', str); // POS
+ kputw(p->mapq, str); kputc('\t', str); // MAPQ
+ if (p->n_cigar) { // aligned
+ for (i = 0; i < p->n_cigar; ++i) {
+ kputw(p->cigar[i]>>4, str); kputc("MIDSH"[p->cigar[i]&0xf], str);
+ }
+ } else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true)
+ } else kputsn("*\t0\t0\t*", 7, str); // without coordinte
+ kputc('\t', str);
+
+ // print the mate position if applicable
+ if (m && m->rid) {
+ if (p->rid == m->rid) kputc('=', str);
+ else kputs(bns->anns[m->rid].name, str);
+ kputc('\t', str);
+ kputl(m->pos + 1, str);
+ if (p->rid == m->rid) {
+ int64_t p0 = p->r5 < bns->l_pac? p->r5 : (bns->l_pac<<1) - 1 - p->r5;
+ int64_t p1 = m->r5 < bns->l_pac? m->r5 : (bns->l_pac<<1) - 1 - m->r5;
+ kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str);
+ } else kputc('0', str);
+ } else if (m && p->rid) {
+ kputsn("\t=\t", 3, str);
+ kputl(p->pos + 1, str);
+ kputsn("\t0\t", 3, str);
+ } else kputsn("*\t0\t0\t", 6, str);
+
+ // print SEQ and QUAL
+ if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL
+ kputsn("*\t*", 3, str);
+ } else if (!p->is_rev) { // the forward strand
+ int i, qb = 0, qe = s->l_seq;
+ if (p->n_cigar) {
+ if ((p->cigar[0]&0xf) == 4) qb += p->cigar[0]>>4;
+ if ((p->cigar[p->n_cigar-1]&0xf) == 4) qe -= p->cigar[p->n_cigar-1]>>4;
+ }
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]];
+ kputc('\t', str);
+ if (s->qual) { // printf qual
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i];
+ str->s[str->l] = 0;
+ } else kputc('*', str);
+ } else { // the reverse strand
+ int i, qb = 0, qe = s->l_seq;
+ if (p->n_cigar) {
+ if ((p->cigar[0]&0xf) == 4) qe -= p->cigar[0]>>4;
+ if ((p->cigar[p->n_cigar-1]&0xf) == 4) qb += p->cigar[p->n_cigar-1]>>4;
+ }
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]];
+ kputc('\t', str);
+ if (s->qual) { // printf qual
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i];
+ str->s[str->l] = 0;
+ } else kputc('*', str);
+ }
+
+ // print optional tags
+ if (p->n_cigar) { kputsn("\tNM:i:", 6, str); kputw(p->NM, str); }
+ if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
+ if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
+ if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
+ if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
+ kputc('\n', str);
+}
+
/************************
* Integrated interface *
************************/
@@ -816,14 +905,20 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
{
mem_aln_t a;
- int i, w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev;
- int64_t pos, rb = ar->rb, re = ar->re;
+ int i, w2, qb, qe, NM, score, is_rev;
+ int64_t pos, rb, re;
uint8_t *query;
+ memset(&a, 0, sizeof(mem_aln_t));
+ if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record
+ a.rid = -1; a.pos = -1; a.flag |= 0x4;
+ return a;
+ }
+ qb = ar->qb, qe = ar->qe;
+ rb = ar->rb, re = ar->re;
query = malloc(l_query);
for (i = 0; i < l_query; ++i) // convert to the nt4 encoding
query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
- memset(&a, 0, sizeof(mem_aln_t));
a.mapq = mem_approx_mapq_se(opt, ar);
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r);
@@ -839,13 +934,14 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
if (clip5) {
memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
- a.cigar[0] = clip5<<4|3;
+ a.cigar[0] = clip5<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3);
++a.n_cigar;
}
- if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3;
+ if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3);
}
a.rid = bns_pos2rid(bns, pos);
a.pos = pos - bns->anns[a.rid].offset;
+ a.r5 = rb; a.score = ar->score; a.sub = ar->sub;
free(query);
return a;
}
View
8 bwamem.h
@@ -67,11 +67,15 @@ typedef struct { // TODO: This is an intermediate struct only. Better get rid of
} bwahit_t;
typedef struct { // This struct is only used for the convenience of API.
- int rid; // reference sequence index in bntseq_t
- int pos; // forward strand 5'-end mapping position
+ int64_t pos; // forward strand 5'-end mapping position
+ int rid; // reference sequence index in bntseq_t; <0 for unmapped
+ int flag; // extra flag
uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
int n_cigar; // number of CIGAR operations
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
+
+ int64_t r5; // position of the 5'-end of read (for computing TLEN)
+ int score, sub;
} mem_aln_t;
#ifdef __cplusplus
View
17 kstring.h
@@ -89,6 +89,23 @@ static inline int kputuw(unsigned c, kstring_t *s)
return 0;
}
+static inline int kputl(long c, kstring_t *s)
+{
+ char buf[32];
+ long l, x;
+ if (c == 0) return kputc('0', s);
+ for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+ if (c < 0) buf[l++] = '-';
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
+ s->s[s->l] = 0;
+ return 0;
+}
+
int ksprintf(kstring_t *s, const char *fmt, ...);
#endif

0 comments on commit 8f0d439

Please sign in to comment.
Something went wrong with that request. Please try again.