Permalink
Browse files

simple ace2sam converter

  • Loading branch information...
Heng Li
Heng Li committed Sep 23, 2011
1 parent dd1ac67 commit e8db9defffd5beb6b4a4947c12b2b94290c44d03
Showing with 532 additions and 0 deletions.
  1. +23 −0 seq/ace2sam/README
  2. +142 −0 seq/ace2sam/ace2sam.c
  3. +224 −0 seq/ace2sam/kseq.h
  4. +143 −0 seq/ace2sam/kstring.h
View
@@ -0,0 +1,23 @@
+To compile:
+
+ gcc -g -O2 -Wall -o ace2sam ace2sam.c -lz
+
+To run:
+
+ ace2sam in.ace > a2s.out 2> a2s.err
+ (grep ^H a2s.err | sed s,^..,,; cat a2s.out) > a2s.sam
+ grep ^S a2s.err | sed s,^..,, > a2s.fa
+
+The immediate output of ace2sam is a SAM without header. Ace2sam works on a
+stream. It is unable to write the SAM header until it sees all the contigs.
+
+Assumptions about ACE:
+
+ * Fields appear in the following order: AS->(CO->[BQ]->(AF)->(RD->QA)).
+
+ * The order of reads in AF is identical to the order in RD.
+
+ * Words and numbers are separated by a single space or TAB.
+
+ * Each line ends with '\n', not '\r' or '\r\n'.
+
View
@@ -0,0 +1,142 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include "kstring.h"
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define N_TMPSTR 5
+
+#define write_cigar(_c, _n, _m, _v) do { \
+ if (_n == _m) { \
+ _m = _m? _m<<1 : 4; \
+ _c = realloc(_c, _m * sizeof(unsigned)); \
+ } \
+ _c[_n++] = (_v); \
+ } while (0)
+
+int main(int argc, char *argv[])
+{
+ gzFile fp;
+ kstream_t *ks;
+ kstring_t s, t[N_TMPSTR];
+ int dret, i, af_n, af_max, af_i;
+ long n_ctgs = 0, n_reads = 0, m_cigar = 0, n_cigar = 0;
+ unsigned *af, *cigar = 0;
+
+ if (argc == 1) {
+ fprintf(stderr, "Usage: ace2sam <in.ace>\n");
+ return 1;
+ }
+ s.l = s.m = 0; s.s = 0;
+ af_n = af_max = af_i = 0; af = 0;
+ for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0;
+ fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
+ if (strcmp(s.s, "AS") == 0) {
+ ks_getuntil(ks, 0, &s, &dret); n_ctgs = atol(s.s);
+ ks_getuntil(ks, 0, &s, &dret); n_reads = atol(s.s);
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ } else if (strcmp(s.s, "CO") == 0) {
+ t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded; 2: unpadded; 3: CIGAR/flat-cigar; 4: SAM line
+ af_n = af_i = 0;
+ ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]);
+ ks_getuntil(ks, 0, &s, &dret);
+ ks_getuntil(ks, 0, &s, &dret); n_reads = atoi(s.s);
+ ks_getuntil(ks, '\n', &s, &dret);
+ fprintf(stderr, "S >%s\n", t[0].s);
+ while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) {
+ kputs(s.s, &t[1]);
+ fputs("S ", stderr); fputs(s.s, stderr); fputc('\n', stderr);
+ }
+
+#define __padded2cigar(sp, su) do { \
+ int i, l_M = 0, l_D = 0; \
+ kputsn(sp.s, sp.l, &su); su.l = 0; \
+ for (i = 0; i < sp.l; ++i) { \
+ if (sp.s[i] == '*') { \
+ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
+ ++l_D; l_M = 0; \
+ } else { \
+ if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
+ ++l_M; l_D = 0; \
+ su.s[su.l++] = sp.s[i]; \
+ } \
+ } \
+ su.s[su.l] = 0; \
+ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
+ else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
+ } while (0)
+
+ n_cigar = 0;
+ __padded2cigar(t[1], t[2]);
+ for (i = 0; i < n_cigar; ++i) {
+ kputw(cigar[i]>>4, &t[3]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[3]);
+ }
+ kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]);
+ kputsn(t[3].s, t[3].l, &t[4]); kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]);
+ fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[1].l);
+ } else if (strcmp(s.s, "BQ") == 0) {
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ t[4].s[--t[4].l] = 0;
+ for (i = 0; i < t[2].l; ++i) {
+ int q;
+ if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(stderr, "E truncated contig quality\n");
+ q = atoi(s.s) + 33;
+ if (q > 126) q = 126;
+ kputc(q, &t[4]);
+ }
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ ks_getuntil(ks, '\n', &s, &dret);
+ puts(t[4].s); t[4].l = 0;
+ } else if (strcmp(s.s, "AF") == 0) { // read coordinate
+ int reversed, neg, pos;
+ if (t[4].l) puts(t[4].s);
+ t[4].l = 0;
+ ks_getuntil(ks, 0, &s, &dret);
+ ks_getuntil(ks, 0, &s, &dret); reversed = s.s[0] == 'C'? 1 : 0;
+ ks_getuntil(ks, 0, &s, &dret); pos = atoi(s.s); neg = pos < 0? 1 : 0; pos = pos < 0? -pos : pos;
+ if (af_n == af_max) {
+ af_max = af_max? af_max<<1 : 4;
+ af = realloc(af, af_max * sizeof(unsigned));
+ }
+ af[af_n++] = pos << 2 | neg << 1 | reversed;
+ } else if (strcmp(s.s, "RD") == 0) { // read sequence
+ t[2].l = t[3].l = t[4].l = 0;
+ ks_getuntil(ks, 0, &t[4], &dret); kputc('\t', &t[4]); // read name
+ kputw((af[af_i]&1)? 16 : 0, &t[4]); kputc('\t', &t[4]);
+ kputsn(t[0].s, t[0].l, &t[4]); kputc('\t', &t[4]); // the SAM line stops at RNAME
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputs(s.s, &t[2]);
+ } else if (strcmp(s.s, "QA") == 0) { // clipping
+ int beg, end, pos;
+ ks_getuntil(ks, 0, &s, &dret); ks_getuntil(ks, 0, &s, &dret); // skip quality clipping
+ ks_getuntil(ks, 0, &s, &dret); beg = atoi(s.s) - 1; // align clipping start
+ ks_getuntil(ks, 0, &s, &dret); end = atoi(s.s);
+ pos = af[af_i]>>2;
+ if (af[af_i]>>1&1) pos = -pos;
+ pos += beg;
+ kputw(pos, &t[4]); kputs("\t60\t", &t[4]);
+ n_cigar = 0; // start to generate CIGAR
+ if (beg) write_cigar(cigar, n_cigar, m_cigar, beg<<4|4);
+ __padded2cigar(t[2], t[3]);
+ if (end < t[2].l) write_cigar(cigar, n_cigar, m_cigar, (t[2].l-end)<<4|4);
+ if (n_cigar > 1 && (cigar[0]&0xf) == 4) cigar[1] -= beg<<4;
+ if (n_cigar > 1 && (cigar[n_cigar-1]&0xf) == 4) cigar[n_cigar-2] -= cigar[n_cigar-1]>>4<<4;
+ for (i = 0; i < n_cigar; ++i) {
+ kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
+ }
+ kputs("\t*\t0\t0\t", &t[4]);
+ kputsn(t[3].s, t[3].l, &t[4]); kputs("\t*", &t[4]);
+ puts(t[4].s);
+ ++af_i;
+ } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(af); free(s.s); free(cigar);
+ for (i = 0; i < N_TMPSTR; ++i) free(t[i].s);
+ return 0;
+}
View
@@ -0,0 +1,224 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Last Modified: 18AUG2011 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB 1 // isspace() && !' '
+#define KS_SEP_MAX 1
+
+#define __KS_TYPE(type_t) \
+ typedef struct __kstream_t { \
+ unsigned char *buf; \
+ int begin, end, is_eof; \
+ type_t f; \
+ } kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(type_t, __bufsize) \
+ static inline kstream_t *ks_init(type_t f) \
+ { \
+ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+ ks->f = f; \
+ ks->buf = malloc(__bufsize); \
+ return ks; \
+ } \
+ static inline void ks_destroy(kstream_t *ks) \
+ { \
+ if (ks) { \
+ free(ks->buf); \
+ free(ks); \
+ } \
+ }
+
+#define __KS_GETC(__read, __bufsize) \
+ static inline int ks_getc(kstream_t *ks) \
+ { \
+ if (ks->is_eof && ks->begin >= ks->end) return -1; \
+ if (ks->begin >= ks->end) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end < __bufsize) ks->is_eof = 1; \
+ if (ks->end == 0) return -1; \
+ } \
+ return (int)ks->buf[ks->begin++]; \
+ }
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize) \
+ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+ { \
+ if (dret) *dret = 0; \
+ str->l = append? str->l : 0; \
+ if (ks->begin >= ks->end && ks->is_eof) return -1; \
+ for (;;) { \
+ int i; \
+ if (ks->begin >= ks->end) { \
+ if (!ks->is_eof) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end < __bufsize) ks->is_eof = 1; \
+ if (ks->end == 0) break; \
+ } else break; \
+ } \
+ if (delimiter > KS_SEP_MAX) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == delimiter) break; \
+ } else if (delimiter == KS_SEP_SPACE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i])) break; \
+ } else if (delimiter == KS_SEP_TAB) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+ } else i = 0; /* never come to here! */ \
+ if (str->m - str->l < i - ks->begin + 1) { \
+ str->m = str->l + (i - ks->begin) + 1; \
+ kroundup32(str->m); \
+ str->s = (char*)realloc(str->s, str->m); \
+ } \
+ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+ str->l = str->l + (i - ks->begin); \
+ ks->begin = i + 1; \
+ if (i < ks->end) { \
+ if (dret) *dret = ks->buf[i]; \
+ break; \
+ } \
+ } \
+ if (str->s == 0) { \
+ str->m = 1; \
+ str->s = (char*)calloc(1, 1); \
+ } \
+ str->s[str->l] = '\0'; \
+ return str->l; \
+ } \
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+ __KS_TYPE(type_t) \
+ __KS_BASIC(type_t, __bufsize) \
+ __KS_GETC(__read, __bufsize) \
+ __KS_GETUNTIL(__read, __bufsize)
+
+#define __KSEQ_BASIC(type_t) \
+ static inline kseq_t *kseq_init(type_t fd) \
+ { \
+ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+ s->f = ks_init(fd); \
+ return s; \
+ } \
+ static inline void kseq_rewind(kseq_t *ks) \
+ { \
+ ks->last_char = 0; \
+ ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
+ } \
+ static inline void kseq_destroy(kseq_t *ks) \
+ { \
+ if (!ks) return; \
+ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+ ks_destroy(ks->f); \
+ free(ks); \
+ }
+
+/* Return value:
+ >=0 length of the sequence (normal)
+ -1 end-of-file
+ -2 truncated quality string
+ */
+#define __KSEQ_READ \
+ static int kseq_read(kseq_t *seq) \
+ { \
+ int c; \
+ kstream_t *ks = seq->f; \
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+ if (c == -1) return -1; /* end of file */ \
+ seq->last_char = c; \
+ } /* else: the first header char has been read in the previous call */ \
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+ if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+ seq->seq.m = 256; \
+ seq->seq.s = (char*)malloc(seq->seq.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+ ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \
+ } \
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+ seq->seq.m = seq->seq.l + 2; \
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+ } \
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+ if (c != '+') return seq->seq.l; /* FASTA */ \
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+ seq->qual.m = seq->seq.m; \
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+ if (c == -1) return -2; /* error: no quality string */ \
+ while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+ seq->last_char = 0; /* we have not come to the next header line */ \
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+ return seq->seq.l; \
+ }
+
+#define __KSEQ_TYPE(type_t) \
+ typedef struct { \
+ kstring_t name, comment, seq, qual; \
+ int last_char; \
+ kstream_t *f; \
+ } kseq_t;
+
+#define KSEQ_INIT(type_t, __read) \
+ KSTREAM_INIT(type_t, __read, 4096) \
+ __KSEQ_TYPE(type_t) \
+ __KSEQ_BASIC(type_t) \
+ __KSEQ_READ
+
+#endif
Oops, something went wrong.

0 comments on commit e8db9de

Please sign in to comment.