Permalink
Browse files

ropebwt2 for bwt construction

1 parent 6c668d3 commit be2020c4af30f9dee417762d3dae8eb00a0309a3 @lh3 committed Jan 26, 2016
Showing with 678 additions and 11 deletions.
  1. +1 −1 Makefile
  2. +4 −3 bwa.1
  3. +29 −7 bwtindex.c
  4. +191 −0 rle.c
  5. +77 −0 rle.h
  6. +318 −0 rope.c
  7. +58 −0 rope.h
View
@@ -5,7 +5,7 @@ WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
-AOBJS= QSufSort.o bwt_gen.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
+AOBJS= QSufSort.o bwt_gen.o rope.o rle.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o maxk.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
View
7 bwa.1
@@ -60,11 +60,12 @@ Index database sequences in the FASTA format.
Prefix of the output database [same as db filename]
.TP
.BI -a \ STR
-Algorithm for constructing BWT index. BWA implements two algorithms for BWT
+Algorithm for constructing BWT index. BWA implements three algorithms for BWT
construction:
-.B is
+.BR is ,
+.B bwtsw
and
-.BR bwtsw .
+.BR rb2 .
The first algorithm is a little faster for small database but requires large
RAM and does not work for databases with total length longer than 2GB. The
second algorithm is adapted from the BWT-SW source code. It in theory works
View
@@ -34,6 +34,8 @@
#include "bntseq.h"
#include "bwt.h"
#include "utils.h"
+#include "rle.h"
+#include "rope.h"
#ifdef _DIVBWT
#include "divsufsort.h"
@@ -90,11 +92,31 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
if (use_is) {
bwt->primary = is_bwt(buf, bwt->seq_len);
} else {
-#ifdef _DIVBWT
- bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
-#else
- err_fatal_simple("libdivsufsort is not compiled in.");
-#endif
+ rope_t *r;
+ int64_t x;
+ rpitr_t itr;
+ const uint8_t *blk;
+
+ r = rope_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN);
+ for (i = bwt->seq_len - 1, x = 0; i >= 0; --i) {
+ int c = buf[i] + 1;
+ x = rope_insert_run(r, x, c, 1, 0) + 1;
+ while (--c >= 0) x += r->c[c];
+ }
+ bwt->primary = x;
+ rope_itr_first(r, &itr);
+ x = 0;
+ while ((blk = rope_itr_next_block(&itr)) != 0) {
+ const uint8_t *q = blk + 2, *end = blk + 2 + *rle_nptr(blk);
+ while (q < end) {
+ int c = 0;
+ int64_t l;
+ rle_dec1(q, c, l);
+ for (i = 0; i < l; ++i)
+ buf[x++] = c - 1;
+ }
+ }
+ rope_destroy(r);
}
bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
for (i = 0; i < bwt->seq_len; ++i)
@@ -196,7 +218,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
switch (c) {
case 'a': // if -a is not set, algo_type will be determined later
- if (strcmp(optarg, "div") == 0) algo_type = 1;
+ if (strcmp(optarg, "rb2") == 0) algo_type = 1;
else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2;
else if (strcmp(optarg, "is") == 0) algo_type = 3;
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
@@ -216,7 +238,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
if (optind + 1 > argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
- fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
+ fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n");
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
View
191 rle.c
@@ -0,0 +1,191 @@
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "rle.h"
+
+const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 };
+
+// insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase
+int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6])
+{
+ uint16_t *nptr = (uint16_t*)block;
+ int diff;
+
+ block += 2; // skip the first 2 counting bytes
+ if (*nptr == 0) {
+ memset(cnt, 0, 48);
+ diff = rle_enc1(block, a, rl);
+ } else {
+ uint8_t *p, *end = block + *nptr, *q;
+ int64_t pre, z, l = 0, tot, beg_l;
+ int c = -1, n_bytes = 0, n_bytes2, t = 0;
+ uint8_t tmp[24];
+ beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5];
+ tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
+ if (x < beg_l) {
+ beg_l = 0, *beg = 0;
+ memset(bc, 0, 48);
+ }
+ if (x == beg_l) {
+ p = q = block + (*beg); z = beg_l;
+ memcpy(cnt, bc, 48);
+ } else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward
+ z = beg_l; p = block + (*beg);
+ memcpy(cnt, bc, 48);
+ while (z < x) {
+ rle_dec1(p, c, l);
+ z += l; cnt[c] += l;
+ }
+ for (q = p - 1; *q>>6 == 2; --q);
+ } else { // backward
+ memcpy(cnt, ec, 48);
+ z = tot; p = end;
+ while (z >= x) {
+ --p;
+ if (*p>>6 != 2) {
+ l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3;
+ z -= l; cnt[*p&7] -= l;
+ l = 0; t = 0;
+ } else {
+ l |= (*p&0x3fL) << t;
+ t += 6;
+ }
+ }
+ q = p;
+ rle_dec1(p, c, l);
+ z += l; cnt[c] += l;
+ }
+ *beg = q - block;
+ memcpy(bc, cnt, 48);
+ bc[c] -= l;
+ n_bytes = p - q;
+ if (x == z && a != c && p < end) { // then try the next run
+ int tc;
+ int64_t tl;
+ q = p;
+ rle_dec1(q, tc, tl);
+ if (a == tc)
+ c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl;
+ }
+ if (z != x) cnt[c] -= z - x;
+ pre = x - (z - l); p -= n_bytes;
+ if (a == c) { // insert to the same run
+ n_bytes2 = rle_enc1(tmp, c, l + rl);
+ } else if (x == z) { // at the end; append to the existing run
+ p += n_bytes; n_bytes = 0;
+ n_bytes2 = rle_enc1(tmp, a, rl);
+ } else { // break the current run
+ n_bytes2 = rle_enc1(tmp, c, pre);
+ n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl);
+ n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre);
+ }
+ if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed
+ memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes);
+ memcpy(p, tmp, n_bytes2);
+ diff = n_bytes2 - n_bytes;
+ }
+ return (*nptr += diff);
+}
+
+int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6])
+{
+ int beg = 0;
+ int64_t bc[6];
+ memset(bc, 0, 48);
+ return rle_insert_cached(block, x, a, rl, cnt, ec, &beg, bc);
+}
+
+void rle_split(uint8_t *block, uint8_t *new_block)
+{
+ int n = *(uint16_t*)block;
+ uint8_t *end = block + 2 + n, *q = block + 2 + (n>>1);
+ while (*q>>6 == 2) --q;
+ memcpy(new_block + 2, q, end - q);
+ *(uint16_t*)new_block = end - q;
+ *(uint16_t*)block = q - block - 2;
+}
+
+void rle_count(const uint8_t *block, int64_t cnt[6])
+{
+ const uint8_t *q = block + 2, *end = q + *(uint16_t*)block;
+ while (q < end) {
+ int c;
+ int64_t l;
+ rle_dec1(q, c, l);
+ cnt[c] += l;
+ }
+}
+
+void rle_print(const uint8_t *block, int expand)
+{
+ const uint16_t *p = (const uint16_t*)block;
+ const uint8_t *q = block + 2, *end = block + 2 + *p;
+ while (q < end) {
+ int c;
+ int64_t l, x;
+ rle_dec1(q, c, l);
+ if (expand) for (x = 0; x < l; ++x) putchar("$ACGTN"[c]);
+ else printf("%c%ld", "$ACGTN"[c], (long)l);
+ }
+ putchar('\n');
+}
+
+void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6])
+{
+ int a;
+ int64_t tot, cnt[6];
+ const uint8_t *p;
+
+ y = y >= x? y : x;
+ tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
+ if (tot == 0) return;
+ if (x <= (tot - y) + (tot>>3)) {
+ int c = 0;
+ int64_t l, z = 0;
+ memset(cnt, 0, 48);
+ p = block + 2;
+ while (z < x) {
+ rle_dec1(p, c, l);
+ z += l; cnt[c] += l;
+ }
+ for (a = 0; a != 6; ++a) cx[a] += cnt[a];
+ cx[c] -= z - x;
+ if (cy) {
+ while (z < y) {
+ rle_dec1(p, c, l);
+ z += l; cnt[c] += l;
+ }
+ for (a = 0; a != 6; ++a) cy[a] += cnt[a];
+ cy[c] -= z - y;
+ }
+ } else {
+#define move_backward(_x) \
+ while (z >= (_x)) { \
+ --p; \
+ if (*p>>6 != 2) { \
+ l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; \
+ z -= l; cnt[*p&7] -= l; \
+ l = 0; t = 0; \
+ } else { \
+ l |= (*p&0x3fL) << t; \
+ t += 6; \
+ } \
+ } \
+
+ int t = 0;
+ int64_t l = 0, z = tot;
+ memcpy(cnt, ec, 48);
+ p = block + 2 + *(const uint16_t*)block;
+ if (cy) {
+ move_backward(y)
+ for (a = 0; a != 6; ++a) cy[a] += cnt[a];
+ cy[*p&7] += y - z;
+ }
+ move_backward(x)
+ for (a = 0; a != 6; ++a) cx[a] += cnt[a];
+ cx[*p&7] += x - z;
+
+#undef move_backward
+ }
+}
View
77 rle.h
@@ -0,0 +1,77 @@
+#ifndef RLE6_H_
+#define RLE6_H_
+
+#include <stdint.h>
+
+#ifdef __GNUC__
+#define LIKELY(x) __builtin_expect((x),1)
+#else
+#define LIKELY(x) (x)
+#endif
+#ifdef __cplusplus
+
+extern "C" {
+#endif
+
+ int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]);
+ int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t end_cnt[6]);
+ void rle_split(uint8_t *block, uint8_t *new_block);
+ void rle_count(const uint8_t *block, int64_t cnt[6]);
+ void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]);
+ #define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec)
+
+ void rle_print(const uint8_t *block, int expand);
+
+#ifdef __cplusplus
+}
+#endif
+
+/******************
+ *** 43+3 codec ***
+ ******************/
+
+const uint8_t rle_auxtab[8];
+
+#define RLE_MIN_SPACE 18
+#define rle_nptr(block) ((uint16_t*)(block))
+
+// decode one run (c,l) and move the pointer p
+#define rle_dec1(p, c, l) do { \
+ (c) = *(p) & 7; \
+ if (LIKELY((*(p)&0x80) == 0)) { \
+ (l) = *(p)++ >> 3; \
+ } else if (LIKELY(*(p)>>5 == 6)) { \
+ (l) = (*(p)&0x18L)<<3L | ((p)[1]&0x3fL); \
+ (p) += 2; \
+ } else { \
+ int n = ((*(p)&0x10) >> 2) + 4; \
+ (l) = *(p)++ >> 3 & 1; \
+ while (--n) (l) = ((l)<<6) | (*(p)++&0x3fL); \
+ } \
+ } while (0)
+
+static inline int rle_enc1(uint8_t *p, int c, int64_t l)
+{
+ if (l < 1LL<<4) {
+ *p = l << 3 | c;
+ return 1;
+ } else if (l < 1LL<<8) {
+ *p = 0xC0 | l >> 6 << 3 | c;
+ p[1] = 0x80 | (l & 0x3f);
+ return 2;
+ } else if (l < 1LL<<19) {
+ *p = 0xE0 | l >> 18 << 3 | c;
+ p[1] = 0x80 | (l >> 12 & 0x3f);
+ p[2] = 0x80 | (l >> 6 & 0x3f);
+ p[3] = 0x80 | (l & 0x3f);
+ return 4;
+ } else {
+ int i, shift = 36;
+ *p = 0xF0 | l >> 42 << 3 | c;
+ for (i = 1; i < 8; ++i, shift -= 6)
+ p[i] = 0x80 | (l>>shift & 0x3f);
+ return 8;
+ }
+}
+
+#endif
Oops, something went wrong.

0 comments on commit be2020c

Please sign in to comment.