Permalink
Please sign in to comment.
Showing
with
678 additions
and 11 deletions.
36
bwtindex.c
| @@ -0,0 +1,191 @@ | ||
| +#include <string.h> | ||
| +#include <assert.h> | ||
| +#include <stdlib.h> | ||
| +#include <stdio.h> | ||
| +#include "rle.h" | ||
| + | ||
| +const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 }; | ||
| + | ||
| +// insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase | ||
| +int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]) | ||
| +{ | ||
| + uint16_t *nptr = (uint16_t*)block; | ||
| + int diff; | ||
| + | ||
| + block += 2; // skip the first 2 counting bytes | ||
| + if (*nptr == 0) { | ||
| + memset(cnt, 0, 48); | ||
| + diff = rle_enc1(block, a, rl); | ||
| + } else { | ||
| + uint8_t *p, *end = block + *nptr, *q; | ||
| + int64_t pre, z, l = 0, tot, beg_l; | ||
| + int c = -1, n_bytes = 0, n_bytes2, t = 0; | ||
| + uint8_t tmp[24]; | ||
| + beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5]; | ||
| + tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5]; | ||
| + if (x < beg_l) { | ||
| + beg_l = 0, *beg = 0; | ||
| + memset(bc, 0, 48); | ||
| + } | ||
| + if (x == beg_l) { | ||
| + p = q = block + (*beg); z = beg_l; | ||
| + memcpy(cnt, bc, 48); | ||
| + } else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward | ||
| + z = beg_l; p = block + (*beg); | ||
| + memcpy(cnt, bc, 48); | ||
| + while (z < x) { | ||
| + rle_dec1(p, c, l); | ||
| + z += l; cnt[c] += l; | ||
| + } | ||
| + for (q = p - 1; *q>>6 == 2; --q); | ||
| + } else { // backward | ||
| + memcpy(cnt, ec, 48); | ||
| + z = tot; p = end; | ||
| + while (z >= x) { | ||
| + --p; | ||
| + if (*p>>6 != 2) { | ||
| + l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; | ||
| + z -= l; cnt[*p&7] -= l; | ||
| + l = 0; t = 0; | ||
| + } else { | ||
| + l |= (*p&0x3fL) << t; | ||
| + t += 6; | ||
| + } | ||
| + } | ||
| + q = p; | ||
| + rle_dec1(p, c, l); | ||
| + z += l; cnt[c] += l; | ||
| + } | ||
| + *beg = q - block; | ||
| + memcpy(bc, cnt, 48); | ||
| + bc[c] -= l; | ||
| + n_bytes = p - q; | ||
| + if (x == z && a != c && p < end) { // then try the next run | ||
| + int tc; | ||
| + int64_t tl; | ||
| + q = p; | ||
| + rle_dec1(q, tc, tl); | ||
| + if (a == tc) | ||
| + c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl; | ||
| + } | ||
| + if (z != x) cnt[c] -= z - x; | ||
| + pre = x - (z - l); p -= n_bytes; | ||
| + if (a == c) { // insert to the same run | ||
| + n_bytes2 = rle_enc1(tmp, c, l + rl); | ||
| + } else if (x == z) { // at the end; append to the existing run | ||
| + p += n_bytes; n_bytes = 0; | ||
| + n_bytes2 = rle_enc1(tmp, a, rl); | ||
| + } else { // break the current run | ||
| + n_bytes2 = rle_enc1(tmp, c, pre); | ||
| + n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl); | ||
| + n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre); | ||
| + } | ||
| + if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed | ||
| + memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes); | ||
| + memcpy(p, tmp, n_bytes2); | ||
| + diff = n_bytes2 - n_bytes; | ||
| + } | ||
| + return (*nptr += diff); | ||
| +} | ||
| + | ||
| +int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6]) | ||
| +{ | ||
| + int beg = 0; | ||
| + int64_t bc[6]; | ||
| + memset(bc, 0, 48); | ||
| + return rle_insert_cached(block, x, a, rl, cnt, ec, &beg, bc); | ||
| +} | ||
| + | ||
| +void rle_split(uint8_t *block, uint8_t *new_block) | ||
| +{ | ||
| + int n = *(uint16_t*)block; | ||
| + uint8_t *end = block + 2 + n, *q = block + 2 + (n>>1); | ||
| + while (*q>>6 == 2) --q; | ||
| + memcpy(new_block + 2, q, end - q); | ||
| + *(uint16_t*)new_block = end - q; | ||
| + *(uint16_t*)block = q - block - 2; | ||
| +} | ||
| + | ||
| +void rle_count(const uint8_t *block, int64_t cnt[6]) | ||
| +{ | ||
| + const uint8_t *q = block + 2, *end = q + *(uint16_t*)block; | ||
| + while (q < end) { | ||
| + int c; | ||
| + int64_t l; | ||
| + rle_dec1(q, c, l); | ||
| + cnt[c] += l; | ||
| + } | ||
| +} | ||
| + | ||
| +void rle_print(const uint8_t *block, int expand) | ||
| +{ | ||
| + const uint16_t *p = (const uint16_t*)block; | ||
| + const uint8_t *q = block + 2, *end = block + 2 + *p; | ||
| + while (q < end) { | ||
| + int c; | ||
| + int64_t l, x; | ||
| + rle_dec1(q, c, l); | ||
| + if (expand) for (x = 0; x < l; ++x) putchar("$ACGTN"[c]); | ||
| + else printf("%c%ld", "$ACGTN"[c], (long)l); | ||
| + } | ||
| + putchar('\n'); | ||
| +} | ||
| + | ||
| +void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]) | ||
| +{ | ||
| + int a; | ||
| + int64_t tot, cnt[6]; | ||
| + const uint8_t *p; | ||
| + | ||
| + y = y >= x? y : x; | ||
| + tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5]; | ||
| + if (tot == 0) return; | ||
| + if (x <= (tot - y) + (tot>>3)) { | ||
| + int c = 0; | ||
| + int64_t l, z = 0; | ||
| + memset(cnt, 0, 48); | ||
| + p = block + 2; | ||
| + while (z < x) { | ||
| + rle_dec1(p, c, l); | ||
| + z += l; cnt[c] += l; | ||
| + } | ||
| + for (a = 0; a != 6; ++a) cx[a] += cnt[a]; | ||
| + cx[c] -= z - x; | ||
| + if (cy) { | ||
| + while (z < y) { | ||
| + rle_dec1(p, c, l); | ||
| + z += l; cnt[c] += l; | ||
| + } | ||
| + for (a = 0; a != 6; ++a) cy[a] += cnt[a]; | ||
| + cy[c] -= z - y; | ||
| + } | ||
| + } else { | ||
| +#define move_backward(_x) \ | ||
| + while (z >= (_x)) { \ | ||
| + --p; \ | ||
| + if (*p>>6 != 2) { \ | ||
| + l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; \ | ||
| + z -= l; cnt[*p&7] -= l; \ | ||
| + l = 0; t = 0; \ | ||
| + } else { \ | ||
| + l |= (*p&0x3fL) << t; \ | ||
| + t += 6; \ | ||
| + } \ | ||
| + } \ | ||
| + | ||
| + int t = 0; | ||
| + int64_t l = 0, z = tot; | ||
| + memcpy(cnt, ec, 48); | ||
| + p = block + 2 + *(const uint16_t*)block; | ||
| + if (cy) { | ||
| + move_backward(y) | ||
| + for (a = 0; a != 6; ++a) cy[a] += cnt[a]; | ||
| + cy[*p&7] += y - z; | ||
| + } | ||
| + move_backward(x) | ||
| + for (a = 0; a != 6; ++a) cx[a] += cnt[a]; | ||
| + cx[*p&7] += x - z; | ||
| + | ||
| +#undef move_backward | ||
| + } | ||
| +} |
| @@ -0,0 +1,77 @@ | ||
| +#ifndef RLE6_H_ | ||
| +#define RLE6_H_ | ||
| + | ||
| +#include <stdint.h> | ||
| + | ||
| +#ifdef __GNUC__ | ||
| +#define LIKELY(x) __builtin_expect((x),1) | ||
| +#else | ||
| +#define LIKELY(x) (x) | ||
| +#endif | ||
| +#ifdef __cplusplus | ||
| + | ||
| +extern "C" { | ||
| +#endif | ||
| + | ||
| + int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]); | ||
| + int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t end_cnt[6]); | ||
| + void rle_split(uint8_t *block, uint8_t *new_block); | ||
| + void rle_count(const uint8_t *block, int64_t cnt[6]); | ||
| + void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]); | ||
| + #define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec) | ||
| + | ||
| + void rle_print(const uint8_t *block, int expand); | ||
| + | ||
| +#ifdef __cplusplus | ||
| +} | ||
| +#endif | ||
| + | ||
| +/****************** | ||
| + *** 43+3 codec *** | ||
| + ******************/ | ||
| + | ||
| +const uint8_t rle_auxtab[8]; | ||
| + | ||
| +#define RLE_MIN_SPACE 18 | ||
| +#define rle_nptr(block) ((uint16_t*)(block)) | ||
| + | ||
| +// decode one run (c,l) and move the pointer p | ||
| +#define rle_dec1(p, c, l) do { \ | ||
| + (c) = *(p) & 7; \ | ||
| + if (LIKELY((*(p)&0x80) == 0)) { \ | ||
| + (l) = *(p)++ >> 3; \ | ||
| + } else if (LIKELY(*(p)>>5 == 6)) { \ | ||
| + (l) = (*(p)&0x18L)<<3L | ((p)[1]&0x3fL); \ | ||
| + (p) += 2; \ | ||
| + } else { \ | ||
| + int n = ((*(p)&0x10) >> 2) + 4; \ | ||
| + (l) = *(p)++ >> 3 & 1; \ | ||
| + while (--n) (l) = ((l)<<6) | (*(p)++&0x3fL); \ | ||
| + } \ | ||
| + } while (0) | ||
| + | ||
| +static inline int rle_enc1(uint8_t *p, int c, int64_t l) | ||
| +{ | ||
| + if (l < 1LL<<4) { | ||
| + *p = l << 3 | c; | ||
| + return 1; | ||
| + } else if (l < 1LL<<8) { | ||
| + *p = 0xC0 | l >> 6 << 3 | c; | ||
| + p[1] = 0x80 | (l & 0x3f); | ||
| + return 2; | ||
| + } else if (l < 1LL<<19) { | ||
| + *p = 0xE0 | l >> 18 << 3 | c; | ||
| + p[1] = 0x80 | (l >> 12 & 0x3f); | ||
| + p[2] = 0x80 | (l >> 6 & 0x3f); | ||
| + p[3] = 0x80 | (l & 0x3f); | ||
| + return 4; | ||
| + } else { | ||
| + int i, shift = 36; | ||
| + *p = 0xF0 | l >> 42 << 3 | c; | ||
| + for (i = 1; i < 8; ++i, shift -= 6) | ||
| + p[i] = 0x80 | (l>>shift & 0x3f); | ||
| + return 8; | ||
| + } | ||
| +} | ||
| + | ||
| +#endif |
Oops, something went wrong.
0 comments on commit
be2020c