From 91323f5fa5142239116048f5ef640a81acc64fbb Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 18 Feb 2020 11:35:53 +0000 Subject: [PATCH] Added support for big endian platforms. Mostly this is just via endian agnostic ways of doing things, but the 16-bit rans is slightly better if it can make endianness assumptions so we also now have a new header file to detect endianness. If the detection fails it uses the case code path as big endian, so this is safe. --- .travis.yml | 19 ++++++-- htscodecs/htscodecs_endian.h | 93 ++++++++++++++++++++++++++++++++++++ htscodecs/rANS_word.h | 67 ++++++++++++++++++-------- htscodecs/tokenise_name3.c | 33 ++++++++----- 4 files changed, 177 insertions(+), 35 deletions(-) create mode 100644 htscodecs/htscodecs_endian.h diff --git a/.travis.yml b/.travis.yml index 0c64de4..14d376f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,19 @@ language: c compiler: gcc -os: - - linux - - osx +matrix: + include: + # A big endian platform + - os: linux + arch: s390x -script: autoreconf -i; ./configure CFLAGS="-g -Wall -O3 -fsanitize=address" && make && (make check || (cat tests/test-suite.log; false)) + # A little endian platform running linux plus address sanitizer + - os: linux + compiler: gcc + env: SANITIZE=-fsanitize=address + + # A little endian platform running MacOS X + - os: osx + compiler: clang + +script: autoreconf -i; ./configure CFLAGS="-g -Wall -O3 $SANITIZE" LDFLAGS="$SANITIZE" && make && (make check || (cat tests/test-suite.log; false)) diff --git a/htscodecs/htscodecs_endian.h b/htscodecs/htscodecs_endian.h new file mode 100644 index 0000000..e038aec --- /dev/null +++ b/htscodecs/htscodecs_endian.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2020 Genome Research Ltd. + * Author(s): James Bonfield + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger + * Institute nor the names of its contributors may be used to endorse + * or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH + * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTSCODECS_ENDIAN_H +#define HTSCODECS_ENDIAN_H + +// Endianness checking. + +// Sets HTSCODECS_ENDIAN_KNOWN if system type detected and either +// HTSCODECS_LITTLE_ENDIAN or HTSCODECS_BIG_ENDIAN. + +/* + * In general our preferred route is to write code in an endian agnostic + * fashion, but our data formats are natively little endian. Therefore + * in time critical code it's sometimes best to exploit that. + * + * Therefore we'll optimise code along the lines of: + * + * #ifdef HTSCODECS_LITTLE_ENDIAN + * // do something little endian specific + * #else + * // do something in an endian agnostic fashion + * #endif + * + * This means our code works even if we fail to recognise the + * specific machine type. + */ + +#if (defined(__i386__) \ + || defined(__i386) \ + || defined(__amd64__) \ + || defined(__amd64) \ + || defined(__x86_64__) \ + || defined(__x86_64) \ + || defined(__i686__) \ + || defined(__i686)) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + || defined(__LITTLE_ENDIAN__) \ + || defined(__ARMEL__) \ + || defined(__THUMBEL__) \ + || defined(__AARCH64EL__) \ + || defined(_MIPSEL) \ + || defined(__MIPSEL) \ + || defined(__MIPSEL__) + // Little endian +# define HTSCODECS_LITTLE_ENDIAN +# define HTSCODECS_ENDIAN_KNOWN +#elif (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) \ + || defined(__BIG_ENDIAN__) \ + || defined(__ARMEB__) \ + || defined(__THUMBEB__) \ + || defined(__AAARCHEB__) \ + || defined(_MIPSEB) \ + || defined(__MIPSEB) \ + || defined(__MIPSEB__) + // Big endian +# define HTSCODECS_BIG_ENDIAN +# define HTSCODECS_ENDIAN_KNOWN +#else +// Unknown - code will need to check HTSCODES_ENDIAN_KNOWN and do endian agnostic +#endif + +#endif /* HTSCODECS_ENDIAN_H */ diff --git a/htscodecs/rANS_word.h b/htscodecs/rANS_word.h index f9c2d5b..0edfca4 100644 --- a/htscodecs/rANS_word.h +++ b/htscodecs/rANS_word.h @@ -19,6 +19,7 @@ #include #include #include +#include "htscodecs_endian.h" #ifdef assert #define RansAssert assert @@ -285,19 +286,22 @@ static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol uint32_t x = *r; uint32_t x_max = sym->x_max; -// uint32_t c = x < sym->x_max; -// uint16_t *p16 = (uint16_t *)(*pptr-2); -// uint32_t p1 = x, x1 = x >> 16; -// *p16 = c ? *p16 : p1; -// *pptr = c ? *pptr : (uint8_t *)p16; -// x = c ? x : x1; - +#ifdef HTSCODECS_LITTLE_ENDIAN + if (x >= x_max) { + (*pptr) -= 2; + **(uint16_t **)pptr = x; + x >>= 16; + } +#else if (x >= x_max) { - uint16_t* ptr = *(uint16_t **)pptr; - *--ptr = x;//(uint16_t) (x & 0xffff); + uint8_t* ptr = *pptr; + ptr -= 2; + ptr[0] = x & 0xff; + ptr[1] = (x >> 8) & 0xff; x >>= 16; - *pptr = (uint8_t *)ptr; + *pptr = ptr; } +#endif // x = C(s,x) // NOTE: written this way so we get a 32-bit "multiply high" when @@ -342,10 +346,8 @@ static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* s // Renormalize. -// FIXME: this is endian specific. We need big endian versions of these -// functions. - #if defined(__x86_64) && !defined(__ILP32__) + /* * Assembly variants of the RansDecRenorm code. * These are based on joint ideas from Rob Davies and from looking at @@ -381,16 +383,35 @@ static inline void RansDecRenorm(RansState* r, uint8_t** pptr) // renormalize uint32_t x = *r; - // q4 q40 - // clang 801/540 750/438 (generated cmov; equiv to asm above) - // gcc8 896/624 352/286 (conditionals) - uint16_t* ptr = *(uint16_t **)pptr; - uint32_t y = *ptr; + // Generally this is faster than asm for highly predictable data + // (branch prediction is efficient), but slower for complex data. + // + // However we'll be using RLE and/or PACK for highly predictable + // data, turning into smaller but less predictable as far as + // entropy encoding goes. So the latter is what we tune for. + // We do however still have to use this code on big endian systems. + + // Times for asm code (above) / this code (LE) / this code (BE). + // + // q4-0 q40-0 + // clang 648/903/904 648/392/391 this faster q4, slower q40 + // gcc7 650/851/851 650/394/393 + // q4-1 q40-1(OPT) + // clang 487/567/568 445/310/312 this faster q4, slower q40 + // gcc7 413/497/498 381/282/282 + // q4-193(OPT) + // clang 990/907/903 this slower + // gcc7 952/875/868 this slower +#ifdef HTSCODECS_LITTLE_ENDIAN + uint32_t y = **(uint16_t **)pptr; // 32-bit quicker here +#else + uint16_t y = **(uint16_t **)pptr; + y = (y<<8) | (y>>8); +#endif if (x < RANS_BYTE_L) - ptr++; + (*pptr)+=2; if (x < RANS_BYTE_L) x = (x << 16) | y; - *pptr = (uint8_t *)ptr; *r = x; } @@ -400,9 +421,15 @@ static inline void RansDecRenormSafe(RansState* r, uint8_t** pptr, uint8_t *ptr_ { uint32_t x = *r; if (x >= RANS_BYTE_L || *pptr+1 >= ptr_end) return; +#ifdef HTSCODECS_LITTLE_ENDIAN uint16_t* ptr = *(uint16_t **)pptr; x = (x << 16) | *ptr++; *pptr = (uint8_t *)ptr; +#else + uint16_t y = (*pptr)[0] + ((*pptr)[1]<<8); + x = (x << 16) | y; + (*pptr) += 2; +#endif *r = x; } diff --git a/htscodecs/tokenise_name3.c b/htscodecs/tokenise_name3.c index 4cb10d2..3e05e3f 100644 --- a/htscodecs/tokenise_name3.c +++ b/htscodecs/tokenise_name3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Genome Research Ltd. + * Copyright (c) 2016-2020 Genome Research Ltd. * Author(s): James Bonfield * * Redistribution and use in source and binary forms, with or without @@ -362,8 +362,11 @@ static int encode_token_int(name_context *ctx, int ntok, if (encode_token_type(ctx, ntok, type) < 0) return -1; if (descriptor_grow(&ctx->desc[id], 4) < 0) return -1; - // Assumes little endian and unalign access OK. - *(uint32_t *)(ctx->desc[id].buf + ctx->desc[id].buf_l) = val; + uint8_t *cp = &ctx->desc[id].buf[ctx->desc[id].buf_l]; + cp[0] = (val >> 0) & 0xff; + cp[1] = (val >> 8) & 0xff; + cp[2] = (val >> 16) & 0xff; + cp[3] = (val >> 24) & 0xff; ctx->desc[id].buf_l += 4; return 0; @@ -377,8 +380,8 @@ static int decode_token_int(name_context *ctx, int ntok, if (ctx->desc[id].buf_l + 4 > ctx->desc[id].buf_a) return -1; - // Assumes little endian and unalign access OK. - *val = *(uint32_t *)(ctx->desc[id].buf + ctx->desc[id].buf_l); + uint8_t *cp = ctx->desc[id].buf + ctx->desc[id].buf_l; + *val = (cp[0]) + (cp[1]<<8) + (cp[2]<<16) + (cp[3]<<24); ctx->desc[id].buf_l += 4; return 0; @@ -1478,12 +1481,18 @@ uint8_t *encode_names(char *blk, int len, int level, int use_arith, } uint8_t *cp = out; - //*out_len = tot_size+4; - //*(uint32_t *)cp = tot_size; cp += 4; *out_len = tot_size; - *(uint32_t *)cp = last_start; cp += 4; - *(uint32_t *)cp = nreads; cp += 4; +// *(uint32_t *)cp = last_start; cp += 4; +// *(uint32_t *)cp = nreads; cp += 4; + *cp++ = (last_start >> 0) & 0xff; + *cp++ = (last_start >> 8) & 0xff; + *cp++ = (last_start >> 16) & 0xff; + *cp++ = (last_start >> 24) & 0xff; + *cp++ = (nreads >> 0) & 0xff; + *cp++ = (nreads >> 8) & 0xff; + *cp++ = (nreads >> 16) & 0xff; + *cp++ = (nreads >> 24) & 0xff; *cp++ = use_arith; //write(1, &nreads, 4); int last_tnum = -1; @@ -1524,12 +1533,14 @@ uint8_t *decode_names(uint8_t *in, uint32_t sz, uint32_t *out_len) { return NULL; int i, o = 9; - int ulen = *(uint32_t *)in; + //int ulen = *(uint32_t *)in; + int ulen = (in[0]<<0) | (in[1]<<8) | (in[2]<<16) | (in[3]<<24); if (ulen < 0 || ulen >= INT_MAX-1024) return NULL; - int nreads = *(uint32_t *)(in+4); + //int nreads = *(uint32_t *)(in+4); + int nreads = (in[4]<<0) | (in[5]<<8) | (in[6]<<16) | (in[7]<<24); int use_arith = in[8]; name_context *ctx = create_context(nreads); if (!ctx)