Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C conversion/adaptation of jeaiii_to_text.h compared to C conversion/adaptation of itoa_jeaiii.cpp not as much faster as hoped #17

Open
MasterDuke17 opened this issue Apr 25, 2024 · 11 comments

Comments

@MasterDuke17
Copy link

See MoarVM/MoarVM#1800. Obviously there's lots of overhead, but the readme mentioned a 25% speedup for the new version, and I'm seeing nowhere near that (with gcc 13.2.1 in linux, compiling with -O3 -march=native, on a Zen 2). Do you see anything obviously problematic in my conversion?

@MasterDuke17
Copy link
Author

Huh, this is interesting. A standalone benchmark of my two C versions shows the newer one as slightly slower and almost double the instructions.

old (~0.137s and 2,282,158,658 instructions):

#include <stdio.h>
#include <stdint.h>

typedef struct pair { char t, o; } pair;
#define P(T) T, '0',  T, '1', T, '2', T, '3', T, '4', T, '5', T, '6', T, '7', T, '8', T, '9'
static const pair s_pairs[] = { P('0'), P('1'), P('2'), P('3'), P('4'), P('5'), P('6'), P('7'), P('8'), P('9') };

#define W(N, I) *(pair*)&b[N] = s_pairs[I]
#define A(N) t = ((uint64_t)(1) << (32 + N / 5 * N * 53 / 16)) / (uint32_t)(1e##N) + 1 + N/6 - N/8, t *= u, t >>= N / 5 * N * 53 / 16, t += N / 6 * 4, W(0, t >> 32)
#define S(N) b[N] = (char)((uint64_t)(10) * (uint32_t)(t) >> 32) + '0'
#define D(N) t = (uint64_t)(100) * (uint32_t)(t), W(N, t >> 32)

#define L0 b[0] = (char)(u) + '0'
#define L1 W(0, u)
#define L2 A(1), S(2)
#define L3 A(2), D(2)
#define L4 A(3), D(2), S(4)
#define L5 A(4), D(2), D(4)
#define L6 A(5), D(2), D(4), S(6)
#define L7 A(6), D(2), D(4), D(6)
#define L8 A(7), D(2), D(4), D(6), S(8)
#define L9 A(8), D(2), D(4), D(6), D(8)

#define LN(N) (L##N, b += N + 1)
#define LZ LN

#define LG(F) (u<100 ? u<10 ? F(0) : F(1) : u<1000000 ? u<10000 ? u<1000 ? F(2) : F(3) : u<100000 ? F(4) : F(5) : u<100000000 ? u<10000000 ? F(6) : F(7) : u<1000000000 ? F(8) : F(9))

static char * u64toa_jeaiii(uint64_t n, char* b) {
    uint32_t u;
    uint64_t t;

    if ((uint32_t)(n >> 32) == 0)
        return u = (uint32_t)(n), LG(LZ);

    uint64_t a = n / 100000000;

    if ((uint32_t)(a >> 32) == 0) {
        u = (uint32_t)(a);
        LG(LN);
    }
    else {
        u = (uint32_t)(a / 100000000);
        LG(LN);
        u = a % 100000000;
        LN(7);
    }

    u = n % 100000000;
    return LZ(7);
}

void main(void) {
    char work[20];
    long sum = 0;
    for (unsigned int i = 0; i < 100000000; i++) {
        u64toa_jeaiii(i, work);
        sum += work[0];
    }
    printf("sum = %ld\n", sum);
}

new (~0.188s and 4,094,107,679 instructions):

#include <stdio.h>
#include <stdint.h>

struct pair
{
    char dd[2];
};

static const struct pair dd[100] =
{
    {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'}, {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
    {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
    {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
    {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
    {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
    {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
    {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
    {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
    {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
    {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};
static const struct pair fd[100] =
{
    {'0','\0'}, {'1','\0'}, {'2','\0'}, {'3','\0'}, {'4','\0'}, {'5','\0'}, {'6','\0'}, {'7','\0'}, {'8','\0'}, {'9','\0'},
    {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
    {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
    {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
    {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
    {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
    {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
    {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
    {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
    {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};

static const uint64_t mask24 = ((uint64_t)(1) << 24) - 1;
static const uint64_t mask32 = ((uint64_t)(1) << 32) - 1;
static const uint64_t mask57 = ((uint64_t)(1) << 57) - 1;

static char * u64toa_jeaiii(uint64_t n, char* b) {
    if (n < (uint32_t)(1e2))
    {
        *(struct pair *)(b) = fd[n];
        return n < 10 ? b + 1 : b + 2;
    }
    if (n < (uint32_t)(1e6))
    {
        if (n < (uint32_t)(1e4))
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * n;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= n < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            return b + 4;
        }
        uint64_t f0 = (uint64_t)(10 * (1ull << 32ull)/ 1e5 + 1) * n;
        *(struct pair *)(b) = fd[f0 >> 32];
        b -= n < (uint32_t)(1e5);
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        return b + 6;
    }
    if (n < (uint64_t)(1ull << 32ull))
    {
        if (n < (uint32_t)(1e8))
        {
            uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * n >> 16;
            *(struct pair *)(b) = fd[f0 >> 32];
            b -= n < (uint32_t)(1e7);
            uint64_t f2 = (f0 & mask32) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 32];
            uint64_t f4 = (f2 & mask32) * 100;
            *(struct pair *)(b + 4) = dd[f4 >> 32];
            uint64_t f6 = (f4 & mask32) * 100;
            *(struct pair *)(b + 6) = dd[f6 >> 32];
            return b + 8;
        }
        uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * n;
        *(struct pair *)(b) = fd[f0 >> 57];
        b -= n < (uint32_t)(1e9);
        uint64_t f2 = (f0 & mask57) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 57];
        uint64_t f4 = (f2 & mask57) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 57];
        uint64_t f6 = (f4 & mask57) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 57];
        uint64_t f8 = (f6 & mask57) * 100;
        *(struct pair *)(b + 8) = dd[f8 >> 57];
        return b + 10;
    }

    // if we get here U must be (uint64_t) but some compilers don't know that, so reassign n to a (uint64_t) to avoid warnings
    uint32_t z = n % (uint32_t)(1e8);
    uint64_t u = n / (uint32_t)(1e8);

    if (u < (uint32_t)(1e2))
    {
        // u can't be 1 digit (if u < 10 it would have been handled above as a 9 digit 32bit number)
        *(struct pair *)(b) = dd[u];
        b += 2;
    }
    else if (u < (uint32_t)(1e6))
    {
        if (u < (uint32_t)(1e4))
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= u < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            b += 4;
        }
        else
        {
            uint64_t f0 = (uint64_t)(10 * (1ull << 32ull) / 1e5 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 32];
            b -= u < (uint32_t)(1e5);
            uint64_t f2 = (f0 & mask32) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 32];
            uint64_t f4 = (f2 & mask32) * 100;
            *(struct pair *)(b + 4) = dd[f4 >> 32];
            b += 6;
        }
    }
    else if (u < (uint32_t)(1e8))
    {
        uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * u >> 16;
        *(struct pair *)(b) = fd[f0 >> 32];
        b -= u < (uint32_t)(1e7);
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        uint64_t f6 = (f4 & mask32) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 32];
        b += 8;
    }
    else if (u < (uint64_t)(1ull << 32ull))
    {
        uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * u;
        *(struct pair *)(b) = fd[f0 >> 57];
        b -= u < (uint32_t)(1e9);
        uint64_t f2 = (f0 & mask57) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 57];
        uint64_t f4 = (f2 & mask57) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 57];
        uint64_t f6 = (f4 & mask57) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 57];
        uint64_t f8 = (f6 & mask57) * 100;
        *(struct pair *)(b + 8) = dd[f8 >> 57];
        b += 10;
    }
    else
    {
        uint32_t y = u % (uint32_t)(1e8);
        u /= (uint32_t)(1e8);

        // u is 2, 3, or 4 digits (if u < 10 it would have been handled above)
        if (u < (uint32_t)(1e2))
        {
            *(struct pair *)(b) = dd[u];
            b += 2;
        }
        else
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= u < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            b += 4;
        }
        // do 8 digits
        uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * y >> 16) + 1;
        *(struct pair *)(b) = dd[f0 >> 32];
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        uint64_t f6 = (f4 & mask32) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 32];
        b += 8;
    }
    // do 8 digits
    uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * z >> 16) + 1;
    *(struct pair *)(b) = dd[f0 >> 32];
    uint64_t f2 = (f0 & mask32) * 100;
    *(struct pair *)(b + 2) = dd[f2 >> 32];
    uint64_t f4 = (f2 & mask32) * 100;
    *(struct pair *)(b + 4) = dd[f4 >> 32];
    uint64_t f6 = (f4 & mask32) * 100;
    *(struct pair *)(b + 6) = dd[f6 >> 32];
    return b + 8;
}

void main(void) {
    char work[20];
    long sum = 0;
    for (unsigned int i = 0; i < 100000000; i++) {
        u64toa_jeaiii(i, work);
        sum += work[0];
    }
    printf("sum = %ld\n", sum);
}

@jeaiii
Copy link
Owner

jeaiii commented Jun 27, 2024

"benchmarking is hard"

My suspicion is the optimizer is interfering with your simple benchmark to different degrees as seen in this godbolt:

https://godbolt.org/z/EEsYPGzrs

Since the first code is simpler the optimizer can "see" into it more and produce the sum (which in theory the compiler could compute fully at compile time) with less work than compared to the second code. Both however aren't doing the full amount of work you would expect if the number being converted was completely unknown at compile time

Sorry for the super later reply

@jeaiii
Copy link
Owner

jeaiii commented Jun 27, 2024

with a few changes to make it constexpr friendly you can see clang calculates the result fully (if you lower the limit, otherwise godbolt times out)

https://godbolt.org/z/o1rocnvva

@jeaiii
Copy link
Owner

jeaiii commented Jun 27, 2024

the 25% speed up refers to my benchmark's uniform random digit length category where I take equal quantities of numbers with digits length from 1 to 10 and shuffle them together so that the CPU can't predict how many digits each conversion will have. This is the worst case for algorithms that try to do less work based on how "large" the number is. For example converting only 10 digit numbers is twice as fast as converting random digit length numbers.

https://jeaiii.github.io/itoa/

@MasterDuke17
Copy link
Author

In a new wrinkle, when using gcc with -O3, it gives a different final value in my example (I looked at the objdump -S output and both were actually running the code). Gcc 14.1.1 prints sum = 4777499475, but clang 17.0.6 prints sum = 5249999475. Both print sum = 5249999475 with -O2. My example (compiled with -g -O(2|3) -Wall -Wextra -Wpedantic -Wno-missing-braces):

#include <stdio.h>
#include <stdint.h>

struct pair
{
    char dd[2];
};

static const struct pair dd[100] =
{
    {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'}, {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
    {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
    {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
    {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
    {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
    {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
    {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
    {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
    {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
    {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};
static const struct pair fd[100] =
{
    {'0','\0'}, {'1','\0'}, {'2','\0'}, {'3','\0'}, {'4','\0'}, {'5','\0'}, {'6','\0'}, {'7','\0'}, {'8','\0'}, {'9','\0'},
    {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
    {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
    {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
    {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
    {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
    {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
    {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
    {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
    {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};

static const uint64_t mask24 = ((uint64_t)(1) << 24) - 1;
static const uint64_t mask32 = ((uint64_t)(1) << 32) - 1;
static const uint64_t mask57 = ((uint64_t)(1) << 57) - 1;

static char * u64toa_jeaiii(uint64_t n, char* b) {
    if (n < (uint32_t)(1e2))
    {
        *(struct pair *)(b) = fd[n];
        return n < 10 ? b + 1 : b + 2;
    }
    if (n < (uint32_t)(1e6))
    {
        if (n < (uint32_t)(1e4))
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * n;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= n < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            return b + 4;
        }
        uint64_t f0 = (uint64_t)(10 * (1ull << 32ull)/ 1e5 + 1) * n;
        *(struct pair *)(b) = fd[f0 >> 32];
        b -= n < (uint32_t)(1e5);
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        return b + 6;
    }
    if (n < (uint64_t)(1ull << 32ull))
    {
        if (n < (uint32_t)(1e8))
        {
            uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * n >> 16;
            *(struct pair *)(b) = fd[f0 >> 32];
            b -= n < (uint32_t)(1e7);
            uint64_t f2 = (f0 & mask32) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 32];
            uint64_t f4 = (f2 & mask32) * 100;
            *(struct pair *)(b + 4) = dd[f4 >> 32];
            uint64_t f6 = (f4 & mask32) * 100;
            *(struct pair *)(b + 6) = dd[f6 >> 32];
            return b + 8;
        }
        uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * n;
        *(struct pair *)(b) = fd[f0 >> 57];
        b -= n < (uint32_t)(1e9);
        uint64_t f2 = (f0 & mask57) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 57];
        uint64_t f4 = (f2 & mask57) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 57];
        uint64_t f6 = (f4 & mask57) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 57];
        uint64_t f8 = (f6 & mask57) * 100;
        *(struct pair *)(b + 8) = dd[f8 >> 57];
        return b + 10;
    }

    // if we get here U must be (uint64_t) but some compilers don't know that, so reassign n to a (uint64_t) to avoid warnings
    uint32_t z = n % (uint32_t)(1e8);
    uint64_t u = n / (uint32_t)(1e8);

    if (u < (uint32_t)(1e2))
    {
        // u can't be 1 digit (if u < 10 it would have been handled above as a 9 digit 32bit number)
        *(struct pair *)(b) = dd[u];
        b += 2;
    }
    else if (u < (uint32_t)(1e6))
    {
        if (u < (uint32_t)(1e4))
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= u < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            b += 4;
        }
        else
        {
            uint64_t f0 = (uint64_t)(10 * (1ull << 32ull) / 1e5 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 32];
            b -= u < (uint32_t)(1e5);
            uint64_t f2 = (f0 & mask32) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 32];
            uint64_t f4 = (f2 & mask32) * 100;
            *(struct pair *)(b + 4) = dd[f4 >> 32];
            b += 6;
        }
    }
    else if (u < (uint32_t)(1e8))
    {
        uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * u >> 16;
        *(struct pair *)(b) = fd[f0 >> 32];
        b -= u < (uint32_t)(1e7);
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        uint64_t f6 = (f4 & mask32) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 32];
        b += 8;
    }
    else if (u < (uint64_t)(1ull << 32ull))
    {
        uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * u;
        *(struct pair *)(b) = fd[f0 >> 57];
        b -= u < (uint32_t)(1e9);
        uint64_t f2 = (f0 & mask57) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 57];
        uint64_t f4 = (f2 & mask57) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 57];
        uint64_t f6 = (f4 & mask57) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 57];
        uint64_t f8 = (f6 & mask57) * 100;
        *(struct pair *)(b + 8) = dd[f8 >> 57];
        b += 10;
    }
    else
    {
        uint32_t y = u % (uint32_t)(1e8);
        u /= (uint32_t)(1e8);

        // u is 2, 3, or 4 digits (if u < 10 it would have been handled above)
        if (u < (uint32_t)(1e2))
        {
            *(struct pair *)(b) = dd[u];
            b += 2;
        }
        else
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= u < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            b += 4;
        }
        // do 8 digits
        uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * y >> 16) + 1;
        *(struct pair *)(b) = dd[f0 >> 32];
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        uint64_t f6 = (f4 & mask32) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 32];
        b += 8;
    }
    // do 8 digits
    uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * z >> 16) + 1;
    *(struct pair *)(b) = dd[f0 >> 32];
    uint64_t f2 = (f0 & mask32) * 100;
    *(struct pair *)(b + 2) = dd[f2 >> 32];
    uint64_t f4 = (f2 & mask32) * 100;
    *(struct pair *)(b + 4) = dd[f4 >> 32];
    uint64_t f6 = (f4 & mask32) * 100;
    *(struct pair *)(b + 6) = dd[f6 >> 32];
    return b + 8;
}

int main(void) {
    char work[20] = {0};
    long sum = 0;
    for (unsigned int i = 0; i < 100000000; i++) {
        u64toa_jeaiii(i, work);
        sum += work[1];
    }
    return printf("sum = %ld\n", sum);
}

@jeaiii
Copy link
Owner

jeaiii commented Jun 30, 2024

well that's super sus. -O1 and no args also does 4777499475. Only gcc -O2 matches clang, and clang is the same at any optimization level...

@MasterDuke17
Copy link
Author

Yeah, and a simple C++ version gives 4777499475 for g++ with -O3, but 5249999475 for every other combination of -O and g++/clang++.

#include <cstdio>
#include <cstdint>
#include "jeaiii_to_text.h"

int main(void) {
    char work[20] = {0};
    long sum = 0;
    for (unsigned int i = 0; i < 100000000; i++) {
        u64toa_jeaiii(i, work);
        sum += work[1];
    }
    return printf("sum = %ld\n", sum);
}

@MasterDuke17
Copy link
Author

MasterDuke17 commented Jul 1, 2024

Huh. Adding -fsanitize=address to g++ with -O3 results in 5249999475. Same with -fsanitize=undefined, and neither reports any problems.

@jeaiii
Copy link
Owner

jeaiii commented Jul 1, 2024

is there a bug bounty for gcc? :-)

https://godbolt.org/z/q3azMGdr8

@MasterDuke17
Copy link
Author

is there a bug bounty for gcc? :-)

godbolt.org/z/q3azMGdr8

It seems like 9.1 (going just by major versions) is the first where it doesn't print 0 and 0 at -O3. I've never reported a bug to gcc (and don't really understand the code and resulting assembly well enough to thoroughly explain what's going on), do you mind doing so?

@jeaiii
Copy link
Owner

jeaiii commented Jul 2, 2024

I've only reported bugs for clang and msvc in the past, but will make an account so I can report this to gcc. I have found a few work arounds, but not sure yet which one is best. For C, I would suggest making 4 versions for int , unsigned int, long long, and unsigned long long

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants