Permalink
| /** | |
| * Demonstrates how can map a 32-bit integer to a range faster than | |
| * a modulo reduction. | |
| * Assumes x64 processor. | |
| * gcc -O3 -o fastrange fastrange.c | |
| */ | |
| #include <stdint.h> | |
| #include <stdlib.h> | |
| #include <stdio.h> | |
| #define RDTSC_START(cycles) \ | |
| do { \ | |
| register unsigned cyc_high, cyc_low; \ | |
| __asm volatile( \ | |
| "cpuid\n\t" \ | |
| "rdtsc\n\t" \ | |
| "mov %%edx, %0\n\t" \ | |
| "mov %%eax, %1\n\t" \ | |
| : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \ | |
| (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ | |
| } while (0) | |
| #define RDTSC_FINAL(cycles) \ | |
| do { \ | |
| register unsigned cyc_high, cyc_low; \ | |
| __asm volatile( \ | |
| "rdtscp\n\t" \ | |
| "mov %%edx, %0\n\t" \ | |
| "mov %%eax, %1\n\t" \ | |
| "cpuid\n\t" \ | |
| : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \ | |
| (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ | |
| } while (0) | |
| /* | |
| * Prints the best number of operations per cycle where | |
| * test is the function call, answer is the expected answer generated by | |
| * test, repeat is the number of times we should repeat and size is the | |
| * number of operations represented by test. | |
| */ | |
| #define BEST_TIME(test, answer, repeat, size) \ | |
| do { \ | |
| printf("%s: ", #test); \ | |
| fflush(NULL); \ | |
| uint64_t cycles_start, cycles_final, cycles_diff; \ | |
| uint64_t min_diff = (uint64_t)-1; \ | |
| int wrong_answer = 0; \ | |
| for (int i = 0; i < repeat; i++) { \ | |
| __asm volatile("" ::: /* pretend to clobber */ "memory"); \ | |
| RDTSC_START(cycles_start); \ | |
| if (test != answer) wrong_answer = 1; \ | |
| RDTSC_FINAL(cycles_final); \ | |
| cycles_diff = (cycles_final - cycles_start); \ | |
| if (cycles_diff < min_diff) min_diff = cycles_diff; \ | |
| } \ | |
| uint64_t S = (uint64_t)size; \ | |
| float cycle_per_op = (min_diff) / (float)S; \ | |
| printf(" %.2f cycles per operation", cycle_per_op); \ | |
| if (wrong_answer) printf(" [ERROR]"); \ | |
| printf("\n"); \ | |
| fflush(NULL); \ | |
| } while (0) | |
| uint32_t modsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) { | |
| uint32_t sum = 0; | |
| for(uint32_t j = 0; j < nmbr ; ++j ) { | |
| sum += z[accesses[j] % N]; | |
| } | |
| return sum; | |
| } | |
| uint32_t fastsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) { | |
| uint32_t sum = 0; | |
| uint64_t N64 = (uint64_t) N; | |
| for(uint32_t j = 0; j < nmbr ; ++j ) { | |
| sum += z[(accesses[j] * N64)>> 32] ; | |
| } | |
| return sum; | |
| } | |
| void demo(uint32_t N) { | |
| printf("N = %d\n", N); | |
| uint32_t * z = malloc(N * sizeof(uint32_t)); | |
| for(uint32_t i = 0 ; i < N; ++i) z[i] = rand(); // some rand. number | |
| uint32_t nmbr = 500; | |
| uint32_t * accesses = malloc(nmbr * sizeof(uint32_t)); | |
| for(uint32_t i = 0 ; i < nmbr; ++i) accesses[i] = rand(); // some rand. number | |
| uint32_t expected1 = modsum(z,N,accesses,nmbr); | |
| uint32_t expected2 = fastsum(z,N,accesses,nmbr); | |
| BEST_TIME(modsum(z,N,accesses,nmbr), expected1, 1000, nmbr); | |
| BEST_TIME(fastsum(z,N,accesses,nmbr), expected2, 1000, nmbr); | |
| free(z); | |
| free(accesses); | |
| } | |
| int main() { | |
| demo(31); | |
| demo(1500); | |
| demo(15000); | |
| } |