Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Saving... f62bc5b Aug 20, 2018
1 contributor

Users who have contributed to this file

134 lines (119 sloc) 5.54 KB
/**
* Demonstrates how can map a 32-bit integer to a range faster than
* a modulo reduction.
* Assumes x64 processor.
* gcc -O3 -o fastrange fastrange.c
*/
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#define RDTSC_START(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile( \
"cpuid\n\t" \
"rdtsc\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
: "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
#define RDTSC_FINAL(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile( \
"rdtscp\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
"cpuid\n\t" \
: "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
/*
* Prints the best number of operations per cycle where
* test is the function call, answer is the expected answer generated by
* test, repeat is the number of times we should repeat and size is the
* number of operations represented by test.
*/
#define BEST_TIME(test, answer, repeat, size) \
do { \
printf("%s: ", #test); \
fflush(NULL); \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = (uint64_t)-1; \
int wrong_answer = 0; \
for (int i = 0; i < repeat; i++) { \
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
RDTSC_START(cycles_start); \
if (test != answer) wrong_answer = 1; \
RDTSC_FINAL(cycles_final); \
cycles_diff = (cycles_final - cycles_start); \
if (cycles_diff < min_diff) min_diff = cycles_diff; \
} \
uint64_t S = (uint64_t)size; \
float cycle_per_op = (min_diff) / (float)S; \
printf(" %.2f cycles per operation", cycle_per_op); \
if (wrong_answer) printf(" [ERROR]"); \
printf("\n"); \
fflush(NULL); \
} while (0)
uint32_t modsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) {
uint32_t sum = 0;
for(uint32_t j = 0; j < nmbr ; ++j ) {
sum += z[accesses[j] % N];
}
return sum;
}
uint32_t fastsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) {
uint32_t sum = 0;
uint64_t N64 = (uint64_t) N;
for(uint32_t j = 0; j < nmbr ; ++j ) {
sum += z[(accesses[j] * N64)>> 32] ;
}
return sum;
}
// N is a power of two
uint32_t maskedsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) {
uint32_t sum = 0;
for(uint32_t j = 0; j < nmbr ; ++j ) {
sum += z[accesses[j] & (N-1)] ;
}
return sum;
}
void demo(uint32_t N) {
printf("N = %d\n", N);
uint32_t * z = malloc(N * sizeof(uint32_t));
for(uint32_t i = 0 ; i < N; ++i) z[i] = rand(); // some rand. number
uint32_t nmbr = 500;
uint32_t * accesses = malloc(nmbr * sizeof(uint32_t));
for(uint32_t i = 0 ; i < nmbr; ++i) accesses[i] = rand(); // some rand. number
uint32_t expected1 = modsum(z,N,accesses,nmbr);
uint32_t expected2 = fastsum(z,N,accesses,nmbr);
BEST_TIME(modsum(z,N,accesses,nmbr), expected1, 1000, nmbr);
BEST_TIME(fastsum(z,N,accesses,nmbr), expected2, 1000, nmbr);
free(z);
free(accesses);
}
void demopoweroftwo(uint32_t N) {
printf("N = %d\n", N);
uint32_t * z = malloc(N * sizeof(uint32_t));
for(uint32_t i = 0 ; i < N; ++i) z[i] = rand(); // some rand. number
uint32_t nmbr = 500;
uint32_t * accesses = malloc(nmbr * sizeof(uint32_t));
for(uint32_t i = 0 ; i < nmbr; ++i) accesses[i] = rand(); // some rand. number
uint32_t expected1 = modsum(z,N,accesses,nmbr);
uint32_t expected2 = fastsum(z,N,accesses,nmbr);
BEST_TIME(modsum(z,N,accesses,nmbr), expected1, 1000, nmbr);
BEST_TIME(fastsum(z,N,accesses,nmbr), expected2, 1000, nmbr);
BEST_TIME(maskedsum(z,N,accesses,nmbr), expected1, 1000, nmbr);
free(z);
free(accesses);
}
int main() {
demo(31);
demo(1500);
demo(15000);
demopoweroftwo(32);
demopoweroftwo(4096);
demopoweroftwo(65536);
}
You can’t perform that action at this time.