Modernize FRB quantizer (#1191)
eschnett committed Mar 21, 2024
1 parent 4ebf9cc commit bfc1658
Showing 3 changed files with 79 additions and 55 deletions.
13 changes: 6 additions & 7 deletions lib/cuda/cudaQuantize.cpp
@@ -1,9 +1,11 @@
#include "cudaQuantize.hpp"

#include "cudaUtils.hpp"
#include "math.h"
#include "mma.h"

#include <cmath>
#include <vector>

void launch_quantize_kernel(cudaStream_t stream, int nframes, const __half2* in_base,
__half2* outf_base, unsigned int* outi_base, const int* index_array);

@@ -34,10 +36,8 @@ cudaQuantize::cudaQuantize(Config& config, const std::string& unique_name,
size_t index_array_len = (size_t)_num_chunks * 2 * sizeof(int32_t);
int32_t* index_array_memory = (int32_t*)device.get_gpu_memory(_gpu_mem_index, index_array_len);

int32_t* index_array_host = (int32_t*)malloc(index_array_len);
assert(index_array_host);
std::vector<int32_t> index_array_host(index_array_len / sizeof(int32_t), 0);

memset(index_array_host, 0, index_array_len);
for (int f = 0; f < _num_chunks / FRAME_SIZE; f++) {
for (int i = 0; i < FRAME_SIZE; i++) {
// offset of the start of the chunk in the input array, for each chunk.
@@ -50,9 +50,8 @@ cudaQuantize::cudaQuantize(Config& config, const std::string& unique_name,
index_array_host[f * FRAME_SIZE * 2 + FRAME_SIZE + 1] = f * FRAME_SIZE * (CHUNK_SIZE / 8);
}

CHECK_CUDA_ERROR(
cudaMemcpy(index_array_memory, index_array_host, index_array_len, cudaMemcpyHostToDevice));
free(index_array_host);
CHECK_CUDA_ERROR(cudaMemcpy(index_array_memory, index_array_host.data(), index_array_len,
cudaMemcpyHostToDevice));
}

cudaQuantize::~cudaQuantize() {}
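
The hunks above replace a malloc/memset/free staging buffer with a std::vector that is zero-initialized up front and released automatically. A minimal standalone sketch of that pattern, using hypothetical names (stage_index_array, n_entries) rather than the class members above, and plain cudaMemcpy instead of the CHECK_CUDA_ERROR wrapper:

#include <cuda_runtime.h>

#include <cstdint>
#include <vector>

// Build a host-side index table in a std::vector (zero-initialized, no manual
// free) and copy it to device memory with a single cudaMemcpy.
void stage_index_array(int32_t* device_ptr, size_t n_entries) {
    std::vector<int32_t> host(n_entries, 0); // replaces malloc + assert + memset

    for (size_t i = 0; i < n_entries; ++i)
        host[i] = static_cast<int32_t>(i); // placeholder offsets; the real table is built as above

    cudaMemcpy(device_ptr, host.data(), n_entries * sizeof(int32_t),
               cudaMemcpyHostToDevice);
} // the vector's storage is released here, so there is no free() to forget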
113 changes: 69 additions & 44 deletions lib/cuda/quantize_kernel.cu
@@ -1,80 +1,105 @@
#include <cuda_runtime.h>
#include <cuda_fp16.h>
//#include "header.cuh"
#include <stdio.h> /*printf*/
#include <cuda_runtime.h>

#define chunksPerFrame 32
const int chunksPerFrame = 32;

__device__ unsigned int clamped_round_down(__half x)
{
x = (__hgt(x, 1.5)) ? x : __float2half(1.5); // max(x, 1.5)
x = (__hlt(x, 15.5)) ? x : __float2half(15.5); // min(x, 15.5)

__device__ unsigned int clamped_round_down(__half x) {
x = __hmax(x, __float2half(1.5f));
x = __hmin(x, __float2half(15.5f));
return __half2int_rd(x);
}

__device__ float sum_half2 (__half2 x) {
return __low2float(x) + __high2float(x);
__device__ float sum_half2(__half2 x) {
return __low2float(x) + __high2float(x);
}

__global__ void quantize (const __half2 *__restrict__ in_base, __half2 *__restrict__ outf_base, unsigned int *__restrict__ outi_base, const int *__restrict__ index_array)
{
__half2 * __restrict__ outf = outf_base + index_array[64*blockIdx.x + 32];
unsigned int * __restrict__ outi = outi_base + index_array[64*blockIdx.x + 33];
__global__ void quantize(const __half2* __restrict__ in_base, __half2* __restrict__ outf_base,
unsigned int* __restrict__ outi_base,
const int* __restrict__ index_array) {
__half2* __restrict__ outf = outf_base + index_array[64 * blockIdx.x + 32];
unsigned int* __restrict__ outi = outi_base + index_array[64 * blockIdx.x + 33];
__half2 outf_temp = __float2half2_rn(0);
for (int chunk = 0; chunk < chunksPerFrame; chunk++)
{
const __half2 * chunkIn = in_base + index_array[64*blockIdx.x + chunk]; //the memory for the current chunk
for (int chunk = 0; chunk < chunksPerFrame; chunk++) {
const __half2* chunkIn =
in_base + index_array[64 * blockIdx.x + chunk]; // the memory for the current chunk

float mean = 0;
for (int j = 0; j < 4; j++) //moving along the chunk
for (int j = 0; j < 4; j++) // moving along the chunk
{
mean += sum_half2(chunkIn[j*32 + threadIdx.x]);
mean += sum_half2(chunkIn[j * 32 + threadIdx.x]);
}
mean += __shfl_sync(0xffffffff, mean, threadIdx.x ^ 1);
mean += __shfl_sync(0xffffffff, mean, threadIdx.x ^ 2);
mean += __shfl_sync(0xffffffff, mean, threadIdx.x ^ 4);
mean += __shfl_sync(0xffffffff, mean, threadIdx.x ^ 8);
mean += __shfl_sync(0xffffffff, mean, threadIdx.x ^ 16);
mean /= 256.0;
mean /= 256;

float scale = 0;
for (int j = 0; j < 4; j++)
{
const __half2 diff = __hsub2(chunkIn[j*32 + threadIdx.x], __float2half2_rn(mean));
for (int j = 0; j < 4; j++) {
const __half2 diff = __hsub2(chunkIn[j * 32 + threadIdx.x], __float2half2_rn(mean));
scale += sum_half2(__hmul2(diff, diff));
}
scale += __shfl_sync(0xffffffff, scale, threadIdx.x ^ 1);
scale += __shfl_sync(0xffffffff, scale, threadIdx.x ^ 2);
scale += __shfl_sync(0xffffffff, scale, threadIdx.x ^ 4);
scale += __shfl_sync(0xffffffff, scale, threadIdx.x ^ 8);
scale += __shfl_sync(0xffffffff, scale, threadIdx.x ^ 16);
scale = (sqrtf(scale/255.0))/2.3;
// scale = (sqrtf(scale / 255)) / 2.3f;
scale = sqrtf(scale) / (2.3f * sqrtf(255));

outf_temp = (threadIdx.x == chunk)
? __floats2half2_rn(mean, scale)
: outf_temp; // for each thread, saving a single outf_temp value, then I'll
// merge them all later

scale = (scale == 0) ? 1 : scale; // avoid division by 0

outf_temp = (threadIdx.x == chunk) ? __floats2half2_rn(mean, scale) : outf_temp; //for each thread, saving a single outf_temp value, then I'll merge them all later

scale = (scale == 0) ? 1 : scale; //avoid division by 0

//computing i values
__half meanh = __float2half(mean); __half scaleh = __float2half(scale);
unsigned int i = clamped_round_down(__hadd((__low2half(chunkIn[threadIdx.x]) - meanh)/scaleh, __half(8.5)));
i = i | (clamped_round_down(__hadd((__high2half(chunkIn[threadIdx.x]) - meanh)/scaleh, __half(8.5))) << 4);
i = i | (clamped_round_down(__hadd((__low2half( chunkIn[threadIdx.x + 32]) - meanh)/scaleh, __half(8.5))) << 8);
i = i | (clamped_round_down(__hadd((__high2half(chunkIn[threadIdx.x + 32]) - meanh)/scaleh, __half(8.5))) << 12);
i = i | (clamped_round_down(__hadd((__low2half( chunkIn[threadIdx.x + 64]) - meanh)/scaleh, __half(8.5))) << 16);
i = i | (clamped_round_down(__hadd((__high2half(chunkIn[threadIdx.x + 64]) - meanh)/scaleh, __half(8.5))) << 20);
i = i | (clamped_round_down(__hadd((__low2half( chunkIn[threadIdx.x + 96]) - meanh)/scaleh, __half(8.5))) << 24);
i = i | (clamped_round_down(__hadd((__high2half(chunkIn[threadIdx.x + 96]) - meanh)/scaleh, __half(8.5))) << 28);
// computing i values
__half meanh = __float2half(mean);
__half scaleh = __float2half(scale);
unsigned int i =
clamped_round_down((__low2half(chunkIn[threadIdx.x]) - meanh) / scaleh + __half(8.5f));
i = i
| (clamped_round_down((__high2half(chunkIn[threadIdx.x]) - meanh) / scaleh
+ __half(8.5f))
<< 4);
i = i
| (clamped_round_down((__low2half(chunkIn[threadIdx.x + 32]) - meanh) / scaleh
+ __half(8.5f))
<< 8);
i = i
| (clamped_round_down((__high2half(chunkIn[threadIdx.x + 32]) - meanh) / scaleh
+ __half(8.5f))
<< 12);
i = i
| (clamped_round_down((__low2half(chunkIn[threadIdx.x + 64]) - meanh) / scaleh
+ __half(8.5f))
<< 16);
i = i
| (clamped_round_down((__high2half(chunkIn[threadIdx.x + 64]) - meanh) / scaleh
+ __half(8.5f))
<< 20);
i = i
| (clamped_round_down((__low2half(chunkIn[threadIdx.x + 96]) - meanh) / scaleh
+ __half(8.5f))
<< 24);
i = i
| (clamped_round_down((__high2half(chunkIn[threadIdx.x + 96]) - meanh) / scaleh
+ __half(8.5f))
<< 28);

unsigned int i_shfl = __shfl_sync(0xffffffff, i, threadIdx.x ^ 1);
unsigned int selector = (threadIdx.x % 2) ? (3<<12) + (7<<8) + (1<<4) + (5<<0) : (6<<12) + (2<<8) + (4<<4) + (0<<0);
unsigned int selector = (threadIdx.x % 2) ? (3 << 12) + (7 << 8) + (1 << 4) + (5 << 0)
: (6 << 12) + (2 << 8) + (4 << 4) + (0 << 0);
i = __byte_perm(i, i_shfl, selector);
i_shfl = __shfl_sync(0xffffffff, i, threadIdx.x ^ 2);
selector = (threadIdx.x%4/2) ? (3<<12) + (2<<8) + (7<<4) + (6<<0) : (5<<12) + (4<<8) + (1<<4) + (0<<0);
selector = (threadIdx.x % 4 / 2) ? (3 << 12) + (2 << 8) + (7 << 4) + (6 << 0)
: (5 << 12) + (4 << 8) + (1 << 4) + (0 << 0);
i = __byte_perm(i, i_shfl, selector);

outi[chunk*32 + threadIdx.x/4 + (threadIdx.x%4)*8] = i; //writing result to memory
outi[chunk * 32 + threadIdx.x / 4 + (threadIdx.x % 4) * 8] = i; // writing result to memory
}
//writing the means and scales for this frame to global
// writing the means and scales for this frame to global
outf[threadIdx.x] = outf_temp;
}
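
The kernel accumulates each chunk's mean and variance with a five-step XOR butterfly over __shfl_sync, so every lane of the 32-thread warp ends up holding the full sum without touching shared memory; the rewritten scale expression sqrtf(scale) / (2.3f * sqrtf(255)) is algebraically the same as the old sqrtf(scale / 255.0) / 2.3. A self-contained sketch of just that butterfly-reduction idiom (the kernel name and buffers below are illustrative, not from this commit):

#include <cuda_runtime.h>

// Illustrative warp-sum kernel using the same shfl_sync butterfly as quantize():
// after XOR-ing over bits 1, 2, 4, 8 and 16, every lane holds the sum of all 32 inputs.
__global__ void warp_sum_demo(const float* in, float* out) {
    float v = in[blockIdx.x * 32 + threadIdx.x]; // one value per lane; 32 threads per block

    v += __shfl_sync(0xffffffff, v, threadIdx.x ^ 1);
    v += __shfl_sync(0xffffffff, v, threadIdx.x ^ 2);
    v += __shfl_sync(0xffffffff, v, threadIdx.x ^ 4);
    v += __shfl_sync(0xffffffff, v, threadIdx.x ^ 8);
    v += __shfl_sync(0xffffffff, v, threadIdx.x ^ 16);

    if (threadIdx.x == 0)
        out[blockIdx.x] = v; // every lane holds the same total; lane 0 writes it
}

Because the reduction stays inside one warp, no __syncthreads() or shared memory is needed, which is why the wrapper below launches exactly 32 threads per block with shmem_nbytes = 0.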
8 changes: 4 additions & 4 deletions lib/cuda/quantize_kernel_wrapper.cu
@@ -1,13 +1,13 @@
#include "quantize_kernel.cu"

void launch_quantize_kernel(cudaStream_t stream, int nframes, const __half2 *in_base,
__half2 *outf_base, unsigned int *outi_base,
const int *index_array) {
void launch_quantize_kernel(cudaStream_t stream, int nframes, const __half2* in_base,
__half2* outf_base, unsigned int* outi_base, const int* index_array) {
dim3 nblocks;
nblocks.x = nframes;
nblocks.y = 1;
nblocks.z = 1;
int nthreads = 32;
int shmem_nbytes = 0;
quantize <<<nblocks, nthreads, shmem_nbytes, stream >>> (in_base, outf_base, outi_base, index_array);
quantize<<<nblocks, nthreads, shmem_nbytes, stream>>>(in_base, outf_base, outi_base,
index_array);
}
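
For context, a hedged sketch of a call site for this wrapper; the helper name run_quantize_once and all buffer sizes are placeholders, and only the declaration and the call itself mirror the code above (the kernel reads 64 int32 index entries per frame, as shown in cudaQuantize.cpp):

#include <cuda_fp16.h>
#include <cuda_runtime.h>

void launch_quantize_kernel(cudaStream_t stream, int nframes, const __half2* in_base,
                            __half2* outf_base, unsigned int* outi_base, const int* index_array);

// Hypothetical driver: the element counts must match the frame/chunk layout
// that the kernel and the host-side index table assume.
void run_quantize_once(int nframes, size_t in_elems, size_t outf_elems, size_t outi_elems,
                       size_t index_elems) {
    __half2* d_in;
    __half2* d_outf;
    unsigned int* d_outi;
    int* d_index;
    cudaMalloc(&d_in, in_elems * sizeof(__half2));
    cudaMalloc(&d_outf, outf_elems * sizeof(__half2));
    cudaMalloc(&d_outi, outi_elems * sizeof(unsigned int));
    cudaMalloc(&d_index, index_elems * sizeof(int));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // ... fill d_in and d_index from the host (see the index layout in cudaQuantize.cpp above) ...

    launch_quantize_kernel(stream, nframes, d_in, d_outf, d_outi, d_index);
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFree(d_index);
    cudaFree(d_outi);
    cudaFree(d_outf);
    cudaFree(d_in);
}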
