Permalink
Browse files

Rewrite cl to use single work group to do everything

  • Loading branch information...
john-hu committed Jan 19, 2015
1 parent cd23d91 commit 8f6a3fc3f443a5703c567bc5269ce01bb787a7fd
Showing with 54 additions and 32 deletions.
  1. +31 −8 histogram/histogram.c
  2. +23 −24 histogram/histogram.py
View
@@ -2,35 +2,58 @@
// NOTE(review): this listing is a diff hunk whose +/- markers were stripped, so
// removed and added lines appear back to back (two kernel signatures, two
// basePixelIdx declarations, result[...] next to privateBuffer[...]). Read the
// pairs as "old line, then its replacement" -- TODO confirm against the repo.

// Number of consecutive pixels handled by one work item (loop bound below).
#define BIN_SIZE 1024
// Counters per histogram: 256 each for R, G and B (offsets 0, 256 and 512 below).
#define RESULT_SIZE 768
__kernel void histogram(__global unsigned char* bytes, __global unsigned int* pixelCount,
__global unsigned int* result)
// Variant taking a per-work-item scratch area (tempResult) and a single
// RESULT_SIZE-entry output (finalResult).
__kernel void histogram(__global unsigned char* bytes,
__global unsigned int* pixelCount,
__global unsigned int* tempResult,
__global unsigned int* finalResult)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int groupId = get_group_id(0);
unsigned int gsize = get_local_size(0);
unsigned int globalId = get_global_id(0);
// Total number of work items in the launch (groups * items per group).
unsigned int totalTasks = get_num_groups(0) * gsize;
unsigned int i, bValue;
unsigned int basePixelIdx = lid * BIN_SIZE + gid * gsize * BIN_SIZE;
// First pixel of this work item's BIN_SIZE-pixel slice.
unsigned int basePixelIdx = lid * BIN_SIZE + groupId * gsize * BIN_SIZE;
// Start of this work item's RESULT_SIZE-entry row in the per-item output.
unsigned int baseResultIdx = globalId * RESULT_SIZE;
unsigned int maxPixel = *pixelCount;
// Per-work-item histogram, accumulated locally and copied out once at the end.
unsigned int privateBuffer[RESULT_SIZE];
// Only the last 768 work items (by global id) perform the final reduction.
// This assumes those work items run after every other work item has finished
// writing tempResult.
// NOTE(review): unsigned arithmetic -- if totalTasks < RESULT_SIZE this wraps to
// a huge value and no work item satisfies `globalId >= lastGroup`.
unsigned int lastGroup = totalTasks - RESULT_SIZE;
for (i = 0; i < RESULT_SIZE; i++) {
result[baseResultIdx + i] = 0;
privateBuffer[i] = 0;
}
// NOTE(review): this clears finalResult[globalId], while the accumulation at
// the end of the kernel writes finalResult[globalId - lastGroup]; the two
// indices only coincide when lastGroup == 0. Confirm which one is intended.
if (globalId >= lastGroup) {
finalResult[globalId] = 0;
}
unsigned int processIndex = 0;
// Stop at the end of this work item's slice or at the end of the image,
// whichever comes first.
while (processIndex < BIN_SIZE && (basePixelIdx + processIndex < maxPixel)) {
// Input bytes are interleaved as RGBRGBRGB...; offset 0 is the R value.
bValue = bytes[basePixelIdx * 3 + processIndex * 3];
// The histogram is laid out as RR..RRGG..GGBB..BB (256 counters per channel).
result[baseResultIdx + bValue]++;
privateBuffer[bValue]++;
// G
bValue = bytes[basePixelIdx * 3 + processIndex * 3 + 1];
result[baseResultIdx + 256 + bValue]++;
privateBuffer[256 + bValue]++;
// B
bValue = bytes[basePixelIdx * 3 + processIndex * 3 + 2];
result[baseResultIdx + 512 + bValue]++;
privateBuffer[512 + bValue]++;
processIndex++;
}
// Publish this work item's histogram to its row of tempResult.
for (i = 0; i < RESULT_SIZE; i++) {
tempResult[baseResultIdx + i] = privateBuffer[i];
}
// NOTE(review): per the OpenCL specification a barrier synchronizes only the
// work items of one work-group, so the reduction below appears to rely on the
// whole launch being a single work-group -- TODO confirm with the host code.
barrier(CLK_GLOBAL_MEM_FENCE);
if (globalId >= lastGroup) {
// Sum column (globalId - lastGroup) over all totalTasks rows of tempResult.
for (i = 0; i < totalTasks; i++) {
finalResult[globalId - lastGroup] += tempResult[globalId - lastGroup + i * RESULT_SIZE];
}
}
}
View
@@ -9,19 +9,18 @@
# NOTE(review): this listing is a diff hunk whose +/- markers and indentation
# were stripped; removed and added lines appear back to back (two
# opencl_histogram signatures, two groupSize assignments). TODO confirm
# against the repository which line of each pair is current.

# CPU reference implementation: returns whatever img.histogram() returns.
def cpu_histogram(img):
return img.histogram()
def opencl_histogram(img):
def opencl_histogram(pixels):
# pixels is laid out as RGBRGBRGB..., one byte per channel value
pixels = image.tobytes()
# calculate buffer size
groupSize = 4
groupSize = 1
# binSize presumably mirrors BIN_SIZE (pixels per work item) in histogram.c -- verify.
binSize = 1024
# three bytes (R, G, B) per pixel
# NOTE(review): the print statement below suggests Python 2, where `/` on two
# ints floors; if so, every math.ceil() here receives an already-floored
# value and never rounds up -- confirm.
pixelSize = len(pixels) / 3
trunkSize = int(math.ceil(math.ceil(pixelSize / groupSize) / binSize))
globalSize = int(math.ceil(pixelSize / binSize))
# Pad globalSize to a multiple of groupSize. When it already is a multiple,
# this still adds one full groupSize (groupSize - 0).
globalSize += (groupSize - globalSize % groupSize)
# buffer size is 768(whole space) * group size * trunk size
outputBufSize = 768 * groupSize * trunkSize
print 'pixel count: {}, trunk count: {}, buffer size: {}, global size: {}'.format(pixelSize, trunkSize, outputBufSize, globalSize)
# create context/queue
clContext = cl.create_some_context()
clQueue = cl.CommandQueue(clContext)
@@ -33,23 +32,20 @@ def opencl_histogram(img):
# NOTE(review): diff hunk with +/- markers and indentation stripped. The first
# bufOutput / clProgram.histogram(...) / host-side summation sequence and the
# later bufTempResult / bufOutput sequence look like the removed and added
# versions respectively -- TODO confirm against the repository.
mf = cl.mem_flags
# Device-side views of the host pixel bytes and of the pixel count (one uint32).
bufPixels = cl.Buffer(clContext, mf.READ_ONLY | mf.USE_HOST_PTR, hostbuf=pixels)
bufPixelSize = cl.Buffer(clContext, mf.READ_ONLY | mf.USE_HOST_PTR, size=4, hostbuf=numpy.asarray([pixelSize]).astype(numpy.uint32))
# sizes are in bytes: 4 per uint32 counter
bufOutput = cl.Buffer(clContext, mf.WRITE_ONLY, size=outputBufSize * 4, hostbuf=None)
start_time = time()
clProgram.histogram(clQueue, (globalSize, ), (groupSize, ), bufPixels, bufPixelSize, bufOutput)
end_time = time()
print ('time: {}'.format(end_time - start_time))
semiFinal = numpy.zeros(outputBufSize, dtype=numpy.uint32)
evt = cl.enqueue_read_buffer(clQueue, bufOutput, semiFinal)
evt.wait()
# Fold the per-work-item rows into one 768-entry histogram on the host.
finalResult = [0] * 768
for i in range(outputBufSize):
finalResult[i % 768] += semiFinal[i]
# NOTE(review): bufTempResult holds outputBufSize (768 * groupSize * trunkSize)
# counters while the kernel is launched over globalSize work items; verify
# that globalSize * 768 never exceeds outputBufSize.
bufTempResult = cl.Buffer(clContext, mf.READ_WRITE, size=outputBufSize * 4, hostbuf=None)
bufOutput = cl.Buffer(clContext, mf.WRITE_ONLY, size=768 * 4, hostbuf=None)
# execute program
# NOTE(review): the local size is None here, which presumably leaves the
# work-group size to the OpenCL implementation -- confirm in the PyOpenCL docs.
clProgram.histogram(clQueue, (globalSize, ), None,
bufPixels, bufPixelSize, bufTempResult, bufOutput)
# read data back
finalResult = numpy.zeros(768, dtype=numpy.uint32)
cl.enqueue_read_buffer(clQueue, bufOutput, finalResult)
# wait for the queued kernel and read to complete before returning the array
clQueue.finish()
return finalResult
# command-line options: --input (image path) and --dump (print the histogram)
parser = argparse.ArgumentParser(description='Dump histogram data.')
parser.add_argument('--input', help='the input image')
parser.add_argument('--dump', help='dump the histogram')
args = parser.parse_args()
@@ -65,26 +61,29 @@ def opencl_histogram(img):
# NOTE(review): diff hunk with +/- markers and indentation stripped; the
# `histogram = ...` / `histogramC = ...` and `histogram = ...` / `histogramG = ...`
# pairs, and the two dump loops, look like removed-then-added lines -- TODO
# confirm against the repository.
print ('-' * 20)
# the histogram format is RRRR...RRGGGG...GGGBBB...BBB.
# Time the CPU reference implementation.
start_time = time()
histogram = cpu_histogram(image)
histogramC = cpu_histogram(image)
end_time = time()
print ('time elapsed with sequential CPU: {0}s'.format(end_time - start_time))
print ('-' * 20)
# Time the OpenCL implementation (the measured span includes the whole
# opencl_histogram call, not just the kernel).
start_time = time()
histogram = opencl_histogram(image)
histogramG = opencl_histogram(image.tobytes())
end_time = time()
print ('time elapsed with open cl: {0}s'.format(end_time - start_time))
# The histogram reported below is the CPU result; histogramG is not used in
# the lines visible here.
histogram = histogramC;
print ('-' * 20)
print ('file.mode: {}'.format(image.mode))
print ('file.size: {0}x{1}'.format(width, height))
print ('file.format: {}'.format(image.format))
print ('-' * 20)
print ('(size: {0})'.format(len(histogram)))
# One output line per intensity value i: counts of R == i, G == i and B == i.
for i in range(256):
print ('R: {0}, G: {0}, B: {0} => ({1}, {2}, {3})'.format(i,
histogram[i],
histogram[256 + i],
histogram[256 * 2 + i]))
# Same dump, printed only when --dump was given on the command line.
if args.dump is not None:
for i in range(256):
print ('R: {0}, G: {0}, B: {0} => ({1}, {2}, {3})'.format(i,
histogram[i],
histogram[256 + i],
histogram[256 * 2 + i]))
print ('=' * 20)

0 comments on commit 8f6a3fc

Please sign in to comment.