Permalink
Browse files

Sorting numbers on GPU using CUDA

Signed-off-by: Jan Stępień <jstepien@users.sourceforge.net>
  • Loading branch information...
1 parent a89b701 commit 83fa84b4d836cb407d44cfe037c2bc9dad78de7e @jstepien committed May 27, 2010
Showing with 96 additions and 3 deletions.
  1. +13 −3 Makefile
  2. +45 −0 gpusort.cu
  3. +14 −0 gpusort.h
  4. +24 −0 redis.c
View
@@ -2,6 +2,8 @@
# Copyright (C) 2009 Salvatore Sanfilippo <antirez at gmail dot com>
# This file is released under the BSD license, see the COPYING file
+.SUFFIXES: .cu
+
release_hdr := $(shell sh -c './mkreleasehdr.sh')
uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
OPTIMIZATION?=-O2
@@ -12,10 +14,13 @@ else
CFLAGS?= -std=c99 -pedantic $(OPTIMIZATION) -Wall -W $(ARCH) $(PROF)
CCLINK?= -lm -pthread
endif
+CUDALINK= -lcudart -lcudpp
CCOPT= $(CFLAGS) $(CCLINK) $(ARCH) $(PROF)
DEBUG?= -g -rdynamic -ggdb
+NVCC?=nvcc
+NVCCFLAGS?=$(PROF) -g
-OBJ = adlist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o
+OBJ = adlist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o gpusort.o
BENCHOBJ = ae.o anet.o redis-benchmark.o sds.o adlist.o zmalloc.o
CLIOBJ = anet.o sds.o adlist.o redis-cli.o zmalloc.o linenoise.o
CHECKDUMPOBJ = redis-check-dump.o lzf_c.o lzf_d.o
@@ -50,13 +55,15 @@ redis-check-dump.o: redis-check-dump.c lzf.h
redis-cli.o: redis-cli.c fmacros.h anet.h sds.h adlist.h zmalloc.h \
linenoise.h
redis.o: redis.c fmacros.h config.h redis.h ae.h sds.h anet.h dict.h \
- adlist.h zmalloc.h lzf.h pqsort.h zipmap.h staticsymbols.h sha1.h
+ adlist.h zmalloc.h lzf.h pqsort.h zipmap.h staticsymbols.h sha1.h \
+ gpusort.h
sds.o: sds.c sds.h zmalloc.h
zipmap.o: zipmap.c zmalloc.h
zmalloc.o: zmalloc.c config.h
+gpusort.o: gpusort.cu gpusort.h
redis-server: $(OBJ)
- $(CC) -o $(PRGNAME) $(CCOPT) $(DEBUG) $(OBJ)
+ $(CC) -o $(PRGNAME) $(CCOPT) $(DEBUG) $(OBJ) $(CUDALINK)
@echo ""
@echo "Hint: To run 'make test' is a good idea ;)"
@echo ""
@@ -76,6 +83,9 @@ redis-check-aof: $(CHECKAOFOBJ)
.c.o:
$(CC) -c $(CFLAGS) $(DEBUG) $(COMPILE_TIME) $<
+.cu.o:
+ $(NVCC) -c $(NVCCFLAGS) $< -o $@
+
clean:
rm -rf $(PRGNAME) $(BENCHPRGNAME) $(CLIPRGNAME) $(CHECKDUMPPRGNAME) $(CHECKAOFPRGNAME) *.o *.gcda *.gcno *.gcov
View
@@ -0,0 +1,45 @@
+#include <cudpp.h>
+#include <stdio.h>
+#include "gpusort.h"
+
+#define check_cuda_error() {\
+ if (cudaError_t e = cudaGetLastError()) { \
+ fprintf(stderr, "%s:%i: %s\n", __FILE__, __LINE__, \
+ cudaGetErrorString(e)); \
+ exit(-1); \
+ } }
+
+void runSortingKernel(float *keys, int *values, int n) {
+ float *d_keys = 0;
+ int *d_values = 0;
+ CUDPPConfiguration config;
+ size_t keys_bytes = n * sizeof(*d_keys),
+ values_bytes = n * sizeof(*d_values);
+ config.algorithm = CUDPP_SORT_RADIX;
+ config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
+ config.datatype = CUDPP_FLOAT;
+ CUDPPHandle planhandle = 0;
+ CUDPPResult result = cudppPlan(&planhandle, config, n, 1, 0);
+ if (CUDPP_SUCCESS != result) {
+ fprintf(stderr, "Error creating CUDPPPlan\n");
+ exit(-1);
+ }
+ cudaMalloc((void**) &d_keys, keys_bytes);
+ check_cuda_error();
+ cudaMalloc((void**) &d_values, values_bytes);
+ check_cuda_error();
+ cudaMemcpy(d_keys, keys, keys_bytes, cudaMemcpyHostToDevice);
+ check_cuda_error();
+ cudaMemcpy(d_values, values, values_bytes, cudaMemcpyHostToDevice);
+ check_cuda_error();
+ cudppSort(planhandle, d_keys, d_values, sizeof(*d_keys) * 8, n);
+ check_cuda_error();
+ cudppDestroyPlan(planhandle);
+ check_cuda_error();
+ cudaMemcpy(values, d_values, values_bytes, cudaMemcpyDeviceToHost);
+ check_cuda_error();
+ cudaThreadSynchronize();
+ check_cuda_error();
+ cudaFree(d_keys);
+ cudaFree(d_values);
+}
View
@@ -0,0 +1,14 @@
+#ifndef GPUSORT_H_
+#define GPUSORT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void runSortingKernel(float *keys, int *values, int n);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GPUSORT_H_ */
View
24 redis.c
@@ -77,6 +77,7 @@
#include "zipmap.h" /* Compact dictionary-alike data structure */
#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
#include "release.h" /* Release and/or git repository information */
+#include "gpusort.h"
/* Error codes */
#define REDIS_OK 0
@@ -6935,6 +6936,27 @@ static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
return o;
}
+static inline void gpuSort(redisSortObject *vector, int vectorlen) {
+ float *keys = zmalloc(sizeof(float)*vectorlen);
+ robj **values = zmalloc(sizeof(robj*)*vectorlen);
+ int i;
+ if (server.sort_desc)
+ for (i = 0; i < vectorlen; ++i) {
+ keys[i] = -vector[i].u.score;
+ values[i] = vector[i].obj;
+ }
+ else
+ for (i = 0; i < vectorlen; ++i) {
+ keys[i] = vector[i].u.score;
+ values[i] = vector[i].obj;
+ }
+ runSortingKernel(keys, (int*) values, vectorlen);
+ zfree(keys);
+ for (i = 0; i < vectorlen; ++i)
+ vector[i].obj = values[i];
+ zfree(values);
+}
+
/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
* the additional parameter is not standard but a BSD-specific we have to
* pass sorting parameters via the global 'server' structure */
@@ -7144,6 +7166,8 @@ static void sortCommand(redisClient *c) {
server.sort_bypattern = sortby ? 1 : 0;
if (sortby && (start != 0 || end != vectorlen-1))
pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
+ else if (!alpha && vectorlen >= 50000)
+ gpuSort(vector, vectorlen);
else
qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
}

0 comments on commit 83fa84b

Please sign in to comment.