Updated the vcnn

jimmy-ren · Oct 6, 2015 · d7ef6a7 · d7ef6a7
1 parent 213a512
commit d7ef6a7
Show file tree

Hide file tree

Showing 119 changed files with 2,503 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 vcnn
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/cuda/compile_kernels.m b/cuda/compile_kernels.m
@@ -0,0 +1,11 @@
+function compile_kernels()
+    % Build the PTX for the CUDA sources that live next to this file.
+    % The folder is taken from this file's own full path, so the build does
+    % not depend on the current working directory.
+    here = fileparts(mfilename('fullpath'));
+    compile_(here, 'im2col.cu');
+end
+
+
+function compile_(dirname, cu_fn)
+    % Compile one CUDA source file to PTX by running `nvcc --ptx` from
+    % inside its directory.
+    %
+    %   dirname  directory to change into before invoking nvcc
+    %   cu_fn    name of the .cu file, relative to dirname
+    %
+    % Both paths are double-quoted so that directories or file names
+    % containing spaces survive the shell. A non-zero exit status is reported
+    % as a warning instead of being dropped silently; nvcc's own output is
+    % still echoed to the command window by system().
+    cmd = sprintf('(cd "%s" && nvcc --ptx "%s")', dirname, cu_fn);
+    status = system(cmd);
+    if status ~= 0
+        warning('vcnn:compile_kernels:nvccFailed', ...
+                'Command failed with exit status %d: %s', status, cmd);
+    end
+end
diff --git a/cuda/get_kernel.m b/cuda/get_kernel.m
@@ -0,0 +1,23 @@
+function ker = get_kernel(ker_name, cu_filename)
+% GET_KERNEL  Return the kernel record for KER_NAME, loading it on first use.
+%
+%   ker_name     base name of the kernel entry point; a precision suffix
+%                ('_d' or '_f') is appended according to data.gpu.float_t
+%   cu_filename  name of the .cu file, looked up under the relative folder
+%                'cuda' (the matching .ptx is expected alongside it)
+%
+% The returned struct has the fields name, ker (the CUDAKernel object),
+% block_size and grid_size. Records are cached in the global
+% data.gpu.kernels, keyed by the suffixed kernel name.
+
+global data;
+
+% Select the entry point that matches the configured floating-point type.
+% Any other float_t value leaves the name without a suffix.
+switch data.gpu.float_t
+case 'single'
+    ker_name = [ker_name '_f'];
+case 'double'
+    ker_name = [ker_name '_d'];
+end
+
+% Load the kernel only if it has not been cached yet.
+if ~isfield(data.gpu.kernels, ker_name)
+    cu_path = fullfile('cuda', cu_filename);
+    ptx_path = regexprep(cu_path, '\.cu$', '.ptx');
+    kernel_obj = parallel.gpu.CUDAKernel(ptx_path, cu_path, ker_name);
+    data.gpu.kernels.(ker_name) = struct( ...
+        'name', ker_name, ...
+        'ker', kernel_obj, ...
+        'block_size', kernel_obj.ThreadBlockSize, ...
+        'grid_size', kernel_obj.GridSize);
+end
+
+ker = data.gpu.kernels.(ker_name);
diff --git a/cuda/im2col.cu b/cuda/im2col.cu
@@ -0,0 +1,102 @@
+/* Element (i1, i2, i3) of the flat array X viewed as an n1 x n2 x n3 array
+ * in which i3 varies fastest. n1 is accepted for readability at the call
+ * site only; it does not enter the offset computation. */
+#define IDX3(X, n1, n2, n3, i1, i2, i3) (X[(i1)*((n2)*(n3)) + (i2)*(n3) + (i3)])
+
+/*
+ * Copy p_nj x p_ni windows ("patches") of a stack of images into a patch
+ * buffer.
+ *
+ *   im       input, indexed as [nimgs][im_nj][im_ni] (i fastest)
+ *   patches  output, indexed as [npatches][p_nj][p_ni] (i fastest)
+ *
+ * Patch number `patch` belongs to image patch / (npatches / nimgs); inside
+ * that image the patch origins are enumerated with i fastest over
+ * (im_ni - p_ni + 1) positions per row.
+ * NOTE(review): this assumes npatches ==
+ * nimgs * (im_nj - p_nj + 1) * (im_ni - p_ni + 1) -- confirm with the caller.
+ *
+ * Each thread starts at its global 1-D thread index and advances by the
+ * total number of launched threads until all patches are handled, so any 1-D
+ * launch configuration covers the whole range.
+ *
+ * Fix over the previous version: the row origin was computed from the global
+ * patch index rather than the index within the image, which is only correct
+ * for the first image. Degenerate sizes that would divide by zero now return
+ * without touching memory.
+ */
+template<class T>
+__device__ void im2col_ker(const T *im, T *patches,
+                           int im_ni, int im_nj, int nimgs,
+                           int p_ni, int p_nj, int npatches)
+{
+    int valid_ni = im_ni - p_ni + 1;  /* patch origins per image row */
+    if (nimgs <= 0 || valid_ni <= 0)
+        return;
+
+    int patches_per_img = npatches / nimgs;
+    if (patches_per_img <= 0)
+        return;
+
+    int total_threads = gridDim.x * blockDim.x;
+    int patch = blockIdx.x * blockDim.x + threadIdx.x;
+
+    for (; patch < npatches; patch += total_threads) {
+        int im_k = patch / patches_per_img;   /* image index */
+        int local = patch % patches_per_img;  /* patch index inside image */
+        int im_j0 = local / valid_ni;         /* patch topleft j in image */
+        int im_i0 = local % valid_ni;         /* patch topleft i in image */
+
+        for (int pj = 0; pj < p_nj; ++pj) {
+            for (int pi = 0; pi < p_ni; ++pi) {
+                IDX3(patches, npatches, p_nj, p_ni,
+                     patch, pj, pi)
+                    = IDX3(im, nimgs, im_nj, im_ni,
+                           im_k, im_j0 + pj, im_i0 + pi);
+            }
+        }
+    }
+}
+
+
+/*
+ * Inverse gather of im2col_ker with summation: every image pixel receives
+ * the sum of all patch entries that cover it.
+ *
+ *   im       output, indexed as [nimgs][im_nj][im_ni] (i fastest)
+ *   patches  input, indexed as [npatches][p_nj][p_ni] (i fastest)
+ *
+ * For pixel (im_j, im_i) of image im_k and each offset (pj, pi) inside a
+ * patch, the patch whose top-left corner is (im_j - pj, im_i - pi) is looked
+ * up; corners outside the valid origin range are skipped. Patches of one
+ * image are enumerated with i fastest over valid_ni origins per row.
+ *
+ * Each thread starts at its global 1-D thread index and advances by the
+ * total number of launched threads until all pixels are written; one thread
+ * owns one output pixel, so no atomics are needed.
+ *
+ * Fix over the previous version: the pixel row was computed as
+ * pixel / im_ni without reducing it modulo im_nj, which is only correct for
+ * the first image. nimgs <= 0 now returns early instead of dividing by zero.
+ */
+template<class T>
+__device__ void scol2im_ker(T *im, const T *patches,
+                            int im_ni, int im_nj, int nimgs,
+                            int p_ni, int p_nj, int npatches)
+{
+    if (nimgs <= 0)
+        return;
+
+    int total_threads = gridDim.x * blockDim.x;
+    int pixel = blockIdx.x * blockDim.x + threadIdx.x;
+    int valid_nj = im_nj - p_nj + 1;  /* patch origins per image column */
+    int valid_ni = im_ni - p_ni + 1;  /* patch origins per image row */
+    int npixels = nimgs * im_nj * im_ni;
+    int patches_per_img = npatches / nimgs;
+
+    for (; pixel < npixels; pixel += total_threads) {
+        T x = 0;
+
+        int im_k = pixel / (im_ni * im_nj);  /* image index */
+        int im_j = (pixel / im_ni) % im_nj;  /* pixel row inside image */
+        int im_i = pixel % im_ni;            /* pixel column inside image */
+
+        for (int pj = 0; pj < p_nj; ++pj) {
+            for (int pi = 0; pi < p_ni; ++pi) {
+                int im_pj = im_j - pj;  /* topleft of patch in image */
+                int im_pi = im_i - pi;  /* topleft of patch in image */
+                if (im_pi < 0 || im_pj < 0 ||
+                    im_pj >= valid_nj || im_pi >= valid_ni)
+                    continue;
+
+                int patch = im_k * patches_per_img + im_pj * valid_ni + im_pi;
+                x += IDX3(patches, npatches, p_nj, p_ni,
+                          patch, pj, pi);
+            }
+        }
+
+        IDX3(im, nimgs, im_nj, im_ni,
+             im_k, im_j, im_i) = x;
+    }
+}
+
+/* Double-precision kernel entry point: forwards every argument unchanged to
+ * im2col_ker, whose element type is deduced as double from the pointers. */
+__global__ void im2col_d(const double *im, double *patches,
+                         int im_ni, int im_nj, int nimgs,
+                         int p_ni, int p_nj, int npatches)
+{
+    im2col_ker(im, patches, im_ni, im_nj, nimgs, p_ni, p_nj, npatches);
+}
+
+/* Double-precision kernel entry point: forwards every argument unchanged to
+ * scol2im_ker, whose element type is deduced as double from the pointers. */
+__global__ void scol2im_d(double *im, const double *patches,
+                          int im_ni, int im_nj, int nimgs,
+                          int p_ni, int p_nj, int npatches)
+{
+    scol2im_ker(im, patches, im_ni, im_nj, nimgs, p_ni, p_nj, npatches);
+}
+
+/* Single-precision kernel entry point: forwards every argument unchanged to
+ * im2col_ker, whose element type is deduced as float from the pointers. */
+__global__ void im2col_f(const float *im, float *patches,
+                         int im_ni, int im_nj, int nimgs,
+                         int p_ni, int p_nj, int npatches)
+{
+    im2col_ker(im, patches, im_ni, im_nj, nimgs, p_ni, p_nj, npatches);
+}
+
+/* Single-precision kernel entry point: forwards every argument unchanged to
+ * scol2im_ker, whose element type is deduced as float from the pointers. */
+__global__ void scol2im_f(float *im, const float *patches,
+                          int im_ni, int im_nj, int nimgs,
+                          int p_ni, int p_nj, int npatches)
+{
+    scol2im_ker(im, patches, im_ni, im_nj, nimgs, p_ni, p_nj, npatches);
+}
+