Import libraries

In [8]:
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np

Defining kernel code

In [9]:
# CUDA C source for the distance kernel; it is compiled at runtime by SourceModule.
# Each thread computes the distance for a single row of `a`, so the launch must
# provide at least `vec_num` threads in total along x.
# Fixes relative to the earlier version:
#   * the guard is `>=`, because valid row indices are 0 .. vec_num-1; with `>` the
#     thread whose index equals vec_num read and wrote one row out of bounds;
#   * the __syncthreads() call inside the loop is gone: the kernel uses no shared
#     memory, and a barrier reached after some threads returned early is undefined.
kernel_code = """
  // Euclidean distance between the query vector b and every row of a.
  //   a          : vec_num rows of vec_length floats, stored one row after another
  //   b          : query vector of vec_length floats
  //   c          : output, one distance per row of a (vec_num floats)
  //   vec_length : number of components per vector
  //   vec_num    : number of rows in a
  __global__ void euclidean_distance( const float* a, const float* b, float* c, int vec_length, int vec_num) {

    int thread_index = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads beyond the last row have nothing to do.
    if ( thread_index >= vec_num ) {
      return;
    }

    float difference = 0;

    // Offset of this thread's row inside the flattened array a.
    int row_start = vec_length*thread_index;

    for ( int ii = 0; ii < vec_length; ++ii ) {

      float term_diff = b[ii] - a[ row_start + ii ];
      difference += term_diff*term_diff;

    }

    c[ thread_index ] = sqrtf( difference );

  }
"""

Read data

In [10]:
# Load the comma-separated training file as float32. In every row the last
# column is taken as the label and all preceding columns as the feature vector.
data = np.loadtxt( "MLoGPU_data1_train.csv", dtype=np.float32, delimiter="," )
labels = np.array( data[ :, -1 ] )
training_vecs = np.array( data[ :, :-1 ] )

# To try the code on a smaller batch of data, uncomment the two lines below:
#training_vecs = training_vecs[ 0:3, : ]
#print( training_vecs )

Defining constants and block and grid dims

In [11]:
# Problem size: number of training vectors and number of features per vector.
N_training, N_features = training_vecs.shape[:2]

# One thread per training vector, capped at 1024 threads per block.
THREADS = min( N_training, 1024 )

# Number of blocks needed to cover all training vectors (ceiling division).
gridDim = ( N_training + THREADS - 1 ) // THREADS

block_dims = ( THREADS, 1, 1 )
grid_dims = ( gridDim, 1, 1 )

# Show the resulting launch configuration.
for value in ( N_training, N_features, gridDim, block_dims, grid_dims ):
  print( value )

4000
10
4
(1024, 1, 1)
(4, 1, 1)


Create test point and allocate memory on the GPU

In [12]:
# Query point: a row vector of ones with exactly as many components as a training
# vector. The length is taken from the data instead of being hard-coded, because
# the kernel reads one test-point component per feature and would otherwise read
# past the end of the buffer for any dataset that does not have 10 features.
test_point = np.ones( [ 1, training_vecs.shape[1] ], dtype = np.float32 )

print( test_point )

# Device buffer for the training vectors, filled from the host array.
training_vecs_gpu = cuda.mem_alloc( training_vecs.nbytes )
cuda.memcpy_htod( training_vecs_gpu, training_vecs )

# Device buffer for the test point, filled from the host array.
test_point_gpu = cuda.mem_alloc( test_point.nbytes )
cuda.memcpy_htod( test_point_gpu, test_point )

# Host array that will receive one distance per training vector, plus a device
# buffer of the same size for the kernel to write into.
distances = np.empty( [ training_vecs.shape[0], 1 ], dtype = np.float32 )
distances_gpu = cuda.mem_alloc( distances.nbytes )

[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]


Compiling the kernel code and running the kernel

In [13]:
# Build the CUDA module from the source string and look up the kernel by name.
modd = SourceModule( kernel_code )
dist_fun = modd.get_function( "euclidean_distance" )

# Kernel arguments in the order of the kernel signature; the two sizes are
# wrapped in np.int32 to match the kernel's int parameters.
kernel_args = ( training_vecs_gpu, test_point_gpu, distances_gpu,
                np.int32( N_features ), np.int32( N_training ) )

# Launch with the block and grid dimensions computed earlier.
dist_fun( *kernel_args, block = block_dims, grid = grid_dims )

# Copy the computed distances from the device buffer back into the host array.
cuda.memcpy_dtoh( distances, distances_gpu )