<a href="https://colab.research.google.com/github/linyuehzzz/hedetniemi_distance/blob/master/all_pair_distance_cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##**CUDA for all-pair distance algorithms**
CUDA parallelism for all-pair distance algorithms.  
Yue Lin (lin.3326 at osu.edu)  
Created: 6/12/2020

In [21]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


#### **Install packages** 

In [2]:
!pip install timeout-decorator

Collecting timeout-decorator
  Downloading https://files.pythonhosted.org/packages/07/1c/0d9adcb848f1690f3253dcb1c1557b6cf229a93e724977cb83f266cbd0ae/timeout-decorator-0.4.1.tar.gz
Building wheels for collected packages: timeout-decorator
  Building wheel for timeout-decorator (setup.py) ... [?25l[?25hdone
  Created wheel for timeout-decorator: filename=timeout_decorator-0.4.1-cp36-none-any.whl size=5021 sha256=121582383f71f6667722e6b6aa1fe65533da523314b4691f989139a270e80c65
  Stored in directory: /root/.cache/pip/wheels/f1/e6/ea/7387e3629cb46ba65140141f972745b823f4486c6fe884ccb8
Successfully built timeout-decorator
Installing collected packages: timeout-decorator
Successfully installed timeout-decorator-0.4.1


#### **CUDA device query** 

In [None]:
!nvcc --version
from numba import cuda
print(cuda.gpus)

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243
<Managed Device 0>


In [None]:
%cd /usr/local/cuda-10.1/samples/1_Utilities/deviceQuery
!ls

/usr/local/cuda-10.1/samples/1_Utilities/deviceQuery
deviceQuery.cpp  Makefile  NsightEclipse.xml  readme.txt


In [None]:
!make
!./deviceQuery

make: Nothing to be done for 'all'.
./deviceQuery Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "Tesla P100-PCIE-16GB"
  CUDA Driver Version / Runtime Version          10.1 / 10.1
  CUDA Capability Major/Minor version number:    6.0
  Total amount of global memory:                 16281 MBytes (17071734784 bytes)
  (56) Multiprocessors, ( 64) CUDA Cores/MP:     3584 CUDA Cores
  GPU Max Clock rate:                            1329 MHz (1.33 GHz)
  Memory Clock rate:                             715 Mhz
  Memory Bus Width:                              4096-bit
  L2 Cache Size:                                 4194304 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory: 

#### **Read graph data** 

##### Data from the original article

In [3]:
## [node i, node j, distance between node i and j]
## using data from example 1: San Francisco Bay Area Graph of Time-Distances (in minutes)
data = [[1, 2, 30], [1, 4, 30], [1, 9, 40],
        [2, 3, 25], [2, 4, 40], [3, 4, 50],
        [4, 5, 30], [4, 6, 20], [5, 7, 25],
        [6, 7, 20], [6, 9, 20], [7, 8, 25],
        [8, 9, 20]]
nodes = 9

##### Read random graph

In [31]:
%cd '/content/gdrive/My Drive/Colab Notebooks/hedetniemi_matrix_sum'

## Number of nodes (100/1,000/10,000/100,000/1,000,000)
nodes = 1000
print('Nodes: ', nodes)
## Total degree
degree = 3
print('Degree: ', degree)

data = []
with open('graph_n' + str(nodes) + '_d' + str(degree) + '.txt', 'r') as f:
  lines = f.read().splitlines()
  for line in lines:
    l = line.split()
    item = [int(l[0]), int(l[1]), float(l[2])]
    data.append(item)

print(data[0])

/content/gdrive/My Drive/Colab Notebooks/hedetniemi_matrix_sum
Nodes:  1000
Degree:  3
[609, 621, 18.019071417527243]


#### **Configure CUDA** 

In [32]:
import math

# number of streams
NUM_STREAMS = 5
# number of threads per block: 32、128、256
NUM_THREADS = 32

def get_cuda_execution_config(n):
  numStream = NUM_STREAMS
  numSegment = n // numStream
  dimBlock = (NUM_THREADS, NUM_THREADS)
  dimGrid = (math.ceil(numSegment / NUM_THREADS), math.ceil(numSegment / NUM_THREADS))

  return dimGrid, dimBlock, numStream, numSegment


dimGrid, dimBlock, numStream, numSegment = get_cuda_execution_config(nodes)
print('numStream: ', numStream)
print('numSegment: ', numSegment)
print('dimGrid: ', dimGrid)
print('dimBlock: ', dimBlock)

numStream:  5
numSegment:  200
dimGrid:  (7, 7)
dimBlock:  (32, 32)


#### **Hedetniemi distance** 

##### Construct distance matrix

In [None]:
from timeit import default_timer
from numba import cuda, njit
import numpy as np


@cuda.jit
def graph2dist(graph, dist_mtx, n):
  stride = cuda.gridDim.x * cuda.blockDim.x

  ## initialize distance matrix
  x, y = cuda.grid(2)
  for i in range(x, n, stride):
    for j in range(y, n, stride):
      dist_mtx[i,j] = np.inf

  ## calculate distance matrix
  x = cuda.grid(1)
  for i in range(x, graph.shape[0], stride):
    a = int(graph[i,0]) - 1
    b = int(graph[i,1]) - 1
    d = graph[i,2]
    dist_mtx[a,b] = d
    dist_mtx[b,a] = d
  
  ## set diagonal to 0
  y = cuda.grid(1)
  if y < n:
    dist_mtx[y,y] = 0.0


def distance_matrix(graph, n):
  ## copy data to device
  graph_device = cuda.to_device(graph)
  dist_mtx_device = cuda.device_array(shape=(n,n))

  ## calculate distance matrix
  graph2dist[dimGrid, dimBlock](graph_device, dist_mtx_device, n)
  
  ## copy data to host
  dist_mtx_host = dist_mtx_device.copy_to_host()
 
  return dist_mtx_host


## print time costs
try:
  start = default_timer()
  dist_mtx = distance_matrix(np.array(data), nodes)
  print(dist_mtx)
  stop = default_timer()
  print('Time: ', stop - start)
except:
  print('Time: inf')
  raise

[[ 0. 30. inf 30. inf inf inf inf 40.]
 [30.  0. 25. 40. inf inf inf inf inf]
 [inf 25.  0. 50. inf inf inf inf inf]
 [30. 40. 50.  0. 30. 20. inf inf inf]
 [inf inf inf 30.  0. inf 25. inf inf]
 [inf inf inf 20. inf  0. 20. inf 20.]
 [inf inf inf inf 25. 20.  0. 25. inf]
 [inf inf inf inf inf inf 25.  0. 20.]
 [40. inf inf inf inf 20. inf 20.  0.]]
Time:  0.2706781830002001


##### Calculate Hedetniemi Matrix Sum

In [None]:
from timeit import default_timer
from numba import cuda, njit, float32
from operator import *
import numpy as np

@cuda.jit
def init_mtx(matrix, mtx_a_t_1, mtx_a_t, n):
  # initialize distance matrix
  x, y = cuda.grid(2)
  if x < n and y < n:
    mtx_a_t[x,y] = np.inf
    mtx_a_t_1[x,y] = matrix[x,y]

@cuda.jit
def cal_mtx(matrix, mtx_a_t_1, mtx_a_t, n):
  # bpg = cuda.gridDim.x
  # tpb = cuda.blockDim.x

  # stride = cuda.gridDim.x * cuda.blockDim.x  
  # tx = cuda.threadIdx.x
  # ty = cuda.threadIdx.y
  # sA = cuda.shared.array(shape=(NUM_THREADS, NUM_THREADS), dtype=float32)
  # sB = cuda.shared.array(shape=(NUM_THREADS, NUM_THREADS), dtype=float32)

  # calculate hedetniemi matrix sum
  x, y = cuda.grid(2)
  if x < n and y < n:
    summ = np.inf
    z = cuda.grid(1)
    if z < n:
      summ = min(summ, mtx_a_t_1[x, z] + matrix[z, y])
    mtx_a_t[x,y] = summ
    # summ = np.inf
    # for i in range(bpg):
    #   sA[tx, ty] = mtx_a_t_1[x, ty + i * tpb]
    #   sB[tx, ty] = matrix[tx + i * tpb, y]
    #   cuda.syncthreads()
    #   for j in range(tpb):
    #     summ = min(summ, sA[tx, j] + sB[j, ty])
    #     cuda.syncthreads()
    # mtx_a_t[x,y] = summ

  # x, y = cuda.grid(2)
  # if x < n and y < n:
  #   mtx_a_t_1[x,y] = mtx_a_t[x,y]    


def hede_distance(matrix, n):
  ## copy data to device
  matrix_device = cuda.to_device(matrix)
  mtx_a_t_1_device = cuda.device_array(shape=(n,n))
  mtx_a_t_device = cuda.device_array(shape=(n,n))

  ## initialize hedetniemi distance
  init_mtx[dimGrid, dimBlock](matrix_device, mtx_a_t_1_device, mtx_a_t_device, n)

  ## calculate hedetniemi distance
  # for p in range(n):
  cal_mtx[dimGrid, dimBlock](matrix_device, mtx_a_t_1_device, mtx_a_t_device, n)
  print(mtx_a_t_1_device.copy_to_host()[0])
  print(mtx_a_t_device.copy_to_host()[0])
  cal_mtx[dimGrid, dimBlock](matrix_device, mtx_a_t_1_device, mtx_a_t_device, n)
  print(mtx_a_t_1_device.copy_to_host()[0])
  print(mtx_a_t_device.copy_to_host()[0])
  
  ## copy data to host
  mtx_a_t_host = mtx_a_t_device.copy_to_host()
 
  return mtx_a_t_host


## print time costs
try:
  start = default_timer()
  mtx_a_t = hede_distance(dist_mtx, nodes)
  print(mtx_a_t)
  stop = default_timer()
  print('Time: ', stop - start)
except:
  print('Time: inf')
  raise

## print shortest path matrix
with open('hedet_mtx_nb_cuda.txt', 'w') as fw:
  fw.write('\n'.join(['\t'.join([str(cell) for cell in row]) for row in mtx_a_t.tolist()]))

[ 0. 30. inf 30. inf inf inf inf 40.]
[ 0. 30. inf 30. inf inf inf inf 40.]
[ 0. 30. inf 30. inf inf inf inf 40.]
[ 0. 30. inf 30. inf inf inf inf 40.]
[[ 0. 30. inf 30. inf inf inf inf 40.]
 [30.  0. 25. 40. inf inf inf inf inf]
 [inf 25.  0. 50. inf inf inf inf inf]
 [30. 40. 50.  0. 30. 20. inf inf inf]
 [inf inf inf 30.  0. inf 25. inf inf]
 [inf inf inf 20. inf  0. 20. inf 20.]
 [inf inf inf inf 25. 20.  0. 25. inf]
 [inf inf inf inf inf inf 25.  0. 20.]
 [40. inf inf inf inf 20. inf 20.  0.]]
Time:  0.28587139699993713


#### **Floyd–Warshall distance** 

##### Construct distance matrix

In [39]:
from timeit import default_timer
from numba import cuda, njit
import numpy as np


@cuda.jit
def graph2dist(graph, dist_mtx, n):
  stride = cuda.gridDim.x * cuda.blockDim.x

  ## initialize distance matrix
  x, y = cuda.grid(2)
  for i in range(x, n, stride):
    for j in range(y, n, stride):
      dist_mtx[i,j] = np.inf

  ## calculate distance matrix
  x = cuda.grid(1)
  for i in range(x, graph.shape[0], stride):
    a = int(graph[i,0]) - 1
    b = int(graph[i,1]) - 1
    d = graph[i,2]
    dist_mtx[a,b] = d
    dist_mtx[b,a] = d
  
  ## set diagonal to 0
  y = cuda.grid(1)
  if y < n:
    dist_mtx[y,y] = 0.0


def distance_matrix(graph, n):
  ## copy data to device
  graph_device = cuda.to_device(graph)
  dist_mtx_device = cuda.device_array(shape=(n,n))

  ## calculate distance matrix
  graph2dist[dimGrid, dimBlock](graph_device, dist_mtx_device, n)
  
  ## copy data to host
  dist_mtx_host = dist_mtx_device.copy_to_host()
 
  return dist_mtx_host


## print time costs
try:
  start = default_timer()
  dist_mtx = distance_matrix(np.array(data), nodes)
  stop = default_timer()
  print('Time: ', stop - start)
except:
  print('Time: inf')
  raise

Time:  0.30542288800006645


##### Calculate Floyd–Warshall distance

In [43]:
from timeit import default_timer
from numba import cuda, njit
from operator import *
import numpy as np

@cuda.jit
def all_pair_floyd(matrix, k, n):
  stride = cuda.gridDim.x * cuda.blockDim.x
  x, y = cuda.grid(2)
  if x < n and y < n:
    matrix[x,y] = min(matrix[x,y], matrix[x,k] + matrix[k,y])
  # for i in range(x, n, stride):
  #   for j in range(y, n, stride):
  #     matrix[i,j] = min(matrix[i,j], matrix[i,k] + matrix[k,j])


def floyd_distance(matrix, n):
  ## copy data to device
  matrix_device = cuda.to_device(matrix)

  ## calculate hedetniemi distance
  for k in range(n):
    all_pair_floyd[dimGrid, dimBlock](matrix_device, k, n)
  
  ## copy data to host
  matrix_host = matrix_device.copy_to_host()
 
  return matrix


# print time costs
try:
  start = default_timer()
  mtx_a_t = floyd_distance(dist_mtx, nodes)
  stop = default_timer()
  print('Time: ', stop - start)
except:
  print('Time: inf')
  raise

Time:  0.31029173400020227


#### **Compare results** 

In [None]:
!diff 'hedet_mtx_list.txt' 'hedet_mtx_nb_cuda.txt'