<a href="https://colab.research.google.com/github/ismirnov56/MatmultCUDA/blob/ismirnov56-patch-1/2Numba.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 2.2 Numba

Из-за конфликта в настройках Nvidia было решено разнести реализации по разным ноутбукам.

Установка:

In [None]:
!apt-get install nvidia-cuda-toolkit
!pip install numba

In [None]:
# Search the whole filesystem (case-insensitively) for the libdevice directory
# and the NVVM shared library; the paths found are used in the next cell.
!find / -iname 'libdevice'
!find / -iname 'libnvvm.so'

/usr/local/cuda-10.1/nvvm/libdevice
/usr/local/cuda-10.0/nvvm/libdevice
/usr/lib/nvidia-cuda-toolkit/libdevice
/usr/lib/cuda/nvvm/libdevice
/usr/local/cuda-10.1/nvvm/lib64/libnvvm.so
/usr/local/cuda-10.0/nvvm/lib64/libnvvm.so


In [None]:
import os

# Export the locations of libdevice and libnvvm (CUDA 10.1 install) through
# the NUMBAPRO_* environment variables.
# NOTE(review): recent Numba releases may ignore NUMBAPRO_* in favour of
# CUDA_HOME - confirm against the installed Numba version.
os.environ.update(
    NUMBAPRO_LIBDEVICE="/usr/local/cuda-10.1/nvvm/libdevice",
    NUMBAPRO_NVVM="/usr/local/cuda-10.1/nvvm/lib64/libnvvm.so",
)

In [None]:
import numba.cuda.api
import numba.cuda.cudadrv.libs
# Run Numba's CUDA library self-test; judging by the output below it reports,
# for each CUDA library, where it was found and whether it could be opened.
numba.cuda.cudadrv.libs.test()


Finding cublas from System
	located at libcublas.so
	trying to open library...	ok
Finding cusparse from System
	located at /usr/local/cuda/lib64/libcusparse.so.10.3.0.243
	trying to open library...	ok
Finding cufft from System
	located at /usr/local/cuda/lib64/libcufft.so.10.1.1.243
	trying to open library...	ok
Finding curand from System
	located at /usr/local/cuda/lib64/libcurand.so.10.1.1.243
	trying to open library...	ok
Finding nvvm from System
	located at /usr/local/cuda/nvvm/lib64/libnvvm.so.3.3.0
	trying to open library...	ok
Finding libdevice from System
	searching for compute_20...	ok
	searching for compute_30...	ok
	searching for compute_35...	ok
	searching for compute_50...	ok


True

In [None]:
# List the CUDA devices visible to Numba and whether each one is supported.
numba.cuda.api.detect()

Found 1 CUDA devices
id 0             b'Tesla T4'                              [SUPPORTED]
                      compute capability: 7.5
                           pci device id: 4
                              pci bus id: 0
Summary:
	1/1 devices are supported


True

### 2.2.1 Реализация «в лоб»
Аналогично PyCuda

In [None]:
from numba import cuda, float32
import numpy as np
import math
from time import time

# Computations are performed on TPB x TPB blocks of elements
# (TPB is used below as the thread-block edge length).
TPB = 32
# Edge length of the square n x n matrices used for the correctness check.
n = 10

@cuda.jit
def matmul(A, B, C):
    """Naive matrix product C = A @ B, one thread per element of C.

    Parameters
    ----------
    A : 2-D device array, indexed as A[row, k].
    B : 2-D device array, indexed as B[k, col].
    C : 2-D device array that receives the product, indexed as C[row, col].

    The kernel expects a 2-D launch in which the grid's x axis covers the
    columns of C and the y axis covers its rows. Threads that fall outside
    C do nothing.
    """
    # Absolute thread position: x indexes columns of C, y indexes rows.
    col, row = cuda.grid(2)

    # All sizes come from the arrays themselves. Numba treats module-level
    # globals as compile-time constants, so a global size such as `n` would be
    # frozen at the value it had on the first launch and silently ignored
    # when it is rebound later (e.g. by a benchmark loop).
    if row < C.shape[0] and col < C.shape[1]:
        tmp = 0.
        for j in range(A.shape[1]):
            tmp += A[row, j] * B[j, col]
        C[row, col] = tmp

# Build random float32 inputs and the reference product on the host.
A = np.random.randn(n, n).astype(np.float32)
B = np.random.randn(n, n).astype(np.float32)
C = np.dot(A, B)

# The kernel maps grid x to columns of the result and grid y to its rows,
# so x is sized from B's column count and y from A's row count.
threadsperblock = (TPB, TPB)
blockspergrid_x = int(math.ceil(B.shape[1] / threadsperblock[0]))
blockspergrid_y = int(math.ceil(A.shape[0] / threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)

A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)
# Uninitialised output buffer on the device (default dtype).
C_global_mem = cuda.device_array((A.shape[0], B.shape[1]))

matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)
res = C_global_mem.copy_to_host()
print(n)
print('-'*80)
print(C)
print('-'*80)
print(res)
print('-'*80)
# Numeric comparison with the NumPy reference instead of comparing by eye.
print('matches np.dot:', np.allclose(res, C, atol=1e-4))

10
--------------------------------------------------------------------------------
[[ 1.8246052  -1.7834735  -3.0392675   2.7423606   4.322127    0.80730397
  -2.6904116  -1.042314    0.5716245   3.1254756 ]
 [ 0.89561504  5.0263715  -5.772845   -1.3087717   0.12531497  1.8100029
  -3.4151542  -0.6627056  -2.1695244  -1.2350144 ]
 [ 3.2640185  -1.6468567  -1.628558    0.4240253  -0.95938325  0.04865081
   1.8554276   3.1296644   0.06229217  0.22955242]
 [ 3.707739    0.64840996 -1.6767858  -3.63755     5.5408792   4.794858
   1.751004    0.62051094  2.0569193   1.1650298 ]
 [ 1.1197258  -2.1967242   2.4276955  -1.381544   -0.52810615 -2.4558988
   0.73375964  2.1939716   0.29488993  1.3896816 ]
 [-1.399037    0.65601075  1.2020674  -0.9390956   0.63052434  1.9789646
   0.09143455 -2.7301276   1.239572    0.9596796 ]
 [ 1.4581041   1.3057268   1.3085347  -0.14379077 -2.7723215  -0.02241931
   2.8070986   0.94326115 -0.7244338  -1.7351402 ]
 [ 1.6119689  -3.2586377   0.43513754  2.53151

Проверка работоспособности прошла успешно, вычислим время.

In [None]:
# Warm-up launch on a tiny input so that JIT compilation of the float32
# specialisation happens before anything is timed.
_warm = cuda.to_device(np.zeros((TPB, TPB), dtype=np.float32))
matmul[(1, 1), (TPB, TPB)](_warm, _warm, cuda.device_array((TPB, TPB), dtype=np.float32))
cuda.synchronize()

# Average wall-clock time (seconds) of 10 runs for each n = 100, 200, ..., 2000.
# Each timed run covers host->device copies, the kernel and the copy back.
times_numba = []
for n in range(100, 2001, 100):
    A = np.random.randn(n, n).astype(np.float32)
    B = np.random.randn(n, n).astype(np.float32)

    # Grid x covers the result's columns, grid y its rows (kernel convention).
    threadsperblock = (TPB, TPB)
    blockspergrid_x = int(math.ceil(B.shape[1] / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(A.shape[0] / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    t = 0
    for i in range(10):
        st = time()
        A_global_mem = cuda.to_device(A)
        B_global_mem = cuda.to_device(B)
        # float32 output to match the inputs; the float64 default would double
        # the size of the device->host copy that is being timed.
        C_global_mem = cuda.device_array((n, n), dtype=np.float32)

        matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)
        # The copy back to the host also waits for the kernel to finish.
        res = C_global_mem.copy_to_host()
        t += time() - st
    times_numba.append(t/10)

### 2.2.2 Реализация с использованием shared-памяти

In [None]:
@cuda.jit
def matmul_with_shared(A, B, C):
    """Tiled matrix product C = A @ B using shared-memory tiles.

    Parameters
    ----------
    A : 2-D device array, indexed as A[row, k].
    B : 2-D device array, indexed as B[k, col].
    C : 2-D device array that receives the product, indexed as C[row, col].

    The kernel must be launched with (TPB, TPB) threads per block; the grid's
    x axis covers the columns of C and the y axis covers its rows. Each block
    walks over the shared dimension in TPB-wide tiles, staging one tile of A
    and one tile of B in shared memory per step.
    """
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    # Absolute thread position: x indexes columns of C, y indexes rows.
    col, row = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    # Sizes come from the arrays, not from a module-level global: Numba
    # freezes globals at compile time, so a global size would go stale.
    inner = A.shape[1]
    tiles = (inner + TPB - 1) // TPB

    tmp = 0.
    for i in range(tiles):
        k_a = i * TPB + tx  # column of A loaded by this thread
        k_b = i * TPB + ty  # row of B loaded by this thread

        # Every slot is written on every step (zero when out of range), so a
        # partial last tile never reuses values left over from the previous one.
        # The guards test exactly the indices that are used for the load.
        if row < A.shape[0] and k_a < inner:
            sA[ty, tx] = A[row, k_a]
        else:
            sA[ty, tx] = 0.0
        if k_b < B.shape[0] and col < B.shape[1]:
            sB[ty, tx] = B[k_b, col]
        else:
            sB[ty, tx] = 0.0

        # Wait until the whole tile has been loaded.
        cuda.syncthreads()

        # sA[ty, j] holds A[row, i*TPB + j]; sB[j, tx] holds B[i*TPB + j, col].
        for j in range(TPB):
            tmp += sA[ty, j] * sB[j, tx]

        # Wait until every thread is done with the tile before overwriting it.
        cuda.syncthreads()

    if row < C.shape[0] and col < C.shape[1]:
        C[row, col] = tmp

Проверка:

In [None]:
# Small n x n problem for the correctness check.
n = 10
A = np.random.randn(n, n).astype(np.float32)
B = np.random.randn(n, n).astype(np.float32)
C = np.dot(A, B)

# The kernel maps grid x to columns of the result and grid y to its rows,
# so x is sized from B's column count and y from A's row count.
threadsperblock = (TPB, TPB)
blockspergrid_x = int(math.ceil(B.shape[1] / threadsperblock[0]))
blockspergrid_y = int(math.ceil(A.shape[0] / threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)

A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)
# Uninitialised output buffer on the device (default dtype).
C_global_mem = cuda.device_array((A.shape[0], B.shape[1]))

matmul_with_shared[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)
res = C_global_mem.copy_to_host()
print(n)
print('-'*80)
print(C)
print('-'*80)
print(res)
print('-'*80)
# Numeric comparison with the NumPy reference instead of comparing by eye.
print('matches np.dot:', np.allclose(res, C, atol=1e-4))

10
--------------------------------------------------------------------------------
[[ 4.251632   -2.2811708   3.8229737  -2.04636    -0.55115527  2.476335
   0.08954062 -1.5529318   2.9790702   0.808498  ]
 [-3.540766    0.73457766  1.0159775   1.6961205  -3.0340152  -2.6670434
   1.1406647   3.2014787  -0.31490362 -3.2506611 ]
 [ 3.2490606  -6.3680205   4.546994    0.4137888   1.8373121  -0.748631
   1.0753028   3.562995    3.4899702  -1.0950687 ]
 [ 1.8796214  -4.9007993  -3.3819342   0.48979256  0.1553577   4.2891
   1.2050543   2.6844423   7.4768367  -2.0674667 ]
 [-0.98333883  1.025317   -7.2726393  -2.5283287  -1.576848    0.70418245
   4.096247    0.18753578 -1.5071696   0.5979374 ]
 [-2.9552157   1.082218    0.905845   -1.2961289   1.9854758  -4.7101984
   1.5323998  -3.2175598  -7.4496646  -3.9771557 ]
 [ 2.900821   -0.29322642 -1.8906037   0.5075871   2.2675257   2.1781967
  -2.4110382   1.1541088   1.8727137   3.552895  ]
 [ 2.2977428  -3.9292727  -0.4959403  -1.0762008   0

Всё работает, теперь расчёт времени

In [None]:
# Warm-up launch on a tiny input so that JIT compilation of the float32
# specialisation happens before anything is timed.
_warm = cuda.to_device(np.zeros((TPB, TPB), dtype=np.float32))
matmul_with_shared[(1, 1), (TPB, TPB)](_warm, _warm, cuda.device_array((TPB, TPB), dtype=np.float32))
cuda.synchronize()

# Average wall-clock time (seconds) of 10 runs for each n = 100, 200, ..., 2000.
# Each timed run covers host->device copies, the kernel and the copy back.
times_numba_shared = []
for n in range(100, 2001, 100):
    A = np.random.randn(n, n).astype(np.float32)
    B = np.random.randn(n, n).astype(np.float32)

    # Grid x covers the result's columns, grid y its rows (kernel convention).
    threadsperblock = (TPB, TPB)
    blockspergrid_x = int(math.ceil(B.shape[1] / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(A.shape[0] / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    t = 0
    for i in range(10):
        st = time()
        A_global_mem = cuda.to_device(A)
        B_global_mem = cuda.to_device(B)
        # float32 output to match the inputs; the float64 default would double
        # the size of the device->host copy that is being timed.
        C_global_mem = cuda.device_array((n, n), dtype=np.float32)

        matmul_with_shared[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)
        # The copy back to the host also waits for the kernel to finish.
        res = C_global_mem.copy_to_host()
        t += time() - st
    times_numba_shared.append(t/10)

Выводим время

In [None]:
# Average time per run in seconds for the naive kernel, one entry per n = 100..2000.
times_numba

[0.001285243034362793,
 0.00206449031829834,
 0.003477048873901367,
 0.005562138557434082,
 0.009096956253051758,
 0.01176450252532959,
 0.01358644962310791,
 0.014847350120544434,
 0.018949294090270997,
 0.02438662052154541,
 0.03184633255004883,
 0.04041223526000977,
 0.049691176414489745,
 0.06105129718780518,
 0.07454085350036621,
 0.08881425857543945,
 0.10757997035980224,
 0.12703230381011962,
 0.14787740707397462,
 0.1707094669342041]

In [None]:
# Average time per run in seconds for the shared-memory kernel, one entry per n = 100..2000.
times_numba_shared

[0.0024385929107666017,
 0.0014611005783081055,
 0.0019822120666503906,
 0.0025400638580322264,
 0.0032777309417724608,
 0.004489827156066895,
 0.005399155616760254,
 0.007057762145996094,
 0.008211445808410645,
 0.009281086921691894,
 0.011311197280883789,
 0.012795519828796387,
 0.014497089385986327,
 0.016321587562561034,
 0.016684937477111816,
 0.019545793533325195,
 0.022096800804138183,
 0.020410966873168946,
 0.02316761016845703,
 0.022792983055114745]

In [None]:
# Save the results with pandas; the comparative analysis is in results.ipynb

In [None]:
import pandas as pd

# One row per matrix size: the size N and the mean run time of each kernel.
data = dict(
    N=list(range(100, 2001, 100)),
    numba=times_numba,
    numba_with_shared=times_numba_shared,
)

df = pd.DataFrame(data)
df

In [None]:
# Mount Google Drive at the relative path 'drive' so the results can be written to it.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [None]:
import os

# Write the timing table to Google Drive without the index column.
# The target folder is created first so the export does not fail when
# 'data' does not exist yet on the mounted drive.
csv_path = 'drive/My Drive/data/numba.csv'
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df.to_csv(csv_path, index=False)