In [2]:
!nvidia-smi


Sat Aug 11 01:08:35 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.111                Driver Version: 384.111                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8    29W / 149W |      0MiB / 11439MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [3]:
!apt-get install python3-pycuda

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  accountsservice acpid activity-log-manager adwaita-icon-theme apg aspell
  aspell-en at-spi2-core avahi-daemon avahi-utils bbswitch-dkms bind9-host
  binfmt-support bluez bluez-obexd bsdmainutils ca-certificates-java
  cheese-common clang-3.8 cracklib-runtime crda cups-pk-helper dbus
  dbus-user-session dbus-x11 dconf-cli dconf-gsettings-backend dconf-service
  default-jre default-jre-headless desktop-file-utils dictionaries-common
  distro-info-data dkms dmsetup dns-root-data dnsmasq-base dosfstools eject
  emacsen-common enchant evolution-data-server evolution-data-server-common
  fakeroot file fontconfig fonts-dejavu-extra fonts-mathjax fuse gcr gdisk
  geoclue geoclue-ubuntu-geoip geoip-database gettext-base gir1.2-atk-1.0
  gir1.2-freedesktop gir1.2-gdkpixbuf-2.0 gir1.2-glib-2.0 gir1.2-gtk-3.0
  gir1.2-ibus-1.0 gir1.2-notify-0.7 gir

In [4]:
!apt install gcc-4.8 g++-4.8

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  cpp-4.8 gcc-4.8-base libasan0 libgcc-4.8-dev libstdc++-4.8-dev
Suggested packages:
  gcc-4.8-locales g++-4.8-multilib gcc-4.8-doc libstdc++6-4.8-dbg
  gcc-4.8-multilib libgcc1-dbg libgomp1-dbg libitm1-dbg libatomic1-dbg
  libasan0-dbg libtsan0-dbg libquadmath0-dbg libstdc++-4.8-doc
The following NEW packages will be installed:
  cpp-4.8 g++-4.8 gcc-4.8 gcc-4.8-base libasan0 libgcc-4.8-dev
  libstdc++-4.8-dev
0 upgraded, 7 newly installed, 0 to remove and 0 not upgraded.
Need to get 29.3 MB of archives.
After this operation, 73.2 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu artful/universe amd64 gcc-4.8-base amd64 4.8.5-4ubuntu6 [15.2 kB]
Get:2 http://archive.ubuntu.com/ubuntu artful/universe amd64 cpp-4.8 amd64 4.8.5-4ubuntu6 [4,366 kB]
Get:3 http://archive.ubuntu.com/ubuntu artful/universe amd64 libas

In [0]:
!rm /usr/bin/gcc

In [8]:
!rm /usr/bin/g++

rm: cannot remove '/usr/bin/g++': No such file or directory


In [0]:
!ln -s /usr/bin/gcc-4.8 /usr/bin/gcc

In [0]:
!ln -s /usr/bin/g++-4.8 /usr/bin/g++

In [0]:
import numpy as np

In [0]:
from pycuda import driver, compiler, gpuarray, tools

In [0]:
import pycuda.autoinit

In [0]:
kernel_code_template = """
__global__ void MatrixMulKernel(float *a, float *b, float *c)
{
    // 2D Thread ID (assuming that only *one* block will be executed)
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Pvalue is used to store the element of the matrix
    // that is computed by the thread
    float Pvalue = 0;

    // Each thread loads one row of M and one column of N, 
    //   to produce one element of P.
    for (int k = 0; k < %(MATRIX_SIZE)s; ++k) {
        float Aelement = a[ty * %(MATRIX_SIZE)s + k];
        float Belement = b[k * %(MATRIX_SIZE)s + tx];
        Pvalue += Aelement * Belement;
    }

    // Write the matrix to device memory;
    // each thread writes one element
    c[ty * %(MATRIX_SIZE)s + tx] = Pvalue;
}
"""


In [0]:

# define the (square) matrix size
#  note that we'll only use *one* block of threads here
#  as a consequence this number (squared) can't exceed max_threads,
#  see http://documen.tician.de/pycuda/util.html#pycuda.tools.DeviceData
#  for more information on how to get this number for your device
MATRIX_SIZE = 2


In [0]:
# create two random square matrices
a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)

In [0]:
# compute reference on the CPU to verify GPU computation
c_cpu = np.dot(a_cpu, b_cpu)


In [0]:
# transfer host (CPU) memory to device (GPU) memory 
a_gpu = gpuarray.to_gpu(a_cpu) 
b_gpu = gpuarray.to_gpu(b_cpu)

In [0]:
# create empty gpu array for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)


In [0]:
# get the kernel code from the template 
# by specifying the constant MATRIX_SIZE
kernel_code = kernel_code_template % {
    'MATRIX_SIZE': MATRIX_SIZE 
    }

In [0]:
# compile the kernel code 
mod = compiler.SourceModule(kernel_code)

In [0]:
# get the kernel function from the compiled module
matrixmul = mod.get_function("MatrixMulKernel")

In [0]:
# call the kernel on the card
matrixmul(
    # inputs
    a_gpu, b_gpu, 
    # output
    c_gpu, 
    # (only one) block of MATRIX_SIZE x MATRIX_SIZE threads
    block = (MATRIX_SIZE, MATRIX_SIZE, 1),
    )


In [26]:
# print the results
print( "-" * 80)
print("Matrix A (GPU):")
print(a_gpu.get())


--------------------------------------------------------------------------------
Matrix A (GPU):
[[-0.679303   -2.4584105 ]
 [ 0.6384657  -0.41555864]]


In [28]:
print("-" * 80)
print("Matrix B (GPU):")
print(b_gpu.get())

--------------------------------------------------------------------------------
Matrix B (GPU):
[[ 1.9355801   0.03724427]
 [-0.33160982 -0.14067584]]


In [29]:
print("-" * 80)
print("Matrix C (GPU):")
print(c_gpu.get())

--------------------------------------------------------------------------------
Matrix C (GPU):
[[-0.49961227  0.32053882]
 [ 1.3736049   0.08223825]]


In [30]:
print("-" * 80)
print("CPU-GPU difference:")
print(c_cpu - c_gpu.get())

--------------------------------------------------------------------------------
CPU-GPU difference:
[[0. 0.]
 [0. 0.]]


In [31]:
np.allclose(c_cpu, c_gpu.get())

True