In [1]:
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.cublas as cublas

# Print floats with three decimals so the matrices below line up in columns.
np.set_printoptions(formatter={'float': '{: .3f}'.format})

# Problem size: A is M x K and B is K x N, so the product A.B is M x N.
M, N, K = 4, 4, 2

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same operands.
SEED = 0
rng = np.random.default_rng(SEED)

# Both operands are float64 and stored column-major (order='F'): that is the
# layout implied by the leading dimensions handed to cublasDgemm further down.
# To try hand-written matrices instead, replace the random draws with literals
# of shape (M, K) and (K, N), keeping dtype=np.float64 and order='F'.
A = np.array(rng.random((M, K)), dtype=np.float64, order='F')
B = np.array(rng.random((K, N)), dtype=np.float64, order='F')

# Copy the host operands to device memory.
A_gpu = gpuarray.to_gpu(A)
B_gpu = gpuarray.to_gpu(B)

# Derive the GEMM dimensions from the uploaded arrays and make sure the inner
# dimensions agree before handing raw pointers to cuBLAS.
m, k = A_gpu.shape
k_b, n = B_gpu.shape
if k != k_b:
    raise ValueError(
        f'inner dimensions differ: A is {A_gpu.shape}, B is {B_gpu.shape}'
    )

print(f'{m=}, {n=}, {k=}')

# cuBLAS writes its result in column-major order, so allocate C as a
# Fortran-ordered array up front instead of reinterpreting a C-ordered buffer
# afterwards.
C_gpu = gpuarray.empty((m, n), np.float64, order='F')

# C = alpha * A.B + beta * C; beta = 0 means the uninitialised contents of C
# do not contribute to the result.
alpha = np.float64(1.0)
beta = np.float64(0.0)

cublas_handle = cublas.cublasCreate()
try:
    # 'n', 'n': neither operand is transposed. The leading dimensions are the
    # row counts of the column-major matrices: m for A and C, k for B.
    cublas.cublasDgemm(cublas_handle, 'n', 'n', m, n, k, alpha, A_gpu.gpudata, m, B_gpu.gpudata, k, beta, C_gpu.gpudata, m)
finally:
    # Release the cuBLAS context even if the GEMM call raises.
    cublas.cublasDestroy(cublas_handle)

# Reference product computed on the host with NumPy.
C = A.dot(B)

# Frobenius norm of the difference between the downloaded GPU result and the
# host reference.
err = np.linalg.norm(C_gpu.get() - C)

# Report operands, both results and the error, one labelled item per line group.
print(
    f'A\n{A}',
    f'B\n{B}',
    f'C\n{C}',
    f'C_gpu\n{C_gpu}',
    f'{err=}',
    sep='\n',
)

m=4, n=4, k=2
A
[[ 0.673  0.136]
 [ 0.936  0.785]
 [ 0.350  0.665]
 [ 0.637  0.591]]
B
[[ 0.756  0.755  0.969  0.574]
 [ 0.098  0.719  0.011  0.563]]
C
[[ 0.522  0.606  0.654  0.462]
 [ 0.785  1.271  0.916  0.979]
 [ 0.330  0.742  0.347  0.575]
 [ 0.540  0.906  0.624  0.698]]
C_gpu
[[ 0.522  0.606  0.654  0.462]
 [ 0.785  1.271  0.916  0.979]
 [ 0.330  0.742  0.347  0.575]
 [ 0.540  0.906  0.624  0.698]]
err=0.0
