<a href="https://colab.research.google.com/github/ickma2311/mycolab/blob/main/cuda/matrix_transpose.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install cupy-cuda12x




A detailed explanation is in my notes:
https://docs.google.com/document/d/154J9Xi_Noz3-VK02js3-dZdfQzCae2S3vLR734kb0Fo/edit?tab=t.0


In [None]:
# CUDA C source for an out-of-place transpose of a row-major float matrix.
# It is kept as a Python string so it can be handed to cupy.RawModule.
kernel_code = """
extern "C" __global__
void matrix_transpose(const float *a, float *b, int rows, int cols) {
    // x indexes the column of `a`, y indexes the row of `a`.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // a.shape = (rows, cols)
    // b.shape = (cols, rows)
    // The launch grid is sized in whole blocks and may overshoot the
    // matrix, so threads outside the bounds must not touch memory.
    if (col < cols && row < rows) {
        // Widen before multiplying: row * cols can exceed INT_MAX for
        // large matrices even when rows and cols each fit in an int.
        const long long src = (long long)row * cols + col;
        const long long dst = (long long)col * rows + row;
        b[dst] = a[src];
    }
}
"""

In [None]:
import cupy as cp
import numpy as np

In [None]:
# Matrix dimensions: `height` rows by `width` columns.
width = 1024
height = 126

# Seed the generator so every Restart & Run All produces the same input.
SEED = 0
rng = np.random.default_rng(SEED)

# Draw a (height, width) float32 matrix and flatten it to the 1-D row-major
# buffer that the kernel indexes into.
a = rng.standard_normal((height, width), dtype=np.float32).flatten()
a.shape

(129024,)

In [None]:
# Device copy of the flattened input, as float32.
m_a = cp.asarray(a, dtype=cp.float32)
# Zero-filled device buffer with the same shape and dtype, used as the output.
m_b = cp.zeros(m_a.shape, dtype=m_a.dtype)

In [None]:
import math

# Launch geometry for the 2-D grid: x covers the columns (width) and
# y covers the rows (height).
threads_per_block = (8, 8)
# Ceil-divide each extent by the block size so that a partial tile at the
# edge still gets a block.
blocks_per_grid = tuple(
    math.ceil(extent / block_dim)
    for extent, block_dim in zip((width, height), threads_per_block)
)

In [None]:

# Threads in one block, then total number of blocks in the grid (one per line).
print(
    threads_per_block[0] * threads_per_block[1],
    blocks_per_grid[0] * blocks_per_grid[1],
    sep="\n",
)

64
2048


In [None]:
# Build a RawModule from the CUDA source in `kernel_code` and retrieve the
# kernel by its extern "C" name.
transpose_module = cp.RawModule(code=kernel_code)
# NOTE: despite its name, `vector_add_kernel` is the matrix_transpose kernel;
# the name is kept because the launch cell refers to it.
vector_add_kernel = transpose_module.get_function('matrix_transpose')

In [None]:
# Launch the transpose over the 2-D grid, then block the host until the
# device has finished so the result can be read back.
vector_add_kernel(
    blocks_per_grid,    # grid dimensions
    threads_per_block,  # block dimensions
    (
        m_a,               # a: input buffer
        m_b,               # b: output buffer
        np.int32(height),  # rows
        np.int32(width),   # cols
    ),
)
cp.cuda.runtime.deviceSynchronize()

In [None]:
# Copy the device output buffer back to a host NumPy array.
r = cp.asnumpy(m_b)

In [None]:
# Shape of the input when the flat buffer is viewed as (height, width).
np.reshape(a, (height, width)).shape

(126, 1024)

In [None]:
# Shape of the result when the flat buffer is viewed as (width, height).
np.reshape(r, (width, height)).shape

(1024, 126)

In [None]:
# Host reference: view the flat input as (height, width) and transpose it with NumPy.
left = np.transpose(np.reshape(a, (height, width)))
# GPU output: view the flat result buffer as (width, height).
right = np.reshape(r, (width, height))
# Last expression, so the notebook displays whether the two agree.
np.allclose(left, right, atol=1e-5)

True

In [None]:

# Host-side reference: the NumPy transpose of the input matrix.
left

array([[-2.0288012 ,  0.02668415,  0.3603327 , ...,  0.6248638 ,
        -2.0008454 ,  0.80592144],
       [-2.1025772 ,  1.179263  ,  1.39218   , ...,  0.17204675,
        -0.43237507,  0.62403274],
       [ 0.6338934 , -0.76591396, -0.40607145, ..., -1.0227876 ,
        -2.1723635 , -0.84497386],
       ...,
       [ 0.9127384 ,  1.366531  , -0.57190686, ..., -0.3158428 ,
         0.58616745,  1.8970106 ],
       [-0.69067603,  0.13682574,  0.7258632 , ...,  1.2445982 ,
        -1.7863773 , -0.3067995 ],
       [ 1.6215827 ,  0.74642277, -0.4526856 , ..., -1.9587345 ,
        -0.75264406,  0.39908883]], dtype=float32)

In [None]:
# GPU result copied back to the host and viewed as (width, height).
right

array([[-2.0288012 ,  0.02668415,  0.3603327 , ...,  0.6248638 ,
        -2.0008454 ,  0.80592144],
       [-2.1025772 ,  1.179263  ,  1.39218   , ...,  0.17204675,
        -0.43237507,  0.62403274],
       [ 0.6338934 , -0.76591396, -0.40607145, ..., -1.0227876 ,
        -2.1723635 , -0.84497386],
       ...,
       [ 0.9127384 ,  1.366531  , -0.57190686, ..., -0.3158428 ,
         0.58616745,  1.8970106 ],
       [-0.69067603,  0.13682574,  0.7258632 , ...,  1.2445982 ,
        -1.7863773 , -0.3067995 ],
       [ 1.6215827 ,  0.74642277, -0.4526856 , ..., -1.9587345 ,
        -0.75264406,  0.39908883]], dtype=float32)