In [30]:
%%writefile rotate.cu

#include <cuda_runtime.h>
#include <cstdio>
#include <cmath>
#include <vector>

#define CHECK_CUDA(call)                                                       \
    do {                                                                       \
        cudaError_t err__ = (call);                                            \
        if (err__ != cudaSuccess) {                                            \
            std::fprintf(stderr, "CUDA error %s (%d) at %s:%d\n",              \
                         cudaGetErrorString(err__), err__, __FILE__, __LINE__);\
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)

// Rotate an image (grayscale) around the top-left origin by `angle_rad`.
// Nearest-neighbor sampling for simplicity.
__global__ void rotate_origin(const unsigned char* src, unsigned char* dst,
                              int width, int height, float angle_rad) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    // Inverse rotation to fetch from the source (avoids holes in dst).
    float c = cosf(angle_rad);
    float s = sinf(angle_rad);
    float src_x =  c * x + s * y;
    float src_y = -s * x + c * y;

    unsigned char value = 0;
    int ix = static_cast<int>(roundf(src_x));
    int iy = static_cast<int>(roundf(src_y));
    if (ix >= 0 && ix < width && iy >= 0 && iy < height) {
        value = src[iy * width + ix];
    }
    dst[y * width + x] = value;
}

int main() {
    const int width = 4;
    const int height = 4;
    const float angle_deg = 30.0f;
    const float angle_rad = angle_deg * 3.1415926535f / 180.0f;

    // Host input: 4x4 matrix with values 0..15 for easy visual check.
    std::vector<unsigned char> h_src(width * height);
    for (int i = 0; i < width * height; ++i) h_src[i] = static_cast<unsigned char>(i);
    std::vector<unsigned char> h_dst(width * height, 0);

    unsigned char *d_src = nullptr, *d_dst = nullptr;
    CHECK_CUDA(cudaMalloc(&d_src, h_src.size()));
    CHECK_CUDA(cudaMalloc(&d_dst, h_dst.size()));
    CHECK_CUDA(cudaMemcpy(d_src, h_src.data(), h_src.size(), cudaMemcpyHostToDevice));

    dim3 block(16, 16);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    rotate_origin<<<grid, block>>>(d_src, d_dst, width, height, angle_rad);
    CHECK_CUDA(cudaDeviceSynchronize());

    CHECK_CUDA(cudaMemcpy(h_dst.data(), d_dst, h_dst.size(), cudaMemcpyDeviceToHost));

    std::puts("Source 4x4 image (row-major):");
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) std::printf("%3d ", h_src[y * width + x]);
        std::printf("\n");
    }
    std::puts("\nRotated image (around top-left origin):");
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) std::printf("%3d ", h_dst[y * width + x]);
        std::printf("\n");
    }

    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}


Writing rotate.cu


In [31]:
!nvcc rotate.cu -o rotate -gencode arch=compute_75,code=sm_75
!./rotate

Source 4x4 image (row-major):
  0   1   2   3 
  4   5   6   7 
  8   9  10  11 
 12  13  14  15 

Rotated image (around top-left origin):
  0   0   0   0 
  5   1   2   0 
  9   6   7   0 
 14  10  11   0 


In [7]:
!nvcc -O2 conv2d.cu -o conv2d
!./conv2d

CUDA error conv2d.cu:43: the provided PTX was compiled with an unsupported toolchain.


In [None]:
from google.colab import files
from PIL import Image
import subprocess

up = files.upload()

KeyboardInterrupt: 