In [5]:
%%writefile gelu.cu

Overwriting gelu.cu


In [20]:
%%writefile cuda_gelu.cu
#include <cuda_runtime.h>
#include <iostream>
#include <cmath>

#define CUDA_CHECK(call) do { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, \
                cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

__global__ void gelu_kernel(float* data, int size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        if (i < 10) printf("Before GELU: data[%d] = %f\n", i, data[i]);
        data[i] = 0.5f * data[i] * (1.0f + erff(data[i] / sqrtf(2.0f)));
        if (i < 10) printf("After GELU: data[%d] = %f\n", i, data[i]);
    }
}

int main() {
    const int N = 1000000;
    float* A = new float[N];
    for (int i = 0; i < N; i++) A[i] = -1.0f * (float)i / 2.0f;

    std::cout << "Before GELU (Host):\n";
    for (int i = 0; i < 10; ++i) std::cout << "A[" << i << "]: " << A[i] << std::endl;

    float* d_A;
    CUDA_CHECK(cudaMalloc(&d_A, N * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_A, A, N * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    dim3 threadsPerBlock(256);
    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x);
    gelu_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);

    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "\nCUDA kernel time: " << milliseconds / 1000.0 << " seconds" << std::endl;

    CUDA_CHECK(cudaMemcpy(A, d_A, N * sizeof(float), cudaMemcpyHostToDevice));

    std::cout << "\nAfter GELU (Host):\n";
    for (int i = 0; i < 10; ++i) std::cout << "A[" << i << "]: " << A[i] << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_A);
    delete[] A;
    return 0;
}

Writing cuda_gelu.cu


In [21]:
!nvcc cuda_gelu.cu -o cuda_gelu
!./cuda_gelu

Before GELU (Host):
A[0]: -0
A[1]: -0.5
A[2]: -1
A[3]: -1.5
A[4]: -2
A[5]: -2.5
A[6]: -3
A[7]: -3.5
A[8]: -4
A[9]: -4.5
CUDA error in cuda_gelu.cu:44: the provided PTX was compiled with an unsupported toolchain.
