In [1]:
 !nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [3]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpdfx99sgz".


In [10]:
%%cuda
#include <stdio.h>

// Kernel: each launched thread prints the x-index of its block and its own
// x-index within that block. Intended for 1D launches; y/z indices are not shown.
__global__ void hello() {
    const unsigned int block = blockIdx.x;
    const unsigned int thread = threadIdx.x;
    printf("Hello from block: %u, thread: %u\n", block, thread);
}

// Launches the hello kernel on 2 blocks of 2 threads and waits for it to finish.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    hello<<<2, 2>>>();

    // A kernel launch returns nothing; configuration/launch failures are only
    // visible through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        // Errors raised while the kernel runs are reported by the next
        // synchronizing call, so its return value must not be discarded.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}





In [16]:
%%writefile matrix_add.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/**
 * Aborts the program if a CUDA runtime call failed.
 *
 * @param err  Status returned by a CUDA runtime call.
 * @param msg  Short label identifying the call site; included in the report.
 *
 * On cudaSuccess this returns normally. Otherwise it prints the label and the
 * runtime's description of the error, then terminates the process.
 */
void checkCudaError(cudaError_t err, const char *msg) {
    if (err == cudaSuccess) {
        return;
    }
    // Diagnostics go to stderr so they are not mixed into (or buffered behind)
    // the program's regular output on stdout.
    fprintf(stderr, "CUDA Error at %s: %s\n", msg, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}

/**
 * Element-wise sum of two N x N matrices stored as flat row-major arrays:
 * C[r * N + c] = A[r * N + c] + B[r * N + c].
 *
 * Launch layout: 2D grid of 2D blocks that covers at least N threads along
 * both x and y (gridDim.x * blockDim.x >= N and gridDim.y * blockDim.y >= N).
 * Threads that fall outside the matrix do nothing. No shared memory is used.
 *
 * @param A  Device-accessible input matrix (N * N floats), read only.
 * @param B  Device-accessible input matrix (N * N floats), read only.
 * @param C  Device-accessible output matrix (N * N floats).
 * @param N  Number of rows and of columns.
 */
__global__ void matrixAdd(const float *A, const float *B, float *C, int N) {
    // threadIdx.x is the fastest-varying component inside a block, so it is
    // mapped to the column: neighbouring threads then touch neighbouring
    // addresses of a row instead of addresses N elements apart.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (row < N && col < N) {
        // Widen before multiplying so row * N cannot overflow int for large N.
        size_t index = (size_t)row * N + col;
        C[index] = A[index] + B[index];
    }
}

/**
 * Prints an N x N matrix stored as a flat row-major array to stdout,
 * one matrix row per line, each value formatted as "%6.2f" followed by a space.
 *
 * @param matrix  Host pointer to N * N floats.
 * @param N       Number of rows and of columns; nothing is printed when N <= 0.
 */
void printMatrix(float *matrix, int N) {
    const float *row = matrix;  // start of the row currently being printed
    for (int r = 0; r < N; ++r, row += N) {
        for (int c = 0; c < N; ++c) {
            printf("%6.2f ", row[c]);
        }
        printf("\n");
    }
}

/**
 * Demo driver: fills two N x N host matrices with sample data, adds them on
 * the GPU with matrixAdd, and prints the operands, the launch configuration
 * and the result.
 *
 * Returns 0 on success. Host allocation failure returns EXIT_FAILURE; any CUDA
 * failure terminates the process inside checkCudaError.
 */
int main() {
    const int N = 4;  // matrix dimension (N x N)

    // Element and byte counts are kept in size_t: N * N * sizeof(float) does
    // not fit in int once N grows, and the allocation/copy APIs take size_t.
    const size_t count = (size_t)N * N;
    const size_t size = count * sizeof(float);

    // Host (CPU) matrices.
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(h_A);  // free(NULL) is a no-op, so this is safe for any subset
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Initialise the operands with sample data.
    for (size_t i = 0; i < count; i++) {
        h_A[i] = (float)(i + 1);        // 1 .. N*N
        h_B[i] = (float)((i + 1) * 2);  // 2 .. 2*N*N
    }

    printf("Macierz A:\n");
    printMatrix(h_A, N);

    printf("\nMacierz B:\n");
    printMatrix(h_B, N);

    // Device (GPU) matrices.
    float *d_A, *d_B, *d_C;
    checkCudaError(cudaMalloc(&d_A, size), "cudaMalloc d_A");
    checkCudaError(cudaMalloc(&d_B, size), "cudaMalloc d_B");
    checkCudaError(cudaMalloc(&d_C, size), "cudaMalloc d_C");

    // Copy the operands from host to device.
    checkCudaError(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy d_A");
    checkCudaError(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "cudaMemcpy d_B");

    // Grid/block configuration: 16x16 threads per block, and enough blocks in
    // each dimension (ceiling division) to cover all N rows and N columns.
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // dim3 members are unsigned int, so they are printed with %u.
    printf("\nBlocks per grid: (%u, %u)\n", blocksPerGrid.x, blocksPerGrid.y);
    printf("Threads per block: (%u, %u)\n", threadsPerBlock.x, threadsPerBlock.y);

    // Launch the kernel; launch-time errors are only visible via cudaGetLastError().
    matrixAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    checkCudaError(cudaGetLastError(), "kernel launch");

    // Wait for completion so that errors raised during execution are reported here.
    checkCudaError(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Copy the result from device to host.
    checkCudaError(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy d_C");

    printf("\nMacierz C (wynik A + B):\n");
    printMatrix(h_C, N);

    // Release device and host memory.
    checkCudaError(cudaFree(d_A), "cudaFree d_A");
    checkCudaError(cudaFree(d_B), "cudaFree d_B");
    checkCudaError(cudaFree(d_C), "cudaFree d_C");
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

Overwriting matrix_add.cu


In [17]:
!nvcc -arch=sm_75 matrix_add.cu -o matrix_add

In [18]:
!./matrix_add

Macierz A:
  1.00   2.00   3.00   4.00 
  5.00   6.00   7.00   8.00 
  9.00  10.00  11.00  12.00 
 13.00  14.00  15.00  16.00 

Macierz B:
  2.00   4.00   6.00   8.00 
 10.00  12.00  14.00  16.00 
 18.00  20.00  22.00  24.00 
 26.00  28.00  30.00  32.00 

Blocks per grid: (1, 1)
Threads per block: (16, 16)

Macierz C (wynik A + B):
  3.00   6.00   9.00  12.00 
 15.00  18.00  21.00  24.00 
 27.00  30.00  33.00  36.00 
 39.00  42.00  45.00  48.00 
