In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
!nvidia-smi

Wed May 28 13:09:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
%%writefile matrix_multiply_2d.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>

#define TILE_WIDTH 16 // Rozmiar kafelka (tile size)

/*
 * Aborts the program when a CUDA runtime call did not succeed.
 *
 * err: status code returned by a CUDA runtime call.
 * msg: short label of the operation that produced `err`; it is embedded in
 *      the diagnostic so the failing call can be identified.
 *
 * On cudaSuccess the function returns without doing anything. On any other
 * status it prints the label together with the runtime's description of the
 * error to stdout and terminates the process with exit code -1.
 */
void checkCudaError(cudaError_t err, const char *msg) {
    // Fast path: nothing to report.
    if (err == cudaSuccess) {
        return;
    }

    const char *reason = cudaGetErrorString(err);
    printf("Błąd CUDA w %s: %s\n", msg, reason);
    exit(-1);
}

/*
 * Tiled square matrix product Pd = Md * Nd using shared memory.
 *
 * Md, Nd: device pointers to the row-major width x width input matrices.
 * Pd:     device pointer to the row-major width x width output matrix.
 * width:  number of rows/columns of each matrix.
 *
 * Launch requirements:
 *   - blockDim must be (TILE_WIDTH, TILE_WIDTH): the shared tiles are indexed
 *     directly by threadIdx and the global row/column are derived from
 *     TILE_WIDTH rather than blockDim.
 *   - The grid must cover the output, i.e. at least ceil(width / TILE_WIDTH)
 *     blocks in both x and y. `width` need not be a multiple of TILE_WIDTH;
 *     out-of-range tile entries are zero-filled and out-of-range threads do
 *     not write.
 *   - Uses 2 * TILE_WIDTH * TILE_WIDTH floats of static shared memory.
 *
 * Flat element offsets are computed in size_t so that row * width + col does
 * not wrap around 32-bit arithmetic for large matrices (width > 46340).
 */
__global__ void matrix_multiply_shared(float *Md, float *Nd, float *Pd, int width) {
    __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];

    // Output element owned by this thread.
    const int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    const int col = blockIdx.x * TILE_WIDTH + threadIdx.x;

    // 64-bit row stride: every flat index below is formed in size_t.
    const size_t stride = (size_t)width;
    const int numTiles = (width + TILE_WIDTH - 1) / TILE_WIDTH;

    float pValue = 0.0f;

    for (int k = 0; k < numTiles; k++) {
        // Column of Md / row of Nd that this thread stages for tile k.
        const int mCol = k * TILE_WIDTH + threadIdx.x;
        const int nRow = k * TILE_WIDTH + threadIdx.y;

        // Zero-fill entries that fall outside the matrix so the inner
        // product below can always run over a full tile.
        if (row < width && mCol < width) {
            Mds[threadIdx.y][threadIdx.x] = Md[(size_t)row * stride + (size_t)mCol];
        } else {
            Mds[threadIdx.y][threadIdx.x] = 0.0f;
        }

        if (col < width && nRow < width) {
            Nds[threadIdx.y][threadIdx.x] = Nd[(size_t)nRow * stride + (size_t)col];
        } else {
            Nds[threadIdx.y][threadIdx.x] = 0.0f;
        }

        // Both tiles must be completely written before any thread reads them.
        __syncthreads();

        for (int i = 0; i < TILE_WIDTH; i++) {
            pValue += Mds[threadIdx.y][i] * Nds[i][threadIdx.x];
        }

        // Nobody may overwrite the tiles for the next k while others still read.
        __syncthreads();
    }

    // Threads past the matrix edge took part in loading/barriers but own no output.
    if (row < width && col < width) {
        Pd[(size_t)row * stride + (size_t)col] = pValue;
    }
}

/*
 * Prints an N x N row-major matrix to stdout.
 *
 * matrix: pointer to N * N floats stored row after row.
 * N:      number of rows and columns.
 *
 * Each element is written with the "%8.2f " format and every row is
 * terminated by a newline. Nothing is printed when N is not positive.
 */
void printMatrix(float *matrix, int N) {
    for (int r = 0; r < N; ++r) {
        // Start of row r inside the flat buffer.
        const float *rowPtr = matrix + r * N;
        for (int c = 0; c < N; ++c) {
            printf("%8.2f ", rowPtr[c]);
        }
        printf("\n");
    }
}

/*
 * Reads `count` floats from stdin into `dst`.
 * Returns 1 when every element was parsed, 0 on the first parse failure/EOF.
 */
static int readMatrix(float *dst, size_t count) {
    for (size_t i = 0; i < count; i++) {
        if (scanf("%f", &dst[i]) != 1) {
            return 0;
        }
    }
    return 1;
}

/*
 * Interactive driver for the tiled matrix multiplication kernel.
 *
 * Flow: read the matrix dimension N, fill M and N either from stdin or with
 * the sequence 1..N*N, optionally print the inputs, multiply on the GPU with
 * matrix_multiply_shared, optionally print the result, release all memory.
 *
 * Returns 0 on success and 1 on invalid input or host allocation failure.
 * CUDA failures terminate the process inside checkCudaError.
 */
int main() {
    int N;
    char input_choice, display_choice;

    printf("Podaj rozmiar macierzy (N): ");
    // Also reject unparsable input: N would otherwise be read uninitialized.
    if (scanf("%d", &N) != 1 || N <= 0) {
        printf("Rozmiar macierzy musi być dodatni!\n");
        return 1;
    }

    printf("Czy chcesz ręcznie wprowadzić macierze M i N? (t/n): ");
    if (scanf(" %c", &input_choice) != 1) {
        // No answer available (EOF): fall back to generated data.
        input_choice = 'n';
    }

    // Element and byte counts are kept in size_t: N * N * sizeof(float)
    // no longer fits in an int from N = 23171 upwards.
    const size_t count = (size_t)N * (size_t)N;
    const size_t size = count * sizeof(float);

    float *h_M = (float *)malloc(size);
    float *h_N = (float *)malloc(size);
    float *h_P = (float *)malloc(size);
    if (!h_M || !h_N || !h_P) {
        printf("Błąd alokacji pamięci na hoście!\n");
        // free(NULL) is a no-op, so this releases whichever buffers succeeded.
        free(h_M);
        free(h_N);
        free(h_P);
        return 1;
    }

    for (size_t i = 0; i < count; i++) {
        h_P[i] = 0.0f;
    }

    if (input_choice == 't' || input_choice == 'T') {
        printf("Wprowadź %zu elementów macierzy M (wierszami):\n", count);
        int ok = readMatrix(h_M, count);
        if (ok) {
            printf("Wprowadź %zu elementów macierzy N (wierszami):\n", count);
            ok = readMatrix(h_N, count);
        }
        if (!ok) {
            // Do not continue with partially initialized matrices.
            printf("Błąd odczytu elementów macierzy!\n");
            free(h_M);
            free(h_N);
            free(h_P);
            return 1;
        }
    } else {
        printf("Generowanie macierzy M i N z wartościami 1 do %zu\n", count);
        for (size_t i = 0; i < count; i++) {
            h_M[i] = (float)(i + 1);
            h_N[i] = (float)(i + 1);
        }
    }

    printf("Czy wyświetlić macierze wejściowe (M, N) i wyjściową (P)? (t/n): ");
    if (scanf(" %c", &display_choice) != 1) {
        // No answer available (EOF): do not print the matrices.
        display_choice = 'n';
    }
    const int show = (display_choice == 't' || display_choice == 'T');

    if (show) {
        printf("\nMacierz M (wejściowa):\n");
        printMatrix(h_M, N);
        printf("\nMacierz N (wejściowa):\n");
        printMatrix(h_N, N);
    }

    float *d_M, *d_N, *d_P;
    checkCudaError(cudaMalloc(&d_M, size), "cudaMalloc d_M");
    checkCudaError(cudaMalloc(&d_N, size), "cudaMalloc d_N");
    checkCudaError(cudaMalloc(&d_P, size), "cudaMalloc d_P");

    checkCudaError(cudaMemcpy(d_M, h_M, size, cudaMemcpyHostToDevice), "cudaMemcpy d_M");
    checkCudaError(cudaMemcpy(d_N, h_N, size, cudaMemcpyHostToDevice), "cudaMemcpy d_N");

    // One TILE_WIDTH x TILE_WIDTH block per output tile; ceil-div so that the
    // grid also covers a partial tile at the right/bottom edge.
    dim3 threadsPerBlock(TILE_WIDTH, TILE_WIDTH);
    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // dim3 members are unsigned int, hence %u.
    printf("\nLiczba bloków w siatce: (%u, %u)\n", blocksPerGrid.x, blocksPerGrid.y);
    printf("Liczba wątków w bloku: (%u, %u)\n", threadsPerBlock.x, threadsPerBlock.y);

    matrix_multiply_shared<<<blocksPerGrid, threadsPerBlock>>>(d_M, d_N, d_P, N);
    // Launch-configuration errors are reported here ...
    checkCudaError(cudaGetLastError(), "uruchomienie kernela");
    // ... while faults during execution surface at the synchronization point.
    checkCudaError(cudaDeviceSynchronize(), "synchronizacja urządzenia");

    checkCudaError(cudaMemcpy(h_P, d_P, size, cudaMemcpyDeviceToHost), "cudaMemcpy d_P");

    if (show) {
        printf("\nMacierz P (wyjściowa, M * N):\n");
        printMatrix(h_P, N);
    }

    checkCudaError(cudaFree(d_M), "cudaFree d_M");
    checkCudaError(cudaFree(d_N), "cudaFree d_N");
    checkCudaError(cudaFree(d_P), "cudaFree d_P");

    free(h_M);
    free(h_N);
    free(h_P);

    return 0;
}

Writing matrix_multiply_2d.cu


In [4]:
!nvcc -arch=sm_75 matrix_multiply_2d.cu -o matrix_multiply_2d

In [5]:
!./matrix_multiply_2d

Podaj rozmiar macierzy (N): 2
Czy chcesz ręcznie wprowadzić macierze M i N? (t/n): n
Generowanie macierzy M i N z wartościami 1 do 4
Czy wyświetlić macierze wejściowe (M, N) i wyjściową (P)? (t/n): t

Macierz M (wejściowa):
    1.00     2.00 
    3.00     4.00 

Macierz N (wejściowa):
    1.00     2.00 
    3.00     4.00 

Liczba bloków w siatce: (1, 1)
Liczba wątków w bloku: (16, 16)

Macierz P (wyjściowa, M * N):
    7.00    10.00 
   15.00    22.00 
