In [7]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [8]:
!pip install nvcc4jupyter



In [9]:
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [10]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [11]:
%%writefile cuda_filter.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <time.h>

#define RADIUS 1
#define BLOCK_SIZE 256

/*
 * Aborts the program when a CUDA runtime call did not succeed.
 *
 * err - status returned by the CUDA call being checked.
 * msg - short label for the call site; it is echoed in the diagnostic.
 *
 * On success the function simply returns. On any other status it prints the
 * label together with the runtime's textual description of the error to
 * stderr and terminates the process via exit(-1).
 */
void checkCudaError(cudaError_t err, const char *msg) {
    // Fast path: nothing to report.
    if (err == cudaSuccess) {
        return;
    }

    const char *description = cudaGetErrorString(err);
    fprintf(stderr, "CUDA Error at %s: %s\n", msg, description);
    exit(-1);
}

/*
 * Averaging filter over a 1D vector: output[g] is the mean of the elements
 * input[g - RADIUS .. g + RADIUS] that fall inside [0, N).
 *
 * input  - device pointer to N source values (read only by this kernel).
 * output - device pointer to N result values.
 * N      - number of elements; nothing is written when N <= 0.
 *
 * Launch requirements:
 *   - 1D grid of 1D blocks with blockDim.x == BLOCK_SIZE (the shared tile and
 *     the right-halo offset are both sized with BLOCK_SIZE).
 *   - RADIUS <= BLOCK_SIZE, because the first RADIUS threads of each block
 *     load the halo cells.
 *   - Uses (BLOCK_SIZE + 2 * RADIUS) floats of static shared memory per block.
 */
__global__ void applyFilter(float *input, float *output, int N) {
    // Block tile: RADIUS halo cells, BLOCK_SIZE body cells, RADIUS halo cells.
    __shared__ float temp[BLOCK_SIZE + 2 * RADIUS];

    // N is the same for every thread, so the whole block leaves together and
    // the barrier below is never reached by only part of a block. This also
    // guarantees N - 1 >= 0 for the index clamping further down.
    if (N <= 0) {
        return;
    }

    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Body cell: out-of-range threads store a neutral value so the tile is
    // fully written before the barrier.
    temp[lindex] = (gindex < N) ? input[gindex] : 0.0f;

    // Halo cells. These are loaded even when this thread's own element is out
    // of range: an in-range neighbour thread in the same block may still need
    // the halo cell this thread is responsible for. Indices are clamped into
    // [0, N - 1] so every read stays in bounds; clamped duplicates are never
    // used in the sum because the averaging loop skips out-of-range neighbours.
    if (threadIdx.x < RADIUS) {
        int left = gindex - RADIUS;
        if (left < 0) {
            left = 0;
        } else if (left > N - 1) {
            left = N - 1;
        }

        int right = gindex + BLOCK_SIZE;
        if (right > N - 1) {
            right = N - 1;
        }

        temp[lindex - RADIUS] = input[left];
        temp[lindex + BLOCK_SIZE] = input[right];
    }

    // All tile writes must be visible before any thread reads its neighbours.
    __syncthreads();

    if (gindex < N) {
        float sum = 0.0f;
        int count = 0;
        for (int i = -RADIUS; i <= RADIUS; i++) {
            int g = gindex + i;
            // Only neighbours that exist in the vector contribute, so edge
            // elements are averaged over fewer samples.
            if (g >= 0 && g < N) {
                sum += temp[lindex + i];
                count++;
            }
        }
        // count >= 1 here: the i == 0 term is always in range.
        output[gindex] = sum / count;
    }
}

/*
 * Writes the first N elements of a host vector to stdout on a single line.
 *
 * vector - host array holding at least N floats.
 * N      - number of elements to print; when N <= 0 only the newline is
 *          emitted.
 *
 * Each value is printed with width 6 and two decimals, followed by a space.
 */
void printVector(float *vector, int N) {
    int idx = 0;
    while (idx < N) {
        printf("%6.2f ", vector[idx]);
        ++idx;
    }
    // Terminate the row.
    fputs("\n", stdout);
}

// Drops the rest of the current stdin line so a token that scanf could not
// parse is not presented to the next scanf call again.
static void discardInputLine(void) {
    int ch;
    do {
        ch = getchar();
    } while (ch != '\n' && ch != EOF);
}

// Prompts until a strictly positive vector size has been read into *outSize.
// Returns false if stdin ends (or fails) before a valid size is entered.
static bool readVectorSize(int *outSize) {
    printf("Podaj rozmiar wektora: ");
    for (;;) {
        int rc = scanf("%d", outSize);
        if (rc == EOF) {
            return false;
        }
        if (rc == 1 && *outSize > 0) {
            return true;
        }
        if (rc != 1) {
            // Not a number: skip the offending line before asking again.
            discardInputLine();
        }
        printf("Rozmiar musi być większy od 0. Podaj ponownie: ");
    }
}

// Prompts until the user answers t/T (generate values) or n/N (enter values
// manually) and stores the answer in *generate.
// Returns false if stdin ends (or fails) before a valid answer is entered.
static bool readGenerateChoice(bool *generate) {
    char choice;
    printf("Czy wartości mają być wygenerowane automatycznie? (t/n): ");
    for (;;) {
        if (scanf(" %c", &choice) != 1) {
            return false;
        }
        if (choice == 't' || choice == 'T') {
            *generate = true;
            return true;
        }
        if (choice == 'n' || choice == 'N') {
            *generate = false;
            return true;
        }
        printf("Wprowadź 't' dla tak lub 'n' dla nie: ");
    }
}

// Reads N floats from stdin into values, re-prompting for an element whose
// input could not be parsed.
// Returns false if stdin ends (or fails) before all N values are read.
static bool readVectorValues(float *values, int N) {
    printf("Wprowadź %d wartości wektora:\n", N);
    for (int i = 0; i < N; i++) {
        printf("Wartość %d: ", i + 1);
        int rc;
        while ((rc = scanf("%f", &values[i])) != 1) {
            if (rc == EOF) {
                return false;
            }
            discardInputLine();
            printf("Wartość %d: ", i + 1);
        }
    }
    return true;
}

/*
 * Interactive driver: asks for a vector size and how to fill the vector,
 * runs applyFilter on the GPU and prints the input and the filtered output.
 *
 * Returns 0 on success and 1 when input cannot be read or host memory cannot
 * be allocated. CUDA failures terminate the process inside checkCudaError.
 */
int main() {
    int N;
    bool generate;

    // Get vector size and fill mode from the user; stop cleanly when stdin is
    // exhausted instead of working with uninitialised values.
    if (!readVectorSize(&N) || !readGenerateChoice(&generate)) {
        fprintf(stderr, "Błąd odczytu danych wejściowych!\n");
        return 1;
    }

    size_t size = (size_t)N * sizeof(float);
    float *h_input = (float *)malloc(size);
    float *h_output = (float *)malloc(size);

    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "Błąd alokacji pamięci na CPU!\n");
        // free(NULL) is a no-op, so this releases whichever buffer succeeded.
        free(h_input);
        free(h_output);
        return 1;
    }

    // Initialize input vector based on user choice
    if (generate) {
        srand((unsigned int)time(NULL));
        for (int i = 0; i < N; i++) {
            h_input[i] = (float)(rand() % 100) / 2.0f; // Random values between 0 and 49.5
        }
    } else if (!readVectorValues(h_input, N)) {
        fprintf(stderr, "Błąd odczytu danych wejściowych!\n");
        free(h_input);
        free(h_output);
        return 1;
    }

    printf("Wektor wejściowy:\n");
    printVector(h_input, N);

    float *d_input, *d_output;
    checkCudaError(cudaMalloc(&d_input, size), "cudaMalloc d_input");
    checkCudaError(cudaMalloc(&d_output, size), "cudaMalloc d_output");

    checkCudaError(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice), "cudaMemcpy d_input");

    int threadsPerBlock = BLOCK_SIZE;
    // Ceil-division written so that it cannot overflow int for large N
    // (N >= 1 is guaranteed by readVectorSize).
    int blocksPerGrid = (N - 1) / threadsPerBlock + 1;
    printf("\nBlocks per grid: %d\n", blocksPerGrid);
    printf("Threads per block: %d\n", threadsPerBlock);

    applyFilter<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, N);
    // Launch-configuration errors are reported by cudaGetLastError; faults
    // during execution surface at the synchronisation that follows.
    checkCudaError(cudaGetLastError(), "kernel launch");
    checkCudaError(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    checkCudaError(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost), "cudaMemcpy d_output");

    printf("Wektor wyjściowy (po filtrze):\n");
    printVector(h_output, N);

    checkCudaError(cudaFree(d_input), "cudaFree d_input");
    checkCudaError(cudaFree(d_output), "cudaFree d_output");
    free(h_input);
    free(h_output);

    return 0;
}


Overwriting cuda_filter.cu


In [12]:
!nvcc -arch=sm_75 cuda_filter.cu -o cuda_filter
!./cuda_filter

Podaj rozmiar wektora: 4
Czy wartości mają być wygenerowane automatycznie? (t/n): t
Wektor wejściowy:
 41.00   8.00   7.00  47.00 
CUDA Error at cudaMalloc d_input: CUDA driver version is insufficient for CUDA runtime version
