In [None]:
# Install CUDA C++ plugin for Colab:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

In [None]:
# Detect the selected GPU and its NVIDIA architecture:
import subprocess

# getstatusoutput also returns the exit code, so every nvidia-smi failure
# (binary missing, driver not loaded, unsupported query field) is detected,
# not only the shell's "not found" message.
status, gpu_info = subprocess.getstatusoutput("nvidia-smi --query-gpu=name,compute_cap --format=csv,noheader,nounits")
gpu_rows = [row for row in gpu_info.splitlines() if row.strip()]
if status != 0 or not gpu_rows or ',' not in gpu_rows[0]:
    raise RuntimeError("Error: No GPU found. Please select a GPU runtime environment.")

# nvidia-smi prints one CSV row per GPU; use the first one. Splitting from the
# right keeps the name intact even if it were to contain a comma.
gpu_name, compute_cap = map(str.strip, gpu_rows[0].rsplit(',', 1))
# "7.5" -> "sm_75": the value passed to nvcc's --gpu-architecture in the CUDA cells.
gpu_arch = f"sm_{compute_cap.replace('.', '')}"

print(f"{'GPU Name':<15}: {gpu_name}")
print(f"{'Architecture':<15}: {gpu_arch}")

In [None]:
%%cuda -c "--gpu-architecture $gpu_arch"
#include <stdio.h>

// Kernel: every thread prints its block index, its index inside the block,
// and its flattened global index. Expects a 1D grid of 1D blocks (only the
// .x components are used). Takes no arguments and uses no shared memory.
__global__ void hello_kernel() {
    const int block = blockIdx.x;
    const int local = threadIdx.x;
    // Flattened index: position of this thread across the whole 1D grid.
    const int global = local + block * blockDim.x;

    printf("Hello from block %d, thread %d (global thread %d)\n", block, local, global);
}
// Launches hello_kernel on 2 blocks of 4 threads and waits for its output.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    const int numBlocks = 2;
    const int threadsPerBlock = 4;

    hello_kernel<<<numBlocks, threadsPerBlock>>>();

    // A kernel launch returns nothing: launch errors (bad configuration, no
    // kernel image for this architecture) are only visible via cudaGetLastError.
    cudaError_t status = cudaGetLastError();
    // Synchronizing flushes the device-side printf buffer and reports any
    // error raised while the kernel was running.
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}

In [None]:
%%cuda -c "--gpu-architecture $gpu_arch"
#include <stdio.h>
#include <cuda.h>

// Kernel: each block sums its slice of the vector into one partial result.
//
// Launch layout : 1D grid of 1D blocks; thread t of block b reads
//                 data[b * blockDim.x + t] (0 is used when that index >= n).
// Shared memory : blockDim.x * sizeof(int) bytes of dynamic shared memory
//                 (third launch argument).
// Output        : partial[blockIdx.x] receives the block's sum, so partial
//                 must hold at least gridDim.x ints.
// Any block size is supported, not only powers of two.
__global__ void sum_kernel(int *data, int *partial, int n) {
    extern __shared__ int cache[];  // one slot per thread of the block

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // Each thread loads its element; threads past the end contribute 0.
    cache[cacheIndex] = (tid < n) ? data[tid] : 0;
    __syncthreads();

    // Tree reduction in shared memory. `active` is the number of slots that
    // still hold live values. The upper floor(active/2) slots are folded onto
    // the lower ones; with an odd count the middle slot is simply carried
    // into the next round, so no value is dropped for non-power-of-two
    // block sizes.
    for (int active = blockDim.x; active > 1;) {
        int half = (active + 1) / 2;  // live slots after this round
        if (cacheIndex < active / 2) {
            cache[cacheIndex] += cache[cacheIndex + half];
        }
        // Outside the branch: every thread of the block must reach the barrier.
        __syncthreads();
        active = half;
    }

    // The first thread of each block stores the block's partial sum.
    if (cacheIndex == 0) {
        partial[blockIdx.x] = cache[0];
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool checkCuda(cudaError_t status, const char *what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Sums a vector of N ones on the GPU: sum_kernel produces one partial sum per
// block and the host adds the partials. Returns 0 on success, 1 if any CUDA
// call failed (in which case no total is printed).
int main() {
    const int N = 1 << 16;  // vector length (65536)
    const int THREADS = 256;
    const int BLOCKS = (N + THREADS - 1) / THREADS;  // ceil(N / THREADS)

    int *h_data = new int[N];
    for (int i = 0; i < N; i++) h_data[i] = 1;  // fill with ones
    int *h_partial = new int[BLOCKS];

    // Start as null so the cleanup below is valid on every path
    // (cudaFree on a null pointer is a no-op).
    int *d_data = nullptr;
    int *d_partial = nullptr;

    bool ok = checkCuda(cudaMalloc(&d_data, N * sizeof(int)), "cudaMalloc(d_data)")
           && checkCuda(cudaMalloc(&d_partial, BLOCKS * sizeof(int)), "cudaMalloc(d_partial)")
           && checkCuda(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice),
                        "cudaMemcpy(host -> d_data)");

    if (ok) {
        // Run the kernel: one int of dynamic shared memory per thread.
        sum_kernel<<<BLOCKS, THREADS, THREADS * sizeof(int)>>>(d_data, d_partial, N);

        // Launch errors surface via cudaGetLastError, execution errors at the sync.
        ok = checkCuda(cudaGetLastError(), "sum_kernel launch")
          && checkCuda(cudaDeviceSynchronize(), "sum_kernel execution")
          // Bring the per-block partial sums back to the host.
          && checkCuda(cudaMemcpy(h_partial, d_partial, BLOCKS * sizeof(int), cudaMemcpyDeviceToHost),
                       "cudaMemcpy(d_partial -> host)");
    }

    if (ok) {
        // Accumulate the partial sums on the CPU.
        long long total = 0;
        for (int i = 0; i < BLOCKS; i++) total += h_partial[i];

        printf("Suma total = %lld\n", total);
    }

    cudaFree(d_data);
    cudaFree(d_partial);
    delete[] h_data;
    delete[] h_partial;

    return ok ? 0 : 1;
}


In [None]:
%%cuda -c "--gpu-architecture $gpu_arch"
#include <stdio.h>
#include <cuda.h>

// Kernel: each thread computes one element of the result vector,
// res[col] = sum over row of vec[row] * mat[row][col].
//
// Launch layout : 1D grid of 1D blocks with at least M threads in total;
//                 threads whose column index is >= M do nothing.
// Arguments     : vec has N elements, mat is N x M stored row-major,
//                 res has M elements. No shared memory is used.
__global__ void vectorMatrixMul(const float *vec, const float *mat, float *res, int N, int M) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // column index
    if (col < M) {
        float sum = 0.0f;
        for (int row = 0; row < N; row++) {
            // Offset computed in size_t: row * M overflows int once the
            // matrix holds more than 2^31 elements.
            sum += vec[row] * mat[(size_t)row * M + col];  // row-major matrix
        }
        res[col] = sum;
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool checkCuda(cudaError_t status, const char *what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Multiplies a length-N vector by an N x M matrix on the GPU and prints the
// M results. Returns 0 on success, 1 if any CUDA call failed (in which case
// nothing is printed to stdout).
int main() {
    const int N = 4;  // matrix rows (vector length)
    const int M = 5;  // matrix columns
    float h_vec[N] = {1, 2, 3, 4};  // vector
    float h_mat[N * M];             // N x M matrix, row-major
    float h_res[M];                 // result

    // Fill the matrix with simple values: mat[i][j] = (i + 1) * (j + 1).
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            h_mat[i * M + j] = (i + 1) * (j + 1);
        }
    }

    // Start as null so the cleanup below is valid on every path
    // (cudaFree on a null pointer is a no-op).
    float *d_vec = nullptr;
    float *d_mat = nullptr;
    float *d_res = nullptr;

    bool ok = checkCuda(cudaMalloc(&d_vec, N * sizeof(float)), "cudaMalloc(d_vec)")
           && checkCuda(cudaMalloc(&d_mat, N * M * sizeof(float)), "cudaMalloc(d_mat)")
           && checkCuda(cudaMalloc(&d_res, M * sizeof(float)), "cudaMalloc(d_res)")
           && checkCuda(cudaMemcpy(d_vec, h_vec, N * sizeof(float), cudaMemcpyHostToDevice),
                        "cudaMemcpy(host -> d_vec)")
           && checkCuda(cudaMemcpy(d_mat, h_mat, N * M * sizeof(float), cudaMemcpyHostToDevice),
                        "cudaMemcpy(host -> d_mat)");

    if (ok) {
        // Block/thread configuration: enough blocks to cover all M columns.
        int threadsPerBlock = 256;
        int blocks = (M + threadsPerBlock - 1) / threadsPerBlock;

        // Launch the kernel.
        vectorMatrixMul<<<blocks, threadsPerBlock>>>(d_vec, d_mat, d_res, N, M);

        // Launch errors surface via cudaGetLastError, execution errors at the sync.
        ok = checkCuda(cudaGetLastError(), "vectorMatrixMul launch")
          && checkCuda(cudaDeviceSynchronize(), "vectorMatrixMul execution")
          // Copy the result back to the host.
          && checkCuda(cudaMemcpy(h_res, d_res, M * sizeof(float), cudaMemcpyDeviceToHost),
                       "cudaMemcpy(d_res -> host)");
    }

    if (ok) {
        // Show the result; skipped on failure because h_res would be uninitialized.
        printf("Resultado vector x matriz:\n");
        for (int j = 0; j < M; j++) {
            printf("%.1f ", h_res[j]);
        }
        printf("\n");
    }

    cudaFree(d_vec);
    cudaFree(d_mat);
    cudaFree(d_res);

    return ok ? 0 : 1;
}
