In [None]:
# Install the nvcc4jupyter plugin (provides the CUDA cell magics used by the cells below)
!pip install nvcc4jupyter

# Load the extension into the notebook; this must run before any CUDA cell
%load_ext nvcc4jupyter


Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpqcsmzpb7".


In [None]:
%%cuda
#include <stdio.h>

/*
 * Prints one greeting per thread, tagged with the x index of the thread's
 * block and the thread's x index within that block.
 */
__global__ void hello(){
    const unsigned int blockId = blockIdx.x;
    const unsigned int threadId = threadIdx.x;

    printf("Hello from block: %u, thread: %u\n", blockId, threadId);
}

/*
 * Launches hello() on 2 blocks of 2 threads and waits for it to finish.
 *
 * Returns 0 on success. Returns 1 (after printing the CUDA error string to
 * stderr) if the launch is rejected or the kernel fails while executing.
 */
int main(){
    hello<<<2, 2>>>();

    // A kernel launch returns nothing itself; launch-configuration errors are
    // reported through cudaGetLastError().
    cudaError_t status = cudaGetLastError();

    // Waiting for the device also flushes the device-side printf output and
    // surfaces errors raised while the kernel was running.
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

Hello from block: 0, thread: 0
Hello from block: 0, thread: 1
Hello from block: 1, thread: 0
Hello from block: 1, thread: 1



In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

/*
 * Lists every CUDA device visible to the runtime together with its memory
 * clock, memory bus width and the peak memory bandwidth derived from them.
 *
 * Returns 0 on success, 1 if any CUDA runtime query fails (the error string
 * is written to stderr).
 */
int main() {
    int nDevices = 0;
    cudaError_t status = cudaGetDeviceCount(&nDevices);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        int memoryClockKHz = 0;
        int memoryBusWidthBits = 0;

        // The clock and bus width are read through cudaDeviceGetAttribute()
        // rather than through cudaDeviceProp fields, so this does not depend
        // on which fields a given toolkit version keeps in the struct.
        status = cudaGetDeviceProperties(&prop, i);
        if (status == cudaSuccess) {
            status = cudaDeviceGetAttribute(&memoryClockKHz, cudaDevAttrMemoryClockRate, i);
        }
        if (status == cudaSuccess) {
            status = cudaDeviceGetAttribute(&memoryBusWidthBits, cudaDevAttrGlobalMemoryBusWidth, i);
        }
        if (status != cudaSuccess) {
            fprintf(stderr, "Querying device %d failed: %s\n", i, cudaGetErrorString(status));
            return 1;
        }

        printf("Device Number: %d\n", i);
        printf("  Device name: %s\n", prop.name);
        printf("  Memory Clock Rate (KHz): %d\n", memoryClockKHz);
        printf("  Memory Bus Width (bits): %d\n", memoryBusWidthBits);
        // kHz * bytes-per-transfer / 1e6 gives GB/s. The factor 2.0 presumably
        // accounts for double-data-rate memory -- NOTE(review): confirm.
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0 * memoryClockKHz * (memoryBusWidthBits / 8.0) / 1.0e6);
    }
    return 0;
}


Device Number: 0
  Device name: Tesla T4
  Memory Clock Rate (KHz): 5001000
  Memory Bus Width (bits): 256
  Peak Memory Bandwidth (GB/s): 320.064000




In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

/*
 * Matrix addition with one thread per element: A = B + C.
 *
 * A, B and C are addressed as N x N arrays flattened as row * N + col.
 * Expects a 2D launch in which the x dimension covers columns and the y
 * dimension covers rows. Threads that fall outside the N x N range do nothing,
 * so the grid may be larger than the matrix.
 */
__global__ void kernel_1t1e(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        // Computed in size_t, and only for in-range threads, so the flat
        // offset cannot overflow int when N * N exceeds INT_MAX.
        size_t index = (size_t)row * N + col;
        A[index] = B[index] + C[index];
    }
}

/*
 * Fills data[0 .. size-1] with values produced by rand(), scaled into the
 * range 0 to 100. Nothing is written when size is zero or negative.
 */
void randomInit(float* data, int size) {
    int pos = 0;
    while (pos < size) {
        const float unit = rand() / (float)RAND_MAX;  // rand() normalised by RAND_MAX
        data[pos] = unit * 100.0;
        ++pos;
    }
}

/* Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Adds two random N x N matrices on the GPU with kernel_1t1e (one thread per
 * element) and checks the result against a host-side sum.
 *
 * Returns 0 when every step succeeds and the result matches, 1 otherwise.
 * Every allocation is released on all paths.
 */
int main() {
    const int N = 1024;  // example size
    const size_t count = (size_t)N * N;
    const size_t size = count * sizeof(float);
    float *A = NULL, *B = NULL, *C = NULL;
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    bool ok = true;

    // Allocate space for host copies. A is output-only, so only the inputs
    // B and C are filled with random values.
    A = (float *)malloc(size);
    B = (float *)malloc(size);
    C = (float *)malloc(size);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        ok = false;
    }
    if (ok) {
        randomInit(B, N * N);
        randomInit(C, N * N);
    }

    // Allocate space for device copies. The && chain short-circuits, so
    // nothing further is attempted once a step has failed.
    ok = ok && cudaOk(cudaMalloc((void **)&d_A, size), "cudaMalloc(d_A)");
    ok = ok && cudaOk(cudaMalloc((void **)&d_B, size), "cudaMalloc(d_B)");
    ok = ok && cudaOk(cudaMalloc((void **)&d_C, size), "cudaMalloc(d_C)");

    // Copy inputs to device
    ok = ok && cudaOk(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice), "cudaMemcpy(B -> d_B)");
    ok = ok && cudaOk(cudaMemcpy(d_C, C, size, cudaMemcpyHostToDevice), "cudaMemcpy(C -> d_C)");

    // Launch kernel_1t1e() kernel on GPU: 16x16 thread blocks, with the grid
    // rounded up so that it covers all N x N elements.
    if (ok) {
        dim3 threadsPerBlock(16, 16);
        dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
        kernel_1t1e<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
        // Launch-configuration errors are only visible through cudaGetLastError().
        ok = cudaOk(cudaGetLastError(), "kernel_1t1e launch");
    }

    // Copy result back to host. This blocking copy also reports errors raised
    // while the kernel was executing.
    ok = ok && cudaOk(cudaMemcpy(A, d_A, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_A -> A)");

    // Verify against a host reference so that a wrong result is not silent.
    if (ok) {
        size_t mismatches = 0;
        for (size_t i = 0; i < count; i++) {
            float diff = A[i] - (B[i] + C[i]);
            if (diff < 0.0f) {
                diff = -diff;
            }
            // Negated comparison so that a NaN also counts as a mismatch.
            if (!(diff <= 1e-3f)) {
                mismatches++;
            }
        }
        if (mismatches != 0) {
            fprintf(stderr, "kernel_1t1e: %zu of %zu elements are wrong\n", mismatches, count);
            ok = false;
        } else {
            printf("kernel_1t1e: result verified for %d x %d matrix\n", N, N);
        }
    }

    // Cleanup (cudaFree and free both accept NULL)
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(A); free(B); free(C);

    return ok ? 0 : 1;
}


In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

/*
 * Matrix addition with one thread per row: A = B + C.
 *
 * A, B and C are addressed as N x N arrays flattened as row * N + col.
 * Expects a 1D launch; each thread whose global x index is below N walks its
 * whole row. Threads with an index of N or more do nothing.
 *
 * Note: at each loop step, neighbouring threads touch addresses N floats
 * apart (same column, different rows).
 */
__global__ void kernel_1t1r(float *A, float *B, float *C, int N) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N) { // Ensure the row is within the matrix
        // Computed in size_t so the row offset cannot overflow int when
        // N * N exceeds INT_MAX.
        size_t rowStart = (size_t)row * N;
        for (int col = 0; col < N; col++) {
            A[rowStart + col] = B[rowStart + col] + C[rowStart + col];
        }
    }
}

/*
 * Overwrites the first `size` entries of `data` with rand()-derived values
 * scaled into the range 0 to 100. A size of zero or less writes nothing.
 */
void randomInit(float* data, int size) {
    const double scale = 100.0;  // upper end of the generated range

    for (int k = 0; k != size && k < size; ++k) {
        const float fraction = rand() / (float)RAND_MAX;
        data[k] = fraction * scale;
    }
}

/* Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Adds two random N x N matrices on the GPU with kernel_1t1r (one thread per
 * row) and checks the result against a host-side sum.
 *
 * Returns 0 when every step succeeds and the result matches, 1 otherwise.
 * Every allocation is released on all paths.
 */
int main() {
    const int N = 1024;  // example matrix size
    const size_t count = (size_t)N * N;
    const size_t size = count * sizeof(float);
    float *A = NULL, *B = NULL, *C = NULL;
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    bool ok = true;

    // Allocate space for host copies of A, B, C. A is output-only, so only
    // the inputs B and C are filled with random values.
    A = (float *)malloc(size);
    B = (float *)malloc(size);
    C = (float *)malloc(size);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        ok = false;
    }
    if (ok) {
        randomInit(B, N * N);
        randomInit(C, N * N);
    }

    // Allocate space for device copies of A, B, C. The && chain
    // short-circuits, so nothing further is attempted after a failure.
    ok = ok && cudaOk(cudaMalloc((void **)&d_A, size), "cudaMalloc(d_A)");
    ok = ok && cudaOk(cudaMalloc((void **)&d_B, size), "cudaMalloc(d_B)");
    ok = ok && cudaOk(cudaMalloc((void **)&d_C, size), "cudaMalloc(d_C)");

    // Copy inputs to device
    ok = ok && cudaOk(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice), "cudaMemcpy(B -> d_B)");
    ok = ok && cudaOk(cudaMemcpy(d_C, C, size, cudaMemcpyHostToDevice), "cudaMemcpy(C -> d_C)");

    // Launch kernel_1t1r() kernel on GPU. Blocks hold a multiple of 32
    // threads instead of a single thread, so warps are fully populated; the
    // grid is rounded up and the kernel's own row < N guard handles the tail.
    if (ok) {
        dim3 threadsPerBlock(128);
        dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x);
        kernel_1t1r<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
        // Launch-configuration errors are only visible through cudaGetLastError().
        ok = cudaOk(cudaGetLastError(), "kernel_1t1r launch");
    }

    // Copy result back to host. This blocking copy also reports errors raised
    // while the kernel was executing.
    ok = ok && cudaOk(cudaMemcpy(A, d_A, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_A -> A)");

    // Verify against a host reference so that a wrong result is not silent.
    if (ok) {
        size_t mismatches = 0;
        for (size_t i = 0; i < count; i++) {
            float diff = A[i] - (B[i] + C[i]);
            if (diff < 0.0f) {
                diff = -diff;
            }
            // Negated comparison so that a NaN also counts as a mismatch.
            if (!(diff <= 1e-3f)) {
                mismatches++;
            }
        }
        if (mismatches != 0) {
            fprintf(stderr, "kernel_1t1r: %zu of %zu elements are wrong\n", mismatches, count);
            ok = false;
        } else {
            printf("kernel_1t1r: result verified for %d x %d matrix\n", N, N);
        }
    }

    // Cleanup (cudaFree and free both accept NULL)
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(A); free(B); free(C);

    return ok ? 0 : 1;
}


In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

/*
 * Matrix addition with one thread per column: A = B + C.
 *
 * A, B and C are addressed as N x N arrays flattened as row * N + col.
 * Expects a 1D launch; each thread whose global x index is below N walks its
 * whole column from row 0 to row N-1. Threads with an index of N or more do
 * nothing.
 *
 * Note: at each loop step, neighbouring threads touch neighbouring addresses
 * (same row, adjacent columns).
 */
__global__ void kernel_1t1c(float *A, float *B, float *C, int N) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (col < N) { // Ensure the column is within the matrix
        for (int row = 0; row < N; row++) {
            // Computed in size_t so the flat offset cannot overflow int when
            // N * N exceeds INT_MAX.
            size_t index = (size_t)row * N + col;
            A[index] = B[index] + C[index];
        }
    }
}

/*
 * Populates data[0 .. size-1] with rand() output normalised by RAND_MAX and
 * multiplied by 100. Leaves the buffer untouched when size is not positive.
 */
void randomInit(float* data, int size) {
    if (size <= 0) {
        return;
    }

    int remaining = size;
    float *slot = data;
    do {
        *slot = rand() / (float)RAND_MAX * 100.0;
        ++slot;
        --remaining;
    } while (remaining > 0);
}

/* Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Adds two random N x N matrices on the GPU with kernel_1t1c (one thread per
 * column) and checks the result against a host-side sum.
 *
 * Returns 0 when every step succeeds and the result matches, 1 otherwise.
 * Every allocation is released on all paths.
 */
int main() {
    const int N = 1024;  // example matrix size
    const size_t count = (size_t)N * N;
    const size_t size = count * sizeof(float);
    float *A = NULL, *B = NULL, *C = NULL;
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    bool ok = true;

    // Allocate space for host copies of A, B, C. A is output-only, so only
    // the inputs B and C are filled with random values.
    A = (float *)malloc(size);
    B = (float *)malloc(size);
    C = (float *)malloc(size);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        ok = false;
    }
    if (ok) {
        randomInit(B, N * N);
        randomInit(C, N * N);
    }

    // Allocate space for device copies of A, B, C. The && chain
    // short-circuits, so nothing further is attempted after a failure.
    ok = ok && cudaOk(cudaMalloc((void **)&d_A, size), "cudaMalloc(d_A)");
    ok = ok && cudaOk(cudaMalloc((void **)&d_B, size), "cudaMalloc(d_B)");
    ok = ok && cudaOk(cudaMalloc((void **)&d_C, size), "cudaMalloc(d_C)");

    // Copy inputs to device
    ok = ok && cudaOk(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice), "cudaMemcpy(B -> d_B)");
    ok = ok && cudaOk(cudaMemcpy(d_C, C, size, cudaMemcpyHostToDevice), "cudaMemcpy(C -> d_C)");

    // Launch kernel_1t1c() kernel on GPU. Blocks hold a multiple of 32
    // threads instead of a single thread, so the threads of a warp handle
    // adjacent columns; the grid is rounded up and the kernel's own col < N
    // guard handles the tail.
    if (ok) {
        dim3 threadsPerBlock(128);
        dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x);
        kernel_1t1c<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
        // Launch-configuration errors are only visible through cudaGetLastError().
        ok = cudaOk(cudaGetLastError(), "kernel_1t1c launch");
    }

    // Copy result back to host. This blocking copy also reports errors raised
    // while the kernel was executing.
    ok = ok && cudaOk(cudaMemcpy(A, d_A, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_A -> A)");

    // Verify against a host reference so that a wrong result is not silent.
    if (ok) {
        size_t mismatches = 0;
        for (size_t i = 0; i < count; i++) {
            float diff = A[i] - (B[i] + C[i]);
            if (diff < 0.0f) {
                diff = -diff;
            }
            // Negated comparison so that a NaN also counts as a mismatch.
            if (!(diff <= 1e-3f)) {
                mismatches++;
            }
        }
        if (mismatches != 0) {
            fprintf(stderr, "kernel_1t1c: %zu of %zu elements are wrong\n", mismatches, count);
            ok = false;
        } else {
            printf("kernel_1t1c: result verified for %d x %d matrix\n", N, N);
        }
    }

    // Cleanup (cudaFree and free both accept NULL)
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(A); free(B); free(C);

    return ok ? 0 : 1;
}
