In [1]:
!nvidia-smi

Sun Nov 17 06:49:24 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P0             29W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [8]:
%%writefile twomatrixaddition.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 16  // Size of the square matrices (N x N)

// Element-wise sum of two square int matrices held as flat row-major arrays:
// result[r * size + c] = A[r * size + c] + B[r * size + c].
// Launched on a 2D grid where x selects the column and y selects the row;
// `size` is the number of rows (and columns). Each thread writes at most one
// element, and threads that fall outside the size x size region do nothing.
__global__ void matrixAdd(int* A, int* B, int* result, int size) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid may be larger than the matrix; surplus threads bail out early.
    if (row >= size || col >= size) {
        return;
    }

    const int offset = row * size + col;  // flat row-major position
    result[offset] = A[offset] + B[offset];
}

// Writes a size x size int matrix (flat, row-major) to stdout: every element
// is followed by a single space, every row ends with a newline, and one extra
// blank line is emitted after the last row.
void printMatrix(int* matrix, int size) {
    const int* cell = matrix;  // walks the flat array in storage order

    for (int r = 0; r < size; r++) {
        for (int c = 0; c < size; c++, cell++) {
            printf("%d ", *cell);
        }
        printf("\n");
    }
    printf("\n");
}

// Evaluates a CUDA runtime call and, on any status other than cudaSuccess,
// prints the source location and the runtime's error string to stderr, then
// terminates the process with a failure code. Process teardown reclaims any
// host and device allocations that are still live at that point.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(status_));                 \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Fills four N x N host matrices (A, B, C, D), computes E = A + B and
// F = C + D on the GPU with matrixAdd, prints E and F, and releases all host
// and device memory. Returns 0 on success; terminates with EXIT_FAILURE when
// a host allocation or any CUDA runtime call fails.
int main() {
    const int size = N * N;                           // total elements per matrix
    const size_t bytes = (size_t)size * sizeof(int);  // size_t: matches the allocator APIs

    // Allocate host memory
    int* h_A = (int*)malloc(bytes);
    int* h_B = (int*)malloc(bytes);
    int* h_C = (int*)malloc(bytes);
    int* h_D = (int*)malloc(bytes);
    int* h_E = (int*)malloc(bytes);
    int* h_F = (int*)malloc(bytes);
    if (!h_A || !h_B || !h_C || !h_D || !h_E || !h_F) {
        fprintf(stderr, "Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all six pointers is safe.
        free(h_A);
        free(h_B);
        free(h_C);
        free(h_D);
        free(h_E);
        free(h_F);
        return EXIT_FAILURE;
    }

    // Initialize the four input matrices with distinct multiples of the index
    for (int i = 0; i < size; ++i) {
        h_A[i] = i;
        h_B[i] = i * 2;
        h_C[i] = i * 3;
        h_D[i] = i * 4;
    }

    // Allocate device memory
    int *d_A = NULL, *d_B = NULL, *d_C = NULL;
    int *d_D = NULL, *d_E = NULL, *d_F = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_A, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_B, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_C, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_D, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_E, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_F, bytes));

    // Copy host inputs to device
    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_C, h_C, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_D, h_D, bytes, cudaMemcpyHostToDevice));

    // 16x16 threads per block; ceil-divide so the grid covers all N rows/columns
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Perform both matrix additions on the GPU. A launch returns no status,
    // so configuration errors are picked up with cudaGetLastError().
    matrixAdd<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_E, N);
    CUDA_CHECK(cudaGetLastError());
    matrixAdd<<<numBlocks, threadsPerBlock>>>(d_C, d_D, d_F, N);
    CUDA_CHECK(cudaGetLastError());

    // Wait for the kernels so execution faults are reported here rather than
    // being attributed to the copies below.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy results back to host
    CUDA_CHECK(cudaMemcpy(h_E, d_E, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_F, d_F, bytes, cudaMemcpyDeviceToHost));

    // Print results
    printf("Matrix E (A + B):\n");
    printMatrix(h_E, N);
    printf("Matrix F (C + D):\n");
    printMatrix(h_F, N);

    // Free device memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaFree(d_D));
    CUDA_CHECK(cudaFree(d_E));
    CUDA_CHECK(cudaFree(d_F));

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_D);
    free(h_E);
    free(h_F);

    return 0;
}


Writing twomatrixaddition.cu


In [12]:
! nvcc twomatrixaddition.cu -o twomatrixaddition

In [13]:
!./twomatrixaddition

Matrix E (A + B):
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 
48 51 54 57 60 63 66 69 72 75 78 81 84 87 90 93 
96 99 102 105 108 111 114 117 120 123 126 129 132 135 138 141 
144 147 150 153 156 159 162 165 168 171 174 177 180 183 186 189 
192 195 198 201 204 207 210 213 216 219 222 225 228 231 234 237 
240 243 246 249 252 255 258 261 264 267 270 273 276 279 282 285 
288 291 294 297 300 303 306 309 312 315 318 321 324 327 330 333 
336 339 342 345 348 351 354 357 360 363 366 369 372 375 378 381 
384 387 390 393 396 399 402 405 408 411 414 417 420 423 426 429 
432 435 438 441 444 447 450 453 456 459 462 465 468 471 474 477 
480 483 486 489 492 495 498 501 504 507 510 513 516 519 522 525 
528 531 534 537 540 543 546 549 552 555 558 561 564 567 570 573 
576 579 582 585 588 591 594 597 600 603 606 609 612 615 618 621 
624 627 630 633 636 639 642 645 648 651 654 657 660 663 666 669 
672 675 678 681 684 687 690 693 696 699 702 705 708 711 714 717 
720 723 726 729 732 735 738 741 744 747 750 7

In [14]:
%%writefile matrixaddition.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 16  // Size of the square matrix (N x N)

// Adds two width x width int matrices element by element: C = A + B.
// All three matrices are flat arrays indexed as row * width + col. Expects a
// 2D launch (x -> column, y -> row); each thread produces at most one output
// element and threads positioned outside the matrix write nothing.
__global__ void matrixAdd(int* A, int* B, int* C, int width) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;  // column handled by this thread
    const int y = blockIdx.y * blockDim.y + threadIdx.y;  // row handled by this thread

    const bool insideMatrix = (y < width) && (x < width);
    if (!insideMatrix) {
        return;
    }

    const int flat = y * width + x;
    const int sum = A[flat] + B[flat];
    C[flat] = sum;
}

// Prints a square int matrix stored as a flat row-major array. Each value is
// printed with a trailing space, each row is terminated by a newline, and a
// blank line follows the whole matrix.
void printMatrix(int* matrix, int size) {
    int row = 0;
    while (row < size) {
        const int* rowStart = matrix + row * size;  // first element of this row
        int col = 0;
        while (col < size) {
            printf("%d ", rowStart[col]);
            ++col;
        }
        printf("\n");
        ++row;
    }
    printf("\n");
}

// Evaluates a CUDA runtime call and, on any status other than cudaSuccess,
// prints the source location and the runtime's error string to stderr, then
// terminates the process with a failure code. Process teardown reclaims any
// host and device allocations that are still live at that point.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(status_));                 \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Fills two N x N host matrices (A ascending, B descending), computes
// C = A + B on the GPU with matrixAdd, prints A, B and C, and releases all
// host and device memory. Returns 0 on success; terminates with EXIT_FAILURE
// when a host allocation or any CUDA runtime call fails.
int main() {
    const int size = N * N;                           // total elements per matrix
    const size_t bytes = (size_t)size * sizeof(int);  // size_t: matches the allocator APIs

    // Allocate host memory
    int* h_A = (int*)malloc(bytes);
    int* h_B = (int*)malloc(bytes);
    int* h_C = (int*)malloc(bytes);
    if (!h_A || !h_B || !h_C) {
        fprintf(stderr, "Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all three pointers is safe.
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize matrices A and B
    for (int i = 0; i < size; ++i) {
        h_A[i] = i;
        h_B[i] = size - i;
    }

    // Allocate device memory
    int *d_A = NULL, *d_B = NULL, *d_C = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_A, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_B, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_C, bytes));

    // Copy data from host to device
    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));

    // 16x16 threads per block; ceil-divide so the grid covers all N rows/columns
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Launch the kernel. A launch returns no status, so configuration errors
    // are picked up with cudaGetLastError().
    matrixAdd<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());

    // Wait for the kernel so execution faults are reported here rather than
    // being attributed to the copy below.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy result from device to host
    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));

    // Print the inputs and the result
    printf("Matrix A:\n");
    printMatrix(h_A, N);

    printf("Matrix B:\n");
    printMatrix(h_B, N);

    printf("Matrix C (A + B):\n");
    printMatrix(h_C, N);

    // Free device memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}


Overwriting matrixaddition.cu


In [15]:
! nvcc matrixaddition.cu -o matrixaddition

In [16]:
!./matrixaddition

Matrix A:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 

Matrix B:
256 255 254 253 252 251 250 249 248 247 246 245 2