Q1: Identify the `!` (shell command), `%` (line magic), and `%%` (cell magic) prefixes used in Google Colab

In [18]:
# `%time` is a line magic: it times only the single statement that follows it on this line.
%time x=sum(range(1000000))

CPU times: user 26.1 ms, sys: 28 µs, total: 26.1 ms
Wall time: 26.8 ms


In [19]:
%%writefile hello.cu
// Host-only program: no kernel is launched, so this just confirms that
// nvcc can build and run an ordinary C/C++ translation unit.
#include <stdio.h>

int main(void)
{
    printf("Hello World\n");
    return 0;
}

Writing hello.cu


In [20]:
# `!` hands the line to the shell: build hello.cu for the sm_75 target, then run the binary.
!nvcc -arch=sm_75 hello.cu -o hello
!./hello

Hello World


Q2: Key nvidia-smi commands with multiple options

In [21]:
# No options: print the summary table (driver/CUDA versions, per-GPU temperature, power, memory, utilization).
!nvidia-smi

Thu Feb 19 23:44:23 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [22]:
# -q: print the detailed attribute report for every attached GPU.
!nvidia-smi -q



Timestamp                                 : Thu Feb 19 23:44:24 2026
Driver Version                            : 580.82.07
CUDA Version                              : 13.0

Attached GPUs                             : 1
GPU 00000000:00:04.0
    Product Name                          : Tesla T4
    Product Brand                         : NVIDIA
    Product Architecture                  : Turing
    Display Mode                          : Requested functionality has been deprecated
    Display Attached                      : Yes
    Display Active                        : Disabled
    Persistence Mode                      : Disabled
    Addressing Mode                       : None
    MIG Mode
        Current                           : N/A
        Pending                           : N/A
    Accounting Mode                       : Disabled
    Accounting Mode Buffer Size           : 4000
    Driver Model
        Current                           : N/A
        Pending                     

In [23]:
# --query-gpu selects specific fields; --format=csv emits them as a header row plus one CSV row per GPU.
!nvidia-smi --query-gpu=name,memory.total,memory.used,memory.free --format=csv

name, memory.total [MiB], memory.used [MiB], memory.free [MiB]
Tesla T4, 15360 MiB, 0 MiB, 14913 MiB


In [24]:
# -l 1 re-prints the summary every second and never exits on its own, which
# would block "Run All" until the cell is interrupted. `timeout` stops it
# after a few refreshes so the notebook can continue.
!timeout 3 nvidia-smi -l 1

Thu Feb 19 23:44:24 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [25]:
# pmon: per-process monitoring view; -c 1 takes a single sample and exits.
!nvidia-smi pmon -c 1

# gpu         pid   type     sm    mem    enc    dec    jpg    ofa    command 
# Idx           #    C/G      %      %      %      %      %      %    name 
    0          -     -      -      -      -      -      -      -    -              


In [26]:
# dmon: device monitoring view (power, temperature, utilization, clocks).
# Without -c it samples forever and hangs the cell; -c 4 collects four
# samples and exits, mirroring the bounded `pmon -c 1` call.
!nvidia-smi dmon -c 4

# gpu    pwr  gtemp  mtemp     sm    mem    enc    dec    jpg    ofa   mclk   pclk 
# Idx      W      C      C      %      %      %      %      %      %    MHz    MHz 
    0     27     43      -      0      0      0      0      0      0   5000    780 
    0     14     42      -      0      0      0      0      0      0    405    300 
    0     14     42      -      0      0      0      0      0      0    405    300 
    0     14     42      -      0      0      0      0      0      0    405    300 


Q3: Debug common CUDA errors

A) Zero Output Fix

In [27]:
%%writefile zero_output.cu
// Prints one line from a GPU kernel. Output shows up only if the launch
// succeeds AND the host waits for the kernel, so both are checked here
// instead of failing silently with no output.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Abort with a readable message when a CUDA runtime call reports an error.
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// Kernel: every launched thread prints one greeting.
__global__ void f1(){
    printf("Hello from GPU\n");
}

int main(){
    f1<<<1,1>>>();
    // A kernel launch returns no status itself; launch errors are retrieved
    // with cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());
    // Wait for the kernel to finish so its printf output is emitted before
    // the program exits; also surfaces errors raised during execution.
    CUDA_CHECK(cudaDeviceSynchronize());
    return 0;
}

Writing zero_output.cu


In [28]:
# Build zero_output.cu for the sm_75 target.
!nvcc -arch=sm_75 zero_output.cu -o zero_output

In [29]:
# Run the binary produced by the previous cell.
!./zero_output

Hello from GPU


B) Incorrect Indexing Fix

In [30]:
%%writefile incorrect_index.cu
// Prints a unique global thread ID for each thread of a 2-block x 4-thread
// launch, with launch/synchronize errors reported instead of ignored.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Abort with a readable message when a CUDA runtime call reports an error.
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// Kernel: combine block index, block size and thread index so that IDs do
// not repeat across blocks (threadIdx.x alone restarts at 0 in each block).
__global__ void f2(){
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    printf("Thread ID: %d\n", t);
}

int main(){
    // 2 blocks of 4 threads -> global IDs 0..7.
    f2<<<2,4>>>();
    // Launch errors are retrieved with cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());
    // Wait for completion so device printf output is emitted before exit.
    CUDA_CHECK(cudaDeviceSynchronize());
    return 0;
}

Writing incorrect_index.cu


In [31]:
# Compile the file actually written by the %%writefile cell (incorrect_index.cu).
# The previous command pointed at correct_index.cu, which does not exist.
!nvcc -arch=sm_75 incorrect_index.cu -o incorrect_index

In [32]:
# Run the binary named by -o in the compile step above.
!./incorrect_index


C) PTX Architecture Fix (compile command)

In [33]:
# -arch=sm_75 makes nvcc generate code for the sm_75 target instead of its
# default architecture. The source file is incorrect_index.cu (the only index
# kernel written in this notebook); correct_index.cu does not exist.
!nvcc -arch=sm_75 incorrect_index.cu -o correct_index


Q4: GPU kernel execution and thread indexing

In [34]:
%%writefile thread_index.cu
// Shows the host -> device -> host control flow: the CPU announces the
// launch, 8 GPU threads each print their global ID, then the CPU resumes.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Abort with a readable message when a CUDA runtime call reports an error.
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// Kernel: print this thread's global index.
__global__ void g1(){
    int global_id = blockIdx.x * blockDim.x + threadIdx.x;
    printf("Hello from GPU thread %d\n", global_id);
}

int main(){
    printf("Launching kernel...\n");
    // One block of 8 threads -> global IDs 0..7.
    g1<<<1,8>>>();
    // Launch errors are retrieved with cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());
    // Block the host until the kernel finishes so the device output appears
    // before the "Back to CPU" line.
    CUDA_CHECK(cudaDeviceSynchronize());
    printf("Back to CPU\n");
    return 0;
}

Writing thread_index.cu


In [35]:
# Build thread_index.cu for the sm_75 target.
!nvcc -arch=sm_75 thread_index.cu -o thread_index

In [36]:
# Run the binary produced by the previous cell.
!./thread_index

Launching kernel...
Hello from GPU thread 0
Hello from GPU thread 1
Hello from GPU thread 2
Hello from GPU thread 3
Hello from GPU thread 4
Hello from GPU thread 5
Hello from GPU thread 6
Hello from GPU thread 7
Back to CPU


Q5: Host and Device memory separation

In [37]:
%%writefile memory_demo.cu
// Demonstrates separate host and device memory: a host array is copied to a
// device buffer, read by a kernel, and copied back. Every runtime call is
// checked so a failed allocation or copy is reported instead of ignored.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 5

// Abort with a readable message when a CUDA runtime call reports an error.
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// Kernel: each thread prints the element of `d` at its own threadIdx.x.
// `d` must point to device memory holding at least blockDim.x ints.
__global__ void p1(int *d){
    int t = threadIdx.x;
    printf("GPU Thread %d: Value = %d\n", t, d[t]);
}

int main(){
    int h[N] = {10,20,30,40,50};   // host-side data
    int *g = NULL;                 // device-side buffer
    size_t s = N * sizeof(int);    // byte count; size_t matches the CUDA API

    CUDA_CHECK(cudaMalloc((void**)&g, s));
    CUDA_CHECK(cudaMemcpy(g, h, s, cudaMemcpyHostToDevice));

    // One block with one thread per array element.
    p1<<<1,N>>>(g);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h, g, s, cudaMemcpyDeviceToHost));

    printf("\nBack on CPU:\n");
    for (int i = 0; i < N; i++)
        printf("%d ", h[i]);

    printf("\n");

    CUDA_CHECK(cudaFree(g));
    return 0;
}

Writing memory_demo.cu


In [38]:
# Build memory_demo.cu for the sm_75 target.
!nvcc -arch=sm_75 memory_demo.cu -o memory_demo


In [39]:
# Run the binary produced by the previous cell.
!./memory_demo


GPU Thread 0: Value = 10
GPU Thread 1: Value = 20
GPU Thread 2: Value = 30
GPU Thread 3: Value = 40
GPU Thread 4: Value = 50

Back on CPU:
10 20 30 40 50 


Q6: Compare CPU times of Python lists/tuples with NumPy arrays.

In [40]:
import time
import numpy as np

# Time element-wise addition of two sequences of length n: first with a
# Python list comprehension, then with a single vectorized NumPy expression.
n = 10_000_000
x = list(range(n))
y = list(range(n))

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock, which can be
# adjusted while the measurement is running.
s = time.perf_counter()
z = [x[i] + y[i] for i in range(n)]
e = time.perf_counter()
print("Python List Time:", e - s)

x1 = np.arange(n)
y1 = np.arange(n)

s = time.perf_counter()
z1 = x1 + y1
e = time.perf_counter()
print("NumPy Time:", e - s)

Python List Time: 0.8915081024169922
NumPy Time: 0.030249595642089844
