In [1]:
# Sanity check: print the GPU, driver version and CUDA version visible to this runtime.
!nvidia-smi

Fri Dec 12 18:00:54 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# COE506 GPU Programming Project
## Point-in-Polygon Aggregation using OpenACC, Numba, and CUDA C++

This notebook demonstrates GPU-accelerated point-in-polygon aggregation using multiple GPU programming approaches:
- **Naive CPU Implementation** (Baseline)
- **OpenACC** (Directive-based GPU programming)
- **Numba CUDA** (Python GPU programming)
- **CUDA C++** (Low-level GPU programming)

**Authors:** COE506 Project Team  
**Course:** GPU Programming and Architecture  
**Instructor:** Dr. Ayaz ul Hassan Khan

## Setup: Google Drive Mount and Directory Configuration

Run this cell first to mount Google Drive and set up the project directory structure.

In [None]:
import os
from google.colab import drive

# Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted successfully!\n")

# Base project folder on Google Drive.
# The compile, run and verification cells further down hard-code
# /content/drive/MyDrive/COE506_ProjectCodes/..., so this name must match that
# folder; if you rename it here, update those cells as well.
PROJECT_NAME = "COE506_ProjectCodes"
BASE_PATH = f"/content/drive/MyDrive/{PROJECT_NAME}"

print(f"Project directory: {BASE_PATH}\n")

# Directory layout expected by the rest of the notebook. The folder name
# "preformance_results" is spelled this way on purpose: the run cells write
# their log files to exactly that path.
directories = [
    f"{BASE_PATH}/codes",
    f"{BASE_PATH}/data",
    f"{BASE_PATH}/output_data/naive",
    f"{BASE_PATH}/output_data/openacc",
    f"{BASE_PATH}/output_data/numba",
    f"{BASE_PATH}/output_data/cuda_c",
    f"{BASE_PATH}/preformance_results",
    f"{BASE_PATH}/Profiling/naive",
    f"{BASE_PATH}/Profiling/openacc",
    f"{BASE_PATH}/Profiling/numba",
    f"{BASE_PATH}/Profiling/cuda_c"
]

# exist_ok=True makes this cell safe to re-run.
print("Creating/verifying directory structure...")
for directory in directories:
    os.makedirs(directory, exist_ok=True)
    print(f"  ✓ {directory}")

print("\nDirectory setup complete!")
print("\nIMPORTANT: Make sure your source code files are uploaded to:")
print(f"  - {BASE_PATH}/codes/")
print("\nAnd your data files are uploaded to:")
print(f"  - {BASE_PATH}/data/")

### Clone Project from GitHub (Run Once)

This cell clones the project from GitHub and copies files to your Google Drive. It will skip if files already exist.

In [None]:
import os
import shutil

# Check if codes already exist in Google Drive
codes_check_file = f"{BASE_PATH}/codes/main_cuda.cu"

if os.path.exists(codes_check_file):
    print("✓ Project files already exist in Google Drive!")
    print(f"  Found: {codes_check_file}")
    print("\nSkipping GitHub clone. Ready to use existing files!\n")
else:
    print("Project files not found in Google Drive.")
    print("Cloning from GitHub repository...\n")
    
    # Clone the repository
    REPO_URL = "https://github.com/kamelth/COE506_Project.git"
    CLONE_DIR = "/content/COE506_Project_temp"
    
    # Remove if exists from previous run
    if os.path.exists(CLONE_DIR):
        shutil.rmtree(CLONE_DIR)
    
    # Clone the repo
    !git clone {REPO_URL} {CLONE_DIR}
    
    print("\n" + "="*60)
    print("Copying files to Google Drive...")
    print("="*60 + "\n")
    
    # Copy directories to Google Drive
    dirs_to_copy = ['codes', 'data']
    
    for dir_name in dirs_to_copy:
        src = os.path.join(CLONE_DIR, dir_name)
        dst = os.path.join(BASE_PATH, dir_name)
        
        if os.path.exists(src):
            print(f"Copying {dir_name}/...")
            
            # Copy all files from source to destination
            for item in os.listdir(src):
                s = os.path.join(src, item)
                d = os.path.join(dst, item)
                
                if os.path.isfile(s):
                    shutil.copy2(s, d)
                    print(f"  ✓ {item}")
                elif os.path.isdir(s):
                    if os.path.exists(d):
                        shutil.rmtree(d)
                    shutil.copytree(s, d)
                    print(f"  ✓ {item}/ (directory)")
            
            print(f"  Done with {dir_name}/\n")
        else:
            print(f"  ⚠ {dir_name}/ not found in repository\n")
    
    # Clean up cloned repository
    print("="*60)
    print("Cleaning up...")
    shutil.rmtree(CLONE_DIR)
    print("✓ Removed temporary clone directory")
    print("="*60)
    
    print("\n✅ Project setup complete!")
    print(f"\nAll files are now in: {BASE_PATH}")
    print("\nYou can now run the rest of the notebook!")

### NVIDIA HPC SDK Installation

In [2]:
%%time
# Downloading and installing deb packages. This will take 5 minutes.
! curl https://developer.download.nvidia.com/hpc-sdk/ubuntu/DEB-GPG-KEY-NVIDIA-HPC-SDK | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg
! echo 'deb [signed-by=/usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /' | sudo tee /etc/apt/sources.list.d/nvhpc.list
! sudo apt-get update -y
! sudo apt-get install -y nvhpc-22-11

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1626  100  1626    0     0   2659      0 --:--:-- --:--:-- --:--:--  2656
deb [signed-by=/usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /
Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64  InRelease [2,126 B]
Get:5 https://cli.github.com/packages stable/main amd64 Packages [345 B]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,204 kB]
Get:8 https://r2u.stat.illinois.edu/ubuntu ja

In [3]:
%%bash
# Install Environment Modules, which provides the `module` command used by the
# later cells to load the HPC SDK toolchain.
# apt-get (not apt) is the front end intended for scripts; -y and
# DEBIAN_FRONTEND=noninteractive prevent prompts, since this cell has no stdin.
DEBIAN_FRONTEND=noninteractive apt-get install -y environment-modules

Reading package lists...
Building dependency tree...
Reading state information...
The following NEW packages will be installed:
  environment-modules
0 upgraded, 1 newly installed, 0 to remove and 48 not upgraded.
Need to get 254 kB of archives.
After this operation, 836 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 environment-modules amd64 5.0.1-1 [254 kB]
Fetched 254 kB in 1s (405 kB/s)
Selecting previously unselected package environment-modules.
(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%





In [4]:
%%bash
# Each %%bash cell runs in a fresh shell, so the `module` command has to be
# initialised and the HPC SDK module loaded again in every cell that needs it.
source /usr/share/modules/init/bash
# Make the HPC SDK modulefiles visible, then load the 22.11 toolchain.
module use /opt/nvidia/hpc_sdk/modulefiles
module load nvhpc/22.11
# Print the accelerator properties as seen by the HPC SDK.
nvaccelinfo


CUDA Driver Version:           12040
NVRM version:                  NVIDIA UNIX x86_64 Kernel Module  550.54.15  Tue Mar  5 22:23:56 UTC 2024

Device Number:                 0
Device Name:                   Tesla T4
Device Revision Number:        7.5
Global Memory Size:            15828320256
Number of Multiprocessors:     40
Concurrent Copy and Execution: Yes
Total Constant Memory:         65536
Total Shared Memory per Block: 49152
Registers per Block:           65536
Warp Size:                     32
Maximum Threads per Block:     1024
Maximum Block Dimensions:      1024, 1024, 64
Maximum Grid Dimensions:       2147483647 x 65535 x 65535
Maximum Memory Pitch:          2147483647B
Texture Alignment:             512B
Clock Rate:                    1590 MHz
Execution Timeout:             No
Integrated Device:             No
Can Map Host Memory:           Yes
Compute Mode:                  default
Concurrent Kernels:            Yes
ECC Enabled:                   Yes
Memory Clock Rate:  

### Naive Implementation

In [5]:
%%bash
# Build the baseline program with the HPC SDK C compiler (nvc).
source /usr/share/modules/init/bash
module use /opt/nvidia/hpc_sdk/modulefiles
module load nvhpc/22.11

# Source file and the executable produced from it.
src=/content/drive/MyDrive/COE506_ProjectCodes/codes/naive_PointInPloy.c
exe=/content/drive/MyDrive/COE506_ProjectCodes/codes/naive_PointInPloy

# OpenACC enabled with the multicore (CPU) target, accelerator diagnostics, -fast optimisation.
compile_flags=(-acc -ta=multicore -Minfo=accel -fast)
# NVTX headers and library from the CUDA toolkit.
nvtx_flags=(-I/usr/local/cuda/include/nvtx3 -L/usr/local/cuda/lib64 -lnvToolsExt)

nvc "${compile_flags[@]}" "${nvtx_flags[@]}" "$src" -o "$exe"

In [6]:
%%bash
# Profile the naive baseline on the three datasets with Nsight Systems (nsys)
# and collect everything it prints in one log file.
source /usr/share/modules/init/bash
module use /opt/nvidia/hpc_sdk/modulefiles
module load nvhpc/22.11

# Without pipefail the status of `nsys ... | tee` is always tee's status, so a
# failed run would still be reported as finished.
set -o pipefail

base_dir="/content/drive/MyDrive/COE506_ProjectCodes"

# Output logfile (performance messages)
LOGFILE="$base_dir/preformance_results/naive_nsys_output_log.txt"

# Make sure the log, result and profile folders exist before writing to them.
mkdir -p "$base_dir/preformance_results" "$base_dir/output_data/naive" "$base_dir/Profiling/naive"
: > "$LOGFILE"  # truncate the old log

# Loop over 3 datasets
for i in 1 2 3
do
    echo "=== Running dataset $i ===" | tee -a "$LOGFILE"

    POINTS="$base_dir/data/points${i}.csv"
    POLYGONS="$base_dir/data/polygons${i}.csv"
    OUTPUT="$base_dir/output_data/naive/out_${i}.csv"
    PROFILE="$base_dir/Profiling/naive/profile_run${i}"     # output name for nsys profile

    # Run nsys and capture ALL output messages (stdout and stderr).
    if ! nsys profile --force-overwrite true -o "$PROFILE" \
            "$base_dir/codes/naive_PointInPloy" "$POINTS" "$POLYGONS" "$OUTPUT" \
            2>&1 | tee -a "$LOGFILE"
    then
        # Stop at the first failure so later cells do not read stale results.
        echo "=== FAILED dataset $i ===" | tee -a "$LOGFILE" >&2
        exit 1
    fi

    echo "=== Finished dataset $i ===" | tee -a "$LOGFILE"
done

=== Running dataset 1 ===
CPU Aggregation Time = 473.612 ms
Generating '/tmp/nsys-report-e54b.qdstrm'
Generated:
    /content/drive/MyDrive/COE506_ProjectCodes/Profiling/naive/profile_run1.nsys-rep
=== Finished dataset 1 ===
=== Running dataset 2 ===
CPU Aggregation Time = 11512.814 ms
Generating '/tmp/nsys-report-61b3.qdstrm'
Generated:
    /content/drive/MyDrive/COE506_ProjectCodes/Profiling/naive/profile_run2.nsys-rep
=== Finished dataset 2 ===
=== Running dataset 3 ===
CPU Aggregation Time = 31485.176 ms
Generating '/tmp/nsys-report-6c75.qdstrm'
Generated:
    /content/drive/MyDrive/COE506_ProjectCodes/Profiling/naive/profile_run3.nsys-rep
=== Finished dataset 3 ===


### OpenACC Implementation

In [7]:
%%bash
# Build the OpenACC program with the HPC SDK C compiler (nvc).
source /usr/share/modules/init/bash
module use /opt/nvidia/hpc_sdk/modulefiles
module load nvhpc/22.11

# Source file and the executable produced from it.
src=/content/drive/MyDrive/COE506_ProjectCodes/codes/openacc_code.c
exe=/content/drive/MyDrive/COE506_ProjectCodes/codes/openacc_code

# OpenACC enabled with the tesla (NVIDIA GPU) target, accelerator diagnostics, -fast optimisation.
compile_flags=(-acc -ta=tesla -Minfo=accel -fast)
# NVTX headers and library from the CUDA toolkit.
nvtx_flags=(-I/usr/local/cuda/include/nvtx3 -L/usr/local/cuda/lib64 -lnvToolsExt)

nvc "${compile_flags[@]}" "${nvtx_flags[@]}" "$src" -o "$exe"

point_in_polygon:
    188, Generating acc routine seq
         Generating NVIDIA GPU code
aggregate_gpu:
    255, Generating copy(count_per_region[:num_regions]) [if not already present]
         Generating copyin(points_lat[:num_points],points_lon[:num_points],polygon_sizes[:num_regions],polygon_offsets[:num_regions],points_value[:num_points]) [if not already present]
         Generating copy(sum_per_region[:num_regions]) [if not already present]
         Generating copyin(polygon_vertices[:(regions->sizes->*2)+regions->offsets->]) [if not already present]
         Generating NVIDIA GPU code
        192, #pragma acc loop seq
        260, #pragma acc loop gang, vector(256) /* blockIdx.x threadIdx.x */
    267, Accelerator restriction: induction variable live-out from loop: r
    268, Accelerator restriction: induction variable live-out from loop: r
         192, Loop carried scalar dependence for .inl_inside_57 at line 206
              Scalar last value needed after loop for .inl_insi

In [8]:
%%bash
# Profile the OpenACC build on the three datasets with Nsight Systems (nsys)
# and collect everything it prints in one log file.
source /usr/share/modules/init/bash
module use /opt/nvidia/hpc_sdk/modulefiles
module load nvhpc/22.11

# Without pipefail the status of `nsys ... | tee` is always tee's status, so a
# failed run would still be reported as finished.
set -o pipefail

base_dir="/content/drive/MyDrive/COE506_ProjectCodes"

# Output logfile (performance messages)
LOGFILE="$base_dir/preformance_results/openacc_nsys_output_log.txt"

# Make sure the log, result and profile folders exist before writing to them.
mkdir -p "$base_dir/preformance_results" "$base_dir/output_data/openacc" "$base_dir/Profiling/openacc"
: > "$LOGFILE"  # truncate the old log

# Loop over 3 datasets
for i in 1 2 3
do
    echo "=== Running dataset $i ===" | tee -a "$LOGFILE"

    POINTS="$base_dir/data/points${i}.csv"
    POLYGONS="$base_dir/data/polygons${i}.csv"
    OUTPUT="$base_dir/output_data/openacc/out_${i}.csv"
    PROFILE="$base_dir/Profiling/openacc/profile_run${i}"     # output name for nsys profile

    # Run nsys and capture ALL output messages (stdout and stderr).
    if ! nsys profile --force-overwrite true -o "$PROFILE" \
            "$base_dir/codes/openacc_code" "$POINTS" "$POLYGONS" "$OUTPUT" \
            2>&1 | tee -a "$LOGFILE"
    then
        # Stop at the first failure so later cells do not read stale results.
        echo "=== FAILED dataset $i ===" | tee -a "$LOGFILE" >&2
        exit 1
    fi

    echo "=== Finished dataset $i ===" | tee -a "$LOGFILE"
done

=== Running dataset 1 ===
Loading points...
Loaded 100000 points
Loading regions...
Loaded 1000 regions
Running GPU aggregation...
GPU Aggregation Time = 693.149 ms
Writing results to /content/drive/MyDrive/COE506_ProjectCodes/output_data/openacc/out_1.csv...
Done!
Generating '/tmp/nsys-report-0223.qdstrm'
Generated:
    /content/drive/MyDrive/COE506_ProjectCodes/Profiling/openacc/profile_run1.nsys-rep
=== Finished dataset 1 ===
=== Running dataset 2 ===
Loading points...
Loaded 500000 points
Loading regions...
Loaded 5000 regions
Running GPU aggregation...
GPU Aggregation Time = 635.371 ms
Writing results to /content/drive/MyDrive/COE506_ProjectCodes/output_data/openacc/out_2.csv...
Done!
Generating '/tmp/nsys-report-e29a.qdstrm'
Generated:
    /content/drive/MyDrive/COE506_ProjectCodes/Profiling/openacc/profile_run2.nsys-rep
=== Finished dataset 2 ===
=== Running dataset 3 ===
Loading points...
Loaded 1000000 points
Loading regions...
Loaded 10000 regions
Running GPU aggregation...
G

### Numba Implementation

In [9]:
%%bash
# Profile the Numba script on the three datasets with Nsight Systems (nsys)
# and collect everything it prints in one log file.
source /usr/share/modules/init/bash
module use /opt/nvidia/hpc_sdk/modulefiles
module load nvhpc/22.11

# Without pipefail the status of `nsys ... | tee` is always tee's status, so a
# failed run would still be reported as finished.
set -o pipefail

base_dir="/content/drive/MyDrive/COE506_ProjectCodes"

# Output logfile (performance messages)
LOGFILE="$base_dir/preformance_results/numba_nsys_output_log.txt"

# Make sure the log, result and profile folders exist before writing to them.
mkdir -p "$base_dir/preformance_results" "$base_dir/output_data/numba" "$base_dir/Profiling/numba"
: > "$LOGFILE"  # truncate the old log

# Loop over 3 datasets
for i in 1 2 3
do
    echo "=== Running dataset $i ===" | tee -a "$LOGFILE"

    POINTS="$base_dir/data/points${i}.csv"
    POLYGONS="$base_dir/data/polygons${i}.csv"
    OUTPUT="$base_dir/output_data/numba/out_${i}.csv"
    PROFILE="$base_dir/Profiling/numba/profile_run${i}"     # output name for nsys profile

    # Run nsys and capture ALL output messages (stdout and stderr).
    if ! nsys profile --force-overwrite true -o "$PROFILE" \
            python "$base_dir/codes/numba_impl.py" "$POINTS" "$POLYGONS" "$OUTPUT" \
            2>&1 | tee -a "$LOGFILE"
    then
        # Stop at the first failure so later cells do not read stale results.
        echo "=== FAILED dataset $i ===" | tee -a "$LOGFILE" >&2
        exit 1
    fi

    echo "=== Finished dataset $i ===" | tee -a "$LOGFILE"
done

=== Running dataset 1 ===
Loading points...
Loaded 100000 points
Loading regions...
Loaded 1000 regions
Running GPU aggregation...
GPU Aggregation Time = 2271.901 ms
Writing results to /content/drive/MyDrive/COE506_ProjectCodes/output_data/numba/out_1.csv...
Done!
Generating '/tmp/nsys-report-a4ea.qdstrm'
Generated:
    /content/drive/MyDrive/COE506_ProjectCodes/Profiling/numba/profile_run1.nsys-rep
=== Finished dataset 1 ===
=== Running dataset 2 ===
Loading points...
Loaded 500000 points
Loading regions...
Loaded 5000 regions
Running GPU aggregation...
GPU Aggregation Time = 1319.173 ms
Writing results to /content/drive/MyDrive/COE506_ProjectCodes/output_data/numba/out_2.csv...
Done!
Generating '/tmp/nsys-report-b4a9.qdstrm'
Generated:
    /content/drive/MyDrive/COE506_ProjectCodes/Profiling/numba/profile_run2.nsys-rep
=== Finished dataset 2 ===
=== Running dataset 3 ===
Loading points...
Loaded 1000000 points
Loading regions...
Loaded 10000 regions
Running GPU aggregation...
GPU Agg

### CUDA C/C++ Implementation

In [13]:
# COMPILE THE CODE
# Detect the GPU architecture with nvidia-smi, then build main_cuda.cu with nvcc.
import subprocess
import os

CUDA_SOURCE = '/content/drive/MyDrive/COE506_ProjectCodes/codes/main_cuda.cu'
CUDA_BINARY = '/content/drive/MyDrive/COE506_ProjectCodes/codes/main_cuda'

# Get GPU compute capability (one line per GPU, e.g. "7.5").
result = subprocess.run(['nvidia-smi', '--query-gpu=compute_cap', '--format=csv,noheader'],
                       capture_output=True, text=True)
cap_lines = result.stdout.strip().splitlines()
if result.returncode != 0 or not cap_lines:
    # Without this check an empty answer would produce the invalid flag "-arch=sm_".
    raise RuntimeError(f"Could not query GPU compute capability: {result.stderr.strip()}")

# Use the first GPU; a multi-GPU machine reports several lines.
compute_cap = cap_lines[0].strip().replace('.', '')
arch_flag = f"-arch=sm_{compute_cap}"

print(f"Detected GPU Compute Capability: {compute_cap}")
print(f"Using architecture flag: {arch_flag}")
print("\nCompiling...\n")

# Compile with detected architecture
build = subprocess.run(
    ['nvcc', '-std=c++14', arch_flag, '--extended-lambda', '-O2', CUDA_SOURCE, '-o', CUDA_BINARY],
    capture_output=True, text=True)
print(build.stdout, end='')
print(build.stderr, end='')

# Judge success by nvcc's exit status: a binary left over from an earlier run
# would otherwise make a failed compilation look successful.
if build.returncode == 0 and os.path.exists(CUDA_BINARY):
    print("\n✓ Compilation successful!")
else:
    print("\n✗ Compilation failed!")
    raise RuntimeError(f"nvcc exited with status {build.returncode}")

Detected GPU Compute Capability: 75
Using architecture flag: -arch=sm_75

Compiling...







✓ Compilation successful!


In [14]:
%%bash
# Profile the CUDA C++ build on the three datasets with Nsight Systems (nsys)
# and collect everything it prints in one log file.
source /usr/share/modules/init/bash
module use /opt/nvidia/hpc_sdk/modulefiles
module load nvhpc/22.11

# Without pipefail the status of `nsys ... | tee` is always tee's status, so a
# failed run would still be reported as finished.
set -o pipefail

base_dir="/content/drive/MyDrive/COE506_ProjectCodes"

# Output logfile (performance messages)
LOGFILE="$base_dir/preformance_results/cuda_nsys_output_log.txt"

# Make sure the log, result and profile folders exist before writing to them.
mkdir -p "$base_dir/preformance_results" "$base_dir/output_data/cuda_c" "$base_dir/Profiling/cuda_c"
: > "$LOGFILE"  # truncate the old log

# Loop over 3 datasets
for i in 1 2 3
do
    echo "=== Running dataset $i ===" | tee -a "$LOGFILE"

    POINTS="$base_dir/data/points${i}.csv"
    POLYGONS="$base_dir/data/polygons${i}.csv"
    OUTPUT="$base_dir/output_data/cuda_c/out_${i}.csv"
    PROFILE="$base_dir/Profiling/cuda_c/profile_run${i}"     # output name for nsys profile

    # Run nsys and capture ALL output messages (stdout and stderr).
    if ! nsys profile --force-overwrite true -o "$PROFILE" \
            "$base_dir/codes/main_cuda" "$POINTS" "$POLYGONS" "$OUTPUT" \
            2>&1 | tee -a "$LOGFILE"
    then
        # Stop at the first failure so later cells do not read stale results.
        echo "=== FAILED dataset $i ===" | tee -a "$LOGFILE" >&2
        exit 1
    fi

    echo "=== Finished dataset $i ===" | tee -a "$LOGFILE"
done

=== Running dataset 1 ===
Loaded 100000 points and 1000 regions
Data Loading Time = 105.223 ms
CUDA Aggregation Time = 425.253 ms
Total Time = 530.476 ms
Generating '/tmp/nsys-report-ae43.qdstrm'
Generated:
    /content/drive/MyDrive/COE506_ProjectCodes/Profiling/cuda_c/profile_run1.nsys-rep
=== Finished dataset 1 ===
=== Running dataset 2 ===
Loaded 500000 points and 5000 regions
Data Loading Time = 757.873 ms
CUDA Aggregation Time = 688.147 ms
Total Time = 1446.019 ms
Generating '/tmp/nsys-report-b8a3.qdstrm'
Generated:
    /content/drive/MyDrive/COE506_ProjectCodes/Profiling/cuda_c/profile_run2.nsys-rep
=== Finished dataset 2 ===
=== Running dataset 3 ===
Loaded 1000000 points and 10000 regions
Data Loading Time = 1021.411 ms
CUDA Aggregation Time = 874.016 ms
Total Time = 1895.427 ms
Generating '/tmp/nsys-report-265b.qdstrm'
Generated:
    /content/drive/MyDrive/COE506_ProjectCodes/Profiling/cuda_c/profile_run3.nsys-rep
=== Finished dataset 3 ===


### Verification

In [15]:
import pandas as pd
import os
import numpy as np

# Reference results: every other implementation is compared against these.
baseline = '/content/drive/MyDrive/COE506_ProjectCodes/output_data/naive'

# Output folders of the implementations under test.
implementations = [
    '/content/drive/MyDrive/COE506_ProjectCodes/output_data/openacc',
    '/content/drive/MyDrive/COE506_ProjectCodes/output_data/numba',
    '/content/drive/MyDrive/COE506_ProjectCodes/output_data/cuda_c',
]

# One result file per dataset; the same names are used in every folder.
files = [
    'out_1.csv',
    'out_2.csv',
    'out_3.csv',
]

def compare_csv(baseline_file, test_file):
    """Check that two result CSVs agree on their 'region' and 'count' columns.

    Only those two columns are compared; any other column in the files is
    ignored. Values are compared by value rather than by dtype, so a count
    stored as ``1`` in one file and ``1.0`` in the other is a match.

    Args:
        baseline_file: Path of the reference CSV.
        test_file: Path of the CSV to verify against the reference.

    Returns:
        True when both files have the same number of rows and every
        'region'/'count' cell is equal; False otherwise. On a mismatch the
        differing cells (or the differing row counts) are printed.

    Raises:
        KeyError: If either file lacks a 'region' or 'count' column.
    """
    columns = ['region', 'count']
    df_base = pd.read_csv(baseline_file)[columns]
    df_test = pd.read_csv(test_file)[columns]

    # Frames of different length cannot be compared cell by cell; report the
    # size mismatch instead of letting the element-wise comparison raise.
    if df_base.shape != df_test.shape:
        print(f"Differences found in {test_file}:")
        print(f"Row count differs: baseline={len(df_base)}, test={len(df_test)}")
        return False

    # Compare as Python objects so int/float dtype differences do not matter.
    base_vals = df_base.to_numpy(dtype=object)
    test_vals = df_test.to_numpy(dtype=object)

    # NaN != NaN, so cells missing in both files are explicitly treated as equal.
    both_missing = pd.isna(base_vals) & pd.isna(test_vals)
    mismatch = np.asarray(base_vals != test_vals, dtype=bool) & ~both_missing

    if not mismatch.any():
        return True

    print(f"Differences found in {test_file}:")
    for row, col in zip(*np.where(mismatch)):
        print(f"Row {row}, Column {columns[col]}: baseline={df_base.iat[row, col]}, test={df_test.iat[row, col]}")
    return False

# Main loop: compare every implementation's output files with the baseline.
mismatches = []
for impl in implementations:
    print(f"\nChecking implementation: {impl}")
    for f in files:
        baseline_path = os.path.join(baseline, f)
        test_path = os.path.join(impl, f)
        result = compare_csv(baseline_path, test_path)
        if result:
            print(f"{f} matches baseline.")
        else:
            print(f"{f} does NOT match baseline.")
            mismatches.append(test_path)

# Fail the cell on any mismatch so a full "Run All" cannot pass silently with
# wrong results; all files are still checked and reported first.
assert not mismatches, f"{len(mismatches)} output file(s) differ from the baseline: {mismatches}"



Checking implementation: /content/drive/MyDrive/COE506_ProjectCodes/output_data/openacc
out_1.csv matches baseline.
out_2.csv matches baseline.
out_3.csv matches baseline.

Checking implementation: /content/drive/MyDrive/COE506_ProjectCodes/output_data/numba
out_1.csv matches baseline.
out_2.csv matches baseline.
out_3.csv matches baseline.

Checking implementation: /content/drive/MyDrive/COE506_ProjectCodes/output_data/cuda_c
out_1.csv matches baseline.
out_2.csv matches baseline.
out_3.csv matches baseline.
