In [2]:
import os

# Add the directory containing the CUDA executables (nvcc, nvprof, nsys, ncu) to the PATH.
CUDA_BIN_DIR = "/usr/local/cuda/bin"
current_path = os.environ.get("PATH", "")

# Append only when the directory is missing, so re-running this cell
# does not keep stacking duplicate entries onto PATH.
if CUDA_BIN_DIR not in current_path.split(os.pathsep):
    os.environ["PATH"] = (
        current_path + os.pathsep + CUDA_BIN_DIR if current_path else CUDA_BIN_DIR
    )

# Check that the directory is now part of the PATH
print(os.environ["PATH"])

/opt/tljh/user/bin:/bin:/usr/bin:/usr/local/cuda/bin


In [3]:
%%bash
# Print the version of each CUDA tool this notebook relies on:
# the compiler driver (nvcc) and the three profilers (nvprof, nsys, ncu).
nvcc --version
nvprof --version
nsys --version
ncu --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Apr__9_19:24:57_PDT_2025
Cuda compilation tools, release 12.9, V12.9.41
Build cuda_12.9.r12.9/compiler.35813241_0
nvprof: NVIDIA (R) Cuda command line profiler
Copyright (c) 2012 - 2025 NVIDIA Corporation
Release version 12.9.19 (21)
NVIDIA Nsight Systems version 2025.1.3.140-251335620677v0
NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2025 NVIDIA Corporation
Version 2025.2.0.0 (build 35613519) (public-release)


In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Nov 21 14:22:15 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.51.03              Driver Version: 575.51.03      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-PCIE-32GB           Off |   00000000:00:10.0 Off |                    0 |
| N/A   27C    P0             22W /  250W |       0MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
%%writefile origin.cu

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>
#include <random>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <cassert>
#include <unordered_set>
#include <limits>
#include <cfloat>

// ---------------- Config ----------------
static const int DIM          = 64;      // Length of every embedding vector
static const float ALPHA      = 0.7f;    // Weight applied to the position vector in cardToVec
static const int SEED         = 2025;    // Seed for the base vectors; main() uses SEED + 7 for the data RNG
static const int N            = 1 << 24; // 2^24 = 16,777,216 vectors
static const int K            = 1024;    // Clusters
static const int NPROBE       = 32;       // Search depth: number of nearest clusters scanned per query
static const int TOPK         = 5;       // Number of results printed
static const int KMEANS_ITERS = 15;      // K-Means iterations

using Vec = std::vector<float>;

// ---------------- Embedding Generator ----------------

static Vec numberBase[76]; // One base vector per card number; initBases fills slots 1..75, slot 0 stays empty
static Vec posBase[25];    // One base vector per card position 0..24, filled by initBases

// Rescales `v` in place to unit Euclidean length.
// The squared norm is accumulated in double precision; 1e-12 is added to the
// norm before dividing so the divisor is never exactly zero.
static void normInPlace(Vec &v) {
    double sumSquares = 0;
    for (size_t i = 0; i < v.size(); ++i)
        sumSquares += (double)v[i] * v[i];

    const float length = float(std::sqrt(sumSquares) + 1e-12);
    for (size_t i = 0; i < v.size(); ++i)
        v[i] /= length;
}

// Returns a DIM-dimensional vector whose components are drawn uniformly from
// [-1, 1) using `rng`, then normalized to unit length.
static Vec randUnit(std::mt19937 &rng) {
    std::uniform_real_distribution<float> uniform(-1.f, 1.f);
    Vec result(DIM);
    for (float &component : result)
        component = uniform(rng);
    normInPlace(result);
    return result;
}

static void initBases() {
    std::mt19937 rng(SEED);
    for (int n = 1; n <= 75; n++) numberBase[n] = randUnit(rng);
    for (int i = 0; i < 25; i++)  posBase[i]    = randUnit(rng);
}

// Embeds a 25-cell card as a unit vector of length DIM.
//
// For every cell i, the base vector of the number in that cell and ALPHA times
// the base vector of position i are added into the output, which is then
// normalized to unit length.
//
// Preconditions (checked with assert):
//   - every card[i] lies in 1..75, because numberBase only has meaningful
//     entries at those indices (slot 0 is never filled, anything above 75 is
//     outside the array);
//   - initBases() has already run, so the base vectors have DIM components.
static Vec cardToVec(const int card[25]) {
    Vec out(DIM, 0.f);
    for (int i = 0; i < 25; i++) {
        const int n = card[i];
        assert(n >= 1 && n <= 75 && "card number must be in 1..75");

        const Vec &b = numberBase[n];
        const Vec &p = posBase[i];
        assert(b.size() == (size_t)DIM && p.size() == (size_t)DIM &&
               "initBases() must be called before cardToVec()");

        for (int j = 0; j < DIM; j++)
            out[j] += b[j] + ALPHA * p[j];
    }
    normInPlace(out);
    return out;
}

// Writes a random card into `out`: 25 distinct numbers taken from the front of
// a shuffled deck holding 1..75. The shuffle consumes randomness from `rng`.
static void genCard(std::mt19937 &rng, int out[25]) {
    std::vector<int> deck(75);
    std::iota(deck.begin(), deck.end(), 1);
    std::shuffle(deck.begin(), deck.end(), rng);

    int *dst = out;
    for (auto it = deck.cbegin(); it != deck.cbegin() + 25; ++it)
        *dst++ = *it;
}

// Dot product of two DIM-element float arrays, with each product and the
// running sum computed in double precision.
static double dot_host(const float* a, const float* b) {
    double acc = 0;
    const float* const aEnd = a + DIM;
    while (a != aEnd) {
        acc += static_cast<double>(*a) * (*b);
        ++a;
        ++b;
    }
    return acc;
}

// ---------------- GPU K-Means Kernels ----------------

// E-step: assign every data vector to its nearest centroid and accumulate the
// per-cluster sums and counts that the M-step consumes.
//
//   data      : N vectors, vector i stored at data[i*DIM .. i*DIM + DIM - 1]
//   centroids : K vectors with the same DIM-strided layout
//   assign    : out, assign[i] = index of the chosen centroid for vector i
//   sums      : in/out, K*DIM floats; vector i is added to row assign[i]
//   counts    : in/out, K ints; counts[assign[i]] is incremented
//
// The kernel only adds to `sums` and `counts`; it never resets them.
// Distance is computed as 1 - dot(x, c).
// NOTE(review): that equals cosine distance only if vectors and centroids are
// unit length — confirm against the callers.
__global__ void assignAndAccumulateKernel(const float* data,
                                          int N,
                                          const float* centroids,
                                          int K,
                                          int* assign,
                                          float* sums,
                                          int* counts) {
    // Grid-stride loop: each thread starts at its global index and advances by
    // the total number of launched threads until it runs past N.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += gridDim.x * blockDim.x) {

        const float* xi = data + (size_t)i * DIM;

        int bestC = 0;
        float bestD = 1e30f; // larger than any distance computed below for the first comparison to win

        // Linear scan over all K centroids.
        for (int c = 0; c < K; ++c) {
            const float* ctr = centroids + (size_t)c * DIM;
            float dot = 0.f;
            for (int d = 0; d < DIM; ++d) {
                dot += xi[d] * ctr[d];
            }
            float dist = 1.f - dot;
            // Strict '<' means that on equal distances the lower centroid index is kept.
            if (dist < bestD) {
                bestD = dist;
                bestC = c;
            }
        }

        assign[i] = bestC;

        // Atomic add to accumulators (Naive approach, ok for demo):
        // one atomic for the count plus DIM atomics for the vector sum.
        atomicAdd(&counts[bestC], 1);
        size_t base = (size_t)bestC * DIM;
        for (int d = 0; d < DIM; ++d) {
            atomicAdd(&sums[base + d], xi[d]);
        }
    }
}

// M-step: one thread per cluster recomputes that cluster's centroid.
//
// For a cluster with at least one member, the centroid becomes the mean of the
// accumulated vectors (sums row / count), rescaled to unit length. A cluster
// whose count is zero keeps its previous centroid.
__global__ void updateCentroidsKernel(float* centroids,
                                      const float* sums,
                                      const int* counts,
                                      int K) {
    const int cluster = blockIdx.x * blockDim.x + threadIdx.x;
    if (cluster >= K) return;

    const int members = counts[cluster];
    if (members <= 0) return; // nothing assigned: leave the centroid as it was

    const size_t offset = (size_t)cluster * DIM;
    float* centroid = centroids + offset;
    const float* total = sums + offset;

    // Pass 1: write the mean and accumulate its squared norm in double.
    double sqNorm = 0.0;
    for (int d = 0; d < DIM; ++d) {
        const float mean = total[d] / (float)members;
        centroid[d] = mean;
        sqNorm += (double)mean * (double)mean;
    }

    // Pass 2: normalize; the 1e-12 term keeps the divisor away from zero.
    const float length = float(std::sqrt(sqNorm) + 1e-12);
    for (int d = 0; d < DIM; ++d) {
        centroid[d] /= length;
    }
}


// ---------------- Host: Inverted Lists ----------------

// Groups vector ids by cluster: after the call, lists[c] holds (in increasing
// order) every index i < N with assign[i] == c.
//
//   assign : cluster id per vector; ids outside [0, K) are ignored
//   N      : number of entries of `assign` to consider; values larger than
//            assign.size() are clamped so the function never reads past the end
//   K      : number of clusters; K <= 0 produces an empty `lists`
//   lists  : output, resized to K lists (previous content is discarded)
//
// A counting pass sizes every list up front, so the fill pass appends without
// repeatedly reallocating.
static void buildInvertedLists(
    const std::vector<int>& assign,
    int N, int K,
    std::vector<std::vector<int>>& lists
) {
    const size_t numClusters = K > 0 ? (size_t)K : 0;
    size_t numVectors = N > 0 ? (size_t)N : 0;
    if (numVectors > assign.size()) numVectors = assign.size();

    lists.assign(numClusters, std::vector<int>());

    // Pass 1: count members per cluster.
    std::vector<size_t> sizes(numClusters, 0);
    for (size_t i = 0; i < numVectors; ++i) {
        const int c = assign[i];
        if (c >= 0 && c < K) ++sizes[(size_t)c];
    }
    for (size_t c = 0; c < numClusters; ++c) {
        lists[c].reserve(sizes[c]);
    }

    // Pass 2: append ids in index order.
    for (size_t i = 0; i < numVectors; ++i) {
        const int c = assign[i];
        if (c >= 0 && c < K) {
            lists[(size_t)c].push_back((int)i);
        }
    }
}



// ---------------- Main ----------------
int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int BLOCK  = prop.multiProcessorCount; 
    
    printf("Params: N=%d  K=%d  nprobe=%d  TOPK=%d  DIM=%d  KMEANS_ITERS=%d\n",
           N, K, NPROBE, TOPK, DIM, KMEANS_ITERS);
    printf("Mode: GPU Training -> CPU Search\n");

    initBases();
    std::mt19937 rng(SEED + 7);

    // 1) Data Generation
    std::vector<int>   h_cards((size_t)N * 25);
    std::vector<float> h_data((size_t)N * DIM);

    printf("[INIT] Generating %d vectors...\n", N);
    for (int i = 0; i < N; i++) {
        int c[25];
        genCard(rng, c);
        for (int t = 0; t < 25; ++t) h_cards[(size_t)i * 25 + t] = c[t];
        Vec v = cardToVec(c);
        for (int d = 0; d < DIM; ++d) h_data[(size_t)i * DIM + d] = v[d];
    }

    // 2) Build Query
    int qc[25];
    for (int t = 0; t < 25; ++t) qc[t] = h_cards[t]; // Copy 0-th card
    qc[3] = 75; qc[17] = 1; std::swap(qc[5], qc[19]); // Modify it
    Vec qvec = cardToVec(qc);
    std::vector<float> q_host(qvec.begin(), qvec.end());

    {
        double dot0 = dot_host(q_host.data(), &h_data[0]);
        printf("[DEBUG] Query vs Data[0]: dot=%.9f dist=%.9f\n", dot0, 1.0 - dot0);
    }

    // 3) GPU Allocations for Training
    float *d_data = 0, *d_centroids = 0;
    int   *d_assign = 0, *d_counts = 0;
    float *d_sums = 0;

    cudaMalloc(&d_data,      (size_t)N * DIM * sizeof(float));
    cudaMalloc(&d_centroids, (size_t)K * DIM * sizeof(float));
    cudaMalloc(&d_assign,    N * sizeof(int));
    cudaMalloc(&d_sums,      (size_t)K * DIM * sizeof(float));
    cudaMalloc(&d_counts,    K * sizeof(int));

    cudaMemcpy(d_data, h_data.data(), (size_t)N * DIM * sizeof(float), cudaMemcpyHostToDevice);

    // 4) Initialize Centroids (Random select from data)
    {
        std::vector<int> idx(N);
        std::iota(idx.begin(), idx.end(), 0);
        std::shuffle(idx.begin(), idx.end(), rng);
        std::vector<float> h_initC((size_t)K * DIM);
        for (int c = 0; c < K; ++c) {
            int i = idx[c];
            std::copy_n(&h_data[(size_t)i * DIM], DIM, &h_initC[(size_t)c * DIM]);
        }
        cudaMemcpy(d_centroids, h_initC.data(), (size_t)K * DIM * sizeof(float), cudaMemcpyHostToDevice);
    }

    // 5) GPU K-Means Training
    {
        dim3 block(BLOCK);
        dim3 gridN((N + BLOCK - 1) / BLOCK);
        dim3 gridK((K + BLOCK - 1) / BLOCK);

        printf("[TRAIN] Running K-Means on GPU (%d iters)...\n", KMEANS_ITERS);

        for (int it = 0; it < KMEANS_ITERS; ++it) {
            cudaMemset(d_sums,   0, (size_t)K * DIM * sizeof(float));
            cudaMemset(d_counts, 0, K * sizeof(int));

            assignAndAccumulateKernel<<<gridN, block>>>(
                d_data, N, d_centroids, K, d_assign, d_sums, d_counts
            );
            cudaDeviceSynchronize();

            updateCentroidsKernel<<<gridK, block>>>(
                d_centroids, d_sums, d_counts, K
            );
            cudaDeviceSynchronize();
        }
    }

    // 6) Retrieve Training Results
    std::vector<int> h_assign(N);
    cudaMemcpy(h_assign.data(), d_assign, N * sizeof(int), cudaMemcpyDeviceToHost);

    // Retrieve final centroids for CPU search
    std::vector<float> h_finalCentroids(K * DIM);
    cudaMemcpy(h_finalCentroids.data(), d_centroids, (size_t)K * DIM * sizeof(float), cudaMemcpyDeviceToHost);

    // We are done with GPU memory
    cudaFree(d_data);
    cudaFree(d_centroids);
    cudaFree(d_assign);
    cudaFree(d_sums);
    cudaFree(d_counts);

    // 7) Build IVF Index (Host)
    printf("[INDEX] Building Inverted Lists on Host...\n");
    std::vector<std::vector<int>> lists;
    buildInvertedLists(h_assign, N, K, lists);

    int nonEmpty = 0;
    for(const auto& list : lists) if(!list.empty()) nonEmpty++;
    printf("[INDEX] Non-empty clusters: %d / %d\n", nonEmpty, K);

    // ---------------------------------------------------------
    //  SEARCH PHASE (CPU)
    // ---------------------------------------------------------
    printf("\n[SEARCH] CPU Search (nprobe=%d)...\n", NPROBE);

    // Step A: Coarse Search (Find nearest NPROBE clusters)
    // Format: {distance, cluster_id}
    std::vector<std::pair<float, int>> centerDists;
    centerDists.reserve(K);

    for (int c = 0; c < K; ++c) {
        const float* ctr = &h_finalCentroids[(size_t)c * DIM];
        double dotVal = dot_host(q_host.data(), ctr);
        float dist = 1.0f - (float)dotVal;
        centerDists.push_back({dist, c});
    }

    // Sort centroids by distance ASC
    std::sort(centerDists.begin(), centerDists.end(), 
              [](const std::pair<float, int>& a, const std::pair<float, int>& b){
                  return a.first < b.first;
              });

    // Step B: Gather Candidates & Exact Search
    // Format: {distance, vector_id}
    std::vector<std::pair<float, int>> candidates;
    // Reserve some memory to avoid reallocations (heuristic)
    candidates.reserve((N / K) * NPROBE * 2);

    int visitedVecs = 0;
    for (int i = 0; i < NPROBE && i < K; ++i) {
        int c_id = centerDists[i].second;
        const auto& bucket = lists[c_id];
        visitedVecs += bucket.size();

        for (int vecIdx : bucket) {
            const float* vec = &h_data[(size_t)vecIdx * DIM];
            double dotVal = dot_host(q_host.data(), vec);
            float dist = 1.0f - (float)dotVal;
            candidates.push_back({dist, vecIdx});
        }
    }

    printf("[SEARCH] Scanned %d vectors from top %d clusters.\n", visitedVecs, NPROBE);

    // Step C: Ranking (Top-K)
    if (candidates.empty()) {
        printf("No candidates found.\n");
    } else {
        int finalK = std::min((int)candidates.size(), TOPK);
        
        // Partial sort gives us the smallest K elements at the beginning
        std::partial_sort(candidates.begin(), 
                          candidates.begin() + finalK, 
                          candidates.end(),
                          [](const std::pair<float, int>& a, const std::pair<float, int>& b){
                              return a.first < b.first;
                          });

        printf("\nTop-%d Results:\n", finalK);
        for (int i = 0; i < finalK; ++i) {
            printf("%2d) id=%d  dist=%.6f  sim=%.6f\n",
                   i+1, candidates[i].second, candidates[i].first, 1.f - candidates[i].first);
        }
    }

    return 0;
}

Overwriting origin.cu


In [6]:
%%bash
# Compile origin.cu into the executable ./origin: -O3 optimization, C++17,
# code generated for the sm_70 architecture; -Wno-deprecated-gpu-targets
# suppresses nvcc's warning about deprecated GPU targets.
nvcc -O3 -std=c++17 -arch=sm_70 origin.cu -o origin -Wno-deprecated-gpu-targets

In [7]:
%%bash
# Run ./origin under nvprof; after the program output it prints a summary of
# time spent per GPU activity (kernels, memcpy, memset) and per CUDA API call.
nvprof ./origin

==102655== NVPROF is profiling process 102655, command: ./origin


Params: N=16777216  K=1024  nprobe=32  TOPK=5  DIM=64  KMEANS_ITERS=15
Mode: GPU Training -> CPU Search
[INIT] Generating 16777216 vectors...
[DEBUG] Query vs Data[0]: dot=0.927471359 dist=0.072528641
[TRAIN] Running K-Means on GPU (15 iters)...
[INDEX] Building Inverted Lists on Host...
[INDEX] Non-empty clusters: 1024 / 1024

[SEARCH] CPU Search (nprobe=32)...
[SEARCH] Scanned 557487 vectors from top 32 clusters.

Top-5 Results:
 1) id=0  dist=0.072529  sim=0.927471
 2) id=15528589  dist=0.136097  sim=0.863903
 3) id=13929498  dist=0.139304  sim=0.860696
 4) id=8185180  dist=0.146311  sim=0.853689
 5) id=16494452  dist=0.150325  sim=0.849675


==102655== Profiling application: ./origin
==102655== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   70.15%  10.2910s        15  686.07ms  645.03ms  689.22ms  assignAndAccumulateKernel(float const *, int, float const *, int, int*, float*, int*)
                   29.40%  4.31305s         2  2.15653s  52.799us  4.31300s  [CUDA memcpy HtoD]
                    0.45%  65.979ms         2  32.989ms  61.951us  65.917ms  [CUDA memcpy DtoH]
                    0.00%  300.99us        15  20.066us  19.680us  22.240us  updateCentroidsKernel(float*, float const *, int const *, int)
                    0.00%  44.541us        30  1.4840us  1.0870us  2.3030us  [CUDA memset]
      API calls:   65.85%  10.2929s        30  343.10ms  19.968us  689.27ms  cudaDeviceSynchronize
                   28.04%  4.38281s         4  1.09570s  397.92us  4.31431s  cudaMemcpy
                    6.00%  937.81ms         5  187.56ms  5.1660us  937.31

In [8]:
%%bash
# Profile ./origin with Nsight Systems and write the report to origin.nsys-rep.
# The executable must be passed after the options: `-o` consumes the argument
# that follows it as the report name. --force-overwrite lets the cell be re-run
# without failing on an existing report file.
nsys profile --force-overwrite true -o origin ./origin

         This may increase runtime overhead and the likelihood of false
         dependencies across CUDA Streams. If you wish to avoid this, please
         disable the feature with --cuda-event-trace=false.
Try the 'nsys status --environment' command to learn more.

Try the 'nsys status --environment' command to learn more.



Params: N=16777216  K=1024  nprobe=32  TOPK=5  DIM=64  KMEANS_ITERS=15
Mode: GPU Training -> CPU Search
[INIT] Generating 16777216 vectors...
[DEBUG] Query vs Data[0]: dot=0.927471359 dist=0.072528641
[TRAIN] Running K-Means on GPU (15 iters)...
[INDEX] Building Inverted Lists on Host...
[INDEX] Non-empty clusters: 1024 / 1024

[SEARCH] CPU Search (nprobe=32)...
[SEARCH] Scanned 557512 vectors from top 32 clusters.

Top-5 Results:
 1) id=0  dist=0.072529  sim=0.927471
 2) id=15528589  dist=0.136097  sim=0.863903
 3) id=13929498  dist=0.139304  sim=0.860696
 4) id=8185180  dist=0.146311  sim=0.853689
 5) id=16494452  dist=0.150325  sim=0.849675
Collecting data...
Generating '/tmp/nsys-report-2f4c.qdstrm'
Generated:
	/home/jupyter-feifan_chen@dlsu.e-15ebb/report1.nsys-rep
