In [2]:
%%writefile C_ann_3.cpp

#include <algorithm>
#include <cassert>
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <limits>
#include <numeric>
#include <random>
#include <unordered_set>
#include <vector>

// ---------------- Config ----------------
static const int DIM          = 64;
static const float ALPHA      = 0.7f;
static const int SEED         = 2025; 
static const int N            = 1 << 14; // demo: 1M 160m
static const int K            = 1024;
static const int NPROBE       = 4;       // cluster depth during search
static const int TOPK         = 5;

static const int KMEANS_ITERS = 15;      // K-Means iters

using Vec = std::vector<float>;

struct Pair { float key; int id; };

// ---------------- Embedding basic ----------------

// One basis vector per card number. initBases() fills indices 1..75;
// index 0 is never assigned and stays an empty Vec.
static Vec numberBase[76]; // 1..75
// One basis vector per card position; initBases() fills all 25 entries.
static Vec posBase[25];    // 0..24

// Rescales `v` in place to unit Euclidean length.
// The squared norm is accumulated in double precision, and 1e-12 is added to
// the norm before the division so an all-zero vector stays all-zero instead
// of dividing by zero.
static void normInPlace(Vec &v) {
    double sumSq = 0;
    for (size_t i = 0; i < v.size(); ++i) {
        sumSq += (double)v[i] * v[i];
    }

    const float norm = float(std::sqrt(sumSq) + 1e-12);
    for (size_t i = 0; i < v.size(); ++i) {
        v[i] /= norm;
    }
}

// Draws DIM components uniformly from [-1, 1) using `rng` and returns the
// resulting vector after normalising it with normInPlace.
static Vec randUnit(std::mt19937 &rng) {
    std::uniform_real_distribution<float> component(-1.f, 1.f);

    Vec result(DIM);
    for (float &x : result) {
        x = component(rng);
    }

    normInPlace(result);
    return result;
}

static void initBases() {
    std::mt19937 rng(SEED);
    for (int n = 1; n <= 75; n++) numberBase[n] = randUnit(rng);
    for (int i = 0; i < 25; i++)  posBase[i]    = randUnit(rng);
}

// Embeds a 25-cell card as a unit vector of length DIM.
// Each cell contributes numberBase[number] + ALPHA * posBase[position]; the
// sum over all cells is normalised before being returned.
// A cell value outside 1..75 prints an error to stderr and exits with status 1.
static Vec cardToVec(const int card[25]) {
    Vec acc(DIM, 0.f);

    for (int pos = 0; pos < 25; ++pos) {
        const int number = card[pos];
        const bool inRange = (number >= 1 && number <= 75);
        if (!inRange) {
            fprintf(stderr, "number out of range\n");
            exit(1);
        }

        const Vec &numVec = numberBase[number];
        const Vec &posVec = posBase[pos];
        for (int d = 0; d < DIM; ++d) {
            acc[d] += numVec[d] + ALPHA * posVec[d];
        }
    }

    normInPlace(acc);
    return acc;
}

// Writes 25 distinct numbers from 1..75 into `out`: the values 1..75 are
// shuffled with `rng` and the first 25 of the shuffled sequence are copied out.
static void genCard(std::mt19937 &rng, int out[25]) {
    std::vector<int> deck(75);
    std::iota(deck.begin(), deck.end(), 1);
    std::shuffle(deck.begin(), deck.end(), rng);
    std::copy_n(deck.begin(), 25, out);
}

// Double-precision dot product of two float arrays of length DIM.
// Each factor pair is multiplied in double (the first operand is widened
// before the multiply), so the result is more precise than a float loop.
static double dot_host(const float* a, const float* b) {
    double acc = 0;
    const float* const aEnd = a + DIM;
    for (; a != aEnd; ++a, ++b) {
        acc += (double)(*a) * (*b);
    }
    return acc;
}

// ---------------- Device: Distance Kernels ----------------

// Serial stand-in for the CUDA kernel of the same name.
// For each of the Ntot rows of `data` (row-major, DIM floats per row) writes
// out[i] = 1 - dot(row_i, q). This is a cosine distance only if the rows and
// q are unit length; nothing is normalised here.
void cosineDistKernel(const float* data,
                      const float* q,
                      int Ntot,
                      float* out) {
    int rowIdx = 0;
    while (rowIdx < Ntot) {
        const float* rowPtr = data + (size_t)rowIdx * DIM;

        float acc = 0.f;
        for (int k = 0; k < DIM; ++k) {
            acc += rowPtr[k] * q[k];
        }

        out[rowIdx] = 1.f - acc;
        ++rowIdx;
    }
}

// Serial stand-in for the CUDA kernel of the same name.
// For each of the M selected row indices idxSel[j], writes
// distSel[j] = 1 - dot(data row idxSel[j], q). Rows hold DIM floats.
// The indices are used as given; no range check is performed.
void cosineDistIndexKernel(const float* data,
                           const float* q,
                           const int* idxSel,
                           int M,
                           float* distSel) {
    int slot = 0;
    while (slot < M) {
        const int rowIdx = idxSel[slot];
        const float* rowPtr = data + (size_t)rowIdx * DIM;

        float acc = 0.f;
        for (int k = 0; k < DIM; ++k) {
            acc += rowPtr[k] * q[k];
        }

        distSel[slot] = 1.f - acc;
        ++slot;
    }
}

// // data[0..Ntot-1] vs single q
// __global__ void cosineDistKernel(const float* data,
//                                  const float* q,
//                                  int Ntot,
//                                  float* out) {
//     for (int i = blockIdx.x * blockDim.x + threadIdx.x;
//          i < Ntot;
//          i += gridDim.x * blockDim.x) {
//         const float* row = data + (size_t)i * DIM;
//         float dot = 0.f;
//         for (int d = 0; d < DIM; ++d) dot += row[d] * q[d];
//         out[i] = 1.f - dot;
//     }
// }

// // assign idxSel[0..M-1] vs q
// __global__ void cosineDistIndexKernel(const float* data,
//                                       const float* q,
//                                       const int* idxSel,
//                                       int M,
//                                       float* distSel) {
//     for (int j = blockIdx.x * blockDim.x + threadIdx.x;
//          j < M;
//          j += gridDim.x * blockDim.x) {
//         int i = idxSel[j];
//         const float* row = data + (size_t)i * DIM;
//         float dot = 0.f;
//         for (int d = 0; d < DIM; ++d) dot += row[d] * q[d];
//         distSel[j] = 1.f - dot;
//     }
// }

// ----------------  GPU K-Means Kernels ----------------

// K-Means assignment step plus accumulation (serial port of the CUDA kernel).
// For every point i in [0, N):
//   - finds the centroid c in [0, K) with the smallest 1 - dot(point, centroid),
//     ties going to the lowest c, and stores it in assign[i];
//   - increments counts[c] and adds the point into row c of `sums`.
// `sums` and `counts` are only ever incremented here, never cleared, so the
// caller has to zero them before each call.
void assignAndAccumulateKernel(const float* data,
                               int N,
                               const float* centroids,
                               int K,
                               int* assign,
                               float* sums,
                               int* counts) {
    for (int i = 0; i < N; ++i) {
        const float* point = data + (size_t)i * DIM;

        // Linear scan over all centroids for the closest one.
        int nearest = 0;
        float nearestDist = 1e30f;
        for (int c = 0; c < K; ++c) {
            const float* centroid = centroids + (size_t)c * DIM;

            float acc = 0.f;
            for (int k = 0; k < DIM; ++k) {
                acc += point[k] * centroid[k];
            }

            const float dist = 1.f - acc;
            if (dist < nearestDist) {
                nearestDist = dist;
                nearest = c;
            }
        }

        assign[i] = nearest;

        // The CUDA version used atomicAdd for these updates; the serial loop
        // needs only plain increments.
        counts[nearest] += 1;
        float* sumRow = sums + (size_t)nearest * DIM;
        for (int k = 0; k < DIM; ++k) {
            sumRow[k] += point[k];
        }
    }
}

// K-Means update step (serial port of the CUDA kernel).
// For each cluster with at least one member, replaces its centroid with
// sums[c] / counts[c] and rescales that mean to unit length (1e-12 is added
// to the norm before dividing). Clusters with a zero count keep their
// previous centroid.
void updateCentroidsKernel(float* centroids,
                           const float* sums,
                           const int* counts,
                           int K) {
    for (int c = 0; c < K; ++c) {
        const int members = counts[c];
        if (members <= 0) {
            continue;  // empty cluster: leave the centroid as it was
        }

        float* centroid = centroids + (size_t)c * DIM;
        const float* sumRow = sums + (size_t)c * DIM;

        double normSq = 0.0;
        for (int k = 0; k < DIM; ++k) {
            const float mean = sumRow[k] / (float)members;
            centroid[k] = mean;
            normSq += (double)mean * (double)mean;
        }

        const float norm = float(std::sqrt(normSq) + 1e-12);
        for (int k = 0; k < DIM; ++k) {
            centroid[k] /= norm;
        }
    }
}

// // E-step + accumulate:
// // every data to search nearest center -> assign[i]
// // atomicAdd sums[c][d] and counts[c]
// __global__ void assignAndAccumulateKernel(const float* data,
//                                           int N,
//                                           const float* centroids,
//                                           int K,
//                                           int* assign,
//                                           float* sums,
//                                           int* counts) {
//     for (int i = blockIdx.x * blockDim.x + threadIdx.x;
//          i < N;
//          i += gridDim.x * blockDim.x) {

//         const float* xi = data + (size_t)i * DIM;

//         int bestC = 0;
//         float bestD = 1e30f;

//         for (int c = 0; c < K; ++c) {
//             const float* ctr = centroids + (size_t)c * DIM;
//             float dot = 0.f;
//             for (int d = 0; d < DIM; ++d) {
//                 dot += xi[d] * ctr[d];
//             }
//             float dist = 1.f - dot;
//             if (dist < bestD) {
//                 bestD = dist;
//                 bestC = c;
//             }
//         }

//         assign[i] = bestC;

//         atomicAdd(&counts[bestC], 1);
//         size_t base = (size_t)bestC * DIM;
//         for (int d = 0; d < DIM; ++d) {
//             atomicAdd(&sums[base + d], xi[d]);
//         }
//     }
// }

// // M-step: sums/counts -> update centroid + norm
// __global__ void updateCentroidsKernel(float* centroids,
//                                       const float* sums,
//                                       const int* counts,
//                                       int K) {
//     int c = blockIdx.x * blockDim.x + threadIdx.x;
//     if (c >= K) return;

//     int cnt = counts[c];
//     float* ctr = centroids + (size_t)c * DIM;
//     const float* sumc = sums + (size_t)c * DIM;

//     if (cnt > 0) {
//         double norm2 = 0.0;
//         for (int d = 0; d < DIM; ++d) {
//             float v = sumc[d] / (float)cnt;
//             ctr[d] = v;
//             norm2 += (double)v * (double)v;
//         }
//         float n = float(std::sqrt(norm2) + 1e-12);
//         for (int d = 0; d < DIM; ++d) {
//             ctr[d] /= n;
//         }
//     }
// }

// ---------------- Host: Inverted Lists (vector<vector<int>>) ----------------

// Groups point ids by their assigned cluster.
// `lists` is reset to K empty buckets; point i (0 <= i < N) is then appended
// to lists[assign[i]]. Assignments outside [0, K) are skipped silently.
// Ids inside each bucket end up in increasing order.
static void buildInvertedLists(
    const std::vector<int>& assign,
    int N, int K,
    std::vector<std::vector<int>>& lists
) {
    lists.assign((size_t)K, std::vector<int>());

    for (int pointId = 0; pointId < N; ++pointId) {
        const int cluster = assign[pointId];
        if (cluster < 0 || cluster >= K) {
            continue;
        }
        lists[cluster].push_back(pointId);
    }
}

// Flattens the per-cluster buckets into CSR form.
// After the call, listIds holds the buckets concatenated in cluster order and
// bucket c occupies listIds[listOffsets[c] .. listOffsets[c + 1]).
// listOffsets has K + 1 entries; N is used only as a capacity hint for listIds.
static void buildCSRFromLists(
    const std::vector<std::vector<int>>& lists,
    int K, int N,
    std::vector<int>& listOffsets,
    std::vector<int>& listIds
) {
    listOffsets.assign((size_t)(K + 1), 0);
    listIds.clear();
    listIds.reserve(N);

    for (int c = 0; c < K; ++c) {
        // listIds was cleared above, so its current size is the running offset.
        listOffsets[c] = (int)listIds.size();
        for (int id : lists[c]) {
            listIds.push_back(id);
        }
    }
    listOffsets[K] = (int)listIds.size();
}

// ---------------- Device: gatherCandidates ----------------

// Serial port of the CUDA gather kernel: concatenates the inverted lists of
// the probed clusters into one candidate array.
//
//   centIds     : cluster ids to probe; the first `nprobe` entries are read
//   nprobe      : number of clusters to probe
//   listOffsets : CSR offsets; list c spans [listOffsets[c], listOffsets[c + 1])
//   listIds     : CSR ids (vector ids concatenated by cluster)
//   outIdx      : receives the gathered ids, appended starting at *outCount
//   outCount    : in/out running number of ids stored in outIdx
//
// No capacity check is made on outIdx; the caller must size it for the sum of
// the probed list lengths.
void gatherCandidatesKernel(
    const int* centIds,
    int nprobe,
    const int* listOffsets,
    const int* listIds,
    int* outIdx,
    int* outCount
) {
    for (int p = 0; p < nprobe; p++) {
        const int c     = centIds[p];
        const int start = listOffsets[c];
        const int end   = listOffsets[c + 1];
        const int len   = end - start;

        // In the CUDA kernel each probe ran in its own thread, so `return`
        // ended only that probe. In this serial loop the equivalent is
        // `continue`; a `return` would drop every probe after the first
        // empty cluster.
        if (len <= 0) continue;

        // Serial replacement for `int base = atomicAdd(outCount, len);`
        const int base = *outCount;
        *outCount += len;
        for (int i = 0; i < len; ++i) {
            outIdx[base + i] = listIds[start + i];
        }
    }
}

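// // centIds:     cluster ids to probe (one per thread; the first nprobe are read)
// // nprobe:      number of clusters to probe
// // listOffsets: CSR offsets; list c spans [listOffsets[c], listOffsets[c + 1])
// // listIds:     CSR ids (vector ids concatenated by cluster)
// // outIdx:      output buffer receiving the gathered candidate ids
// // outCount:    running count of gathered ids; each thread reserves its range with atomicAdd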
// __global__ void gatherCandidatesKernel(
//     const int* centIds,
//     int nprobe,
//     const int* listOffsets,
//     const int* listIds,
//     int* outIdx,
//     int* outCount
// ) {
//     int p = blockIdx.x * blockDim.x + threadIdx.x;
//     if (p >= nprobe) return;

//     int c = centIds[p];
//     int start = listOffsets[c];
//     int end   = listOffsets[c + 1];
//     int len   = end - start;
//     if (len <= 0) return;

//     int base = atomicAdd(outCount, len);
//     for (int i = 0; i < len; ++i) {
//         outIdx[base + i] = listIds[start + i];
//     }
// }

// ---------------- Main ----------------
// CPU reference driver for the IVF-style approximate nearest-neighbour demo.
//
// Pipeline: generate N card embeddings -> K-Means for KMEANS_ITERS rounds ->
// inverted lists in CSR form -> probe the NPROBE closest clusters -> exact
// re-ranking of the gathered candidates -> print up to TOPK hits.
//
// The d_* buffers are plain host allocations here; the prefix is kept so each
// step lines up with the commented-out CUDA version in this file. Short
// "GPU:" notes name the CUDA/Thrust/CUB call that each CPU step replaces.
int main() {
    // Centroid initialisation reads idx[0..K-1] out of an N-element shuffle,
    // and the search reads the first NPROBE entries of a K-element id list.
    static_assert(K <= N, "K must not exceed N");
    static_assert(NPROBE <= K, "NPROBE must not exceed K");

    // malloc that terminates the program instead of returning a null pointer
    // the code below would dereference. A zero-byte request may legitimately
    // yield null and is not treated as a failure.
    auto checkedAlloc = [](size_t bytes) -> void* {
        void* p = malloc(bytes);
        if (p == nullptr && bytes != 0) {
            fprintf(stderr, "allocation of %zu bytes failed\n", bytes);
            exit(1);
        }
        return p;
    };

    printf("Params: N=%d  K=%d  nprobe=%d  TOPK=%d  DIM=%d  KMEANS_ITERS=%d\n",
           N, K, NPROBE, TOPK, DIM, KMEANS_ITERS);

    initBases();
    std::mt19937 rng(SEED + 7);

    clock_t start, end;
    double elapse = 0.0;
    double time_taken = 0.0;

    // Phase 1 timer. It starts before dataset generation, so the time printed
    // after K-Means also covers steps 1-4 below.
    start = clock();

    // 1) Build the dataset: N random cards and their embeddings.
    std::vector<int>   h_cards((size_t)N * 25);
    std::vector<float> h_data((size_t)N * DIM);

    for (int i = 0; i < N; i++) {
        int c[25];
        genCard(rng, c);
        for (int t = 0; t < 25; ++t)
            h_cards[(size_t)i * 25 + t] = c[t];
        Vec v = cardToVec(c);
        for (int d = 0; d < DIM; ++d)
            h_data[(size_t)i * DIM + d] = v[d];
    }

    // 2) Build the query: card 0 with two cells overwritten and two swapped,
    //    so data[0] is expected to be a close match.
    int qc[25];
    for (int t = 0; t < 25; ++t) qc[t] = h_cards[t];
    qc[3] = 75; qc[17] = 1; std::swap(qc[5], qc[19]);
    Vec qvec = cardToVec(qc);

    {
        std::vector<float> qhost(qvec.begin(), qvec.end());
        std::vector<float> d0(DIM);
        for (int i = 0; i < DIM; i++) d0[i] = h_data[i];
        double dot0 = dot_host(qhost.data(), d0.data());
        printf("[DEBUG #1] host dot(q, data[0])=%.9f  dist=%.9f\n",
               dot0, 1.0 - dot0);
    }

    // 3) Working buffers (GPU: cudaMalloc + cudaMemcpy host-to-device).
    float *d_data        = (float*)checkedAlloc((size_t)N * DIM * sizeof(float));
    float *d_q           = (float*)checkedAlloc(DIM * sizeof(float));
    float *d_centroids   = (float*)checkedAlloc((size_t)K * DIM * sizeof(float));
    float *d_tmpCentDist = (float*)checkedAlloc(K * sizeof(float));

    memcpy(d_data, h_data.data(),
               (size_t)N * DIM * sizeof(float));
    memcpy(d_q, qvec.data(),
               DIM * sizeof(float));

    // 4) Initial centroids: K distinct data points chosen by shuffling the ids.
    {
        std::vector<int> idx(N);
        std::iota(idx.begin(), idx.end(), 0);
        std::shuffle(idx.begin(), idx.end(), rng);
        std::vector<float> h_initC((size_t)K * DIM);
        for (int c = 0; c < K; ++c) {
            int i = idx[c];
            std::copy_n(&h_data[(size_t)i * DIM], DIM,
                        &h_initC[(size_t)c * DIM]);
        }

        memcpy(d_centroids, h_initC.data(),
                   (size_t)K * DIM * sizeof(float));
    }

    int   *d_assign = (int*)checkedAlloc(N * sizeof(int));
    float *d_sums   = (float*)checkedAlloc((size_t)K * DIM * sizeof(float));
    int   *d_counts = (int*)checkedAlloc(K * sizeof(int));

    // 5) K-Means (GPU: cudaMemset + two kernel launches per iteration).
    {
        printf("[BUILD] K-Means: iters=%d\n", KMEANS_ITERS);

        for (int it = 0; it < KMEANS_ITERS; ++it) {
            // The accumulators must start from zero every iteration.
            memset(d_sums,   0, (size_t)K * DIM * sizeof(float));
            memset(d_counts, 0, K * sizeof(int));

            assignAndAccumulateKernel(
                d_data, N, d_centroids, K,
                d_assign, d_sums, d_counts
            );

            updateCentroidsKernel(
                d_centroids, d_sums, d_counts, K
            );
        }

        end = clock();
        time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
        elapse = elapse + time_taken;
        printf("K-means (in C++) average time is %f milliseconds to execute an array size %d \n", elapse, N);
    }

    // Phase 2 timer: index construction plus the search.
    start = clock();

    // 6) Build the inverted index on the host (GPU: cudaMemcpy device-to-host).
    std::vector<int> h_assign(N);
    memcpy(h_assign.data(), d_assign,
               N * sizeof(int));

    std::vector<std::vector<int>> lists;
    buildInvertedLists(h_assign, N, K, lists);

    int nonEmpty = 0;
    for (int c = 0; c < K; ++c)
        if (!lists[c].empty()) nonEmpty++;
    printf("[BUILD] Non-empty clusters: %d / %d\n", nonEmpty, K);

    // 7) Flatten the lists to CSR and copy them into the search buffers
    //    (GPU: cudaMalloc + cudaMemcpy host-to-device).
    std::vector<int> h_listOffsets;
    std::vector<int> h_listIds;
    buildCSRFromLists(lists, K, N, h_listOffsets, h_listIds);

    int *d_listOffsets = (int*)checkedAlloc((K + 1) * sizeof(int));
    int *d_listIds     = (int*)checkedAlloc(h_listIds.size() * sizeof(int));

    memcpy(d_listOffsets, h_listOffsets.data(),
               (K + 1) * sizeof(int));
    memcpy(d_listIds, h_listIds.data(),
               h_listIds.size() * sizeof(int));

    // 8) Search stage 1: distance from the query to every centroid, then
    //    centroid ids ordered by increasing distance
    //    (GPU: cosineDistKernel launch + thrust::sort_by_key).
    cosineDistKernel(d_centroids, d_q, K, d_tmpCentDist);

    std::vector<int> h_centIds(K);
    std::iota(h_centIds.begin(), h_centIds.end(), 0);
    std::sort(h_centIds.begin(), h_centIds.end(),
        [d_tmpCentDist](int a, int b) {
            return d_tmpCentDist[a] < d_tmpCentDist[b];
        }
    );

    // 9) Gather the members of the NPROBE nearest clusters as candidates.
    //    d_outIdx is sized N, an upper bound on the candidate count.
    int *d_outIdx   = (int*)checkedAlloc(N * sizeof(int));
    int *d_outCount = (int*)checkedAlloc(sizeof(int));
    *d_outCount = 0;

    gatherCandidatesKernel(
        h_centIds.data(),
        NPROBE,
        d_listOffsets,
        d_listIds,
        d_outIdx,
        d_outCount
    );

    int M = 0;
    memcpy(&M, d_outCount, sizeof(int));
    printf("Candidates from chosen centers: M=%d (of N=%d)\n", M, N);

    if (M <= 0) {
        printf("No candidates, abort.\n");
    } else {
        // 10) Search stage 2: exact distances for the candidates, sorted
        //     ascending (GPU: cosineDistIndexKernel + cub::DeviceRadixSort::SortPairs).

        // Distance from the query to each gathered candidate.
        float *d_selDist = (float*)checkedAlloc(M * sizeof(float));
        cosineDistIndexKernel(
            d_data, d_q, d_outIdx, M, d_selDist
        );

        // Sorted outputs: distances and the matching candidate ids.
        float *d_keys_out = (float*)checkedAlloc(M * sizeof(float));
        int   *d_vals_out = (int*)checkedAlloc(M * sizeof(int));

        // std::sort on (distance, id) pairs stands in for the CUB radix sort;
        // equal distances are ordered by id.
        std::vector<std::pair<float, int>> pairs(M);
        for (int i = 0; i < M; ++i) {
            pairs[i] = {d_selDist[i], d_outIdx[i]};
        }

        std::sort(pairs.begin(), pairs.end());

        for (int i = 0; i < M; ++i) {
            d_keys_out[i] = pairs[i].first;  // sorted distance
            d_vals_out[i] = pairs[i].second; // corresponding vector id
        }

        // Copy the head of the sorted result and keep the first TOPK unique,
        // finite entries. Only TOPK * 4 entries are inspected.
        int take = std::min(M, TOPK * 4);
        std::vector<float> h_keys(take);
        std::vector<int>   h_ids(take);

        memcpy(h_keys.data(), d_keys_out,
                   take * sizeof(float));
        memcpy(h_ids.data(), d_vals_out,
                   take * sizeof(int));

        std::vector<Pair> uniq;
        uniq.reserve(TOPK);
        std::unordered_set<int> seen;

        for (int i = 0; i < take; ++i) {
            int id = h_ids[i];
            float dist = h_keys[i];
            if (id < 0 || !std::isfinite(dist)) continue;
            if (seen.insert(id).second) {
                uniq.push_back(Pair{dist, id});
                if ((int)uniq.size() == TOPK) break;
            }
        }

        // The label still names the GPU sort; this build sorts with std::sort.
        printf("Top-%d among %d candidates (CUB SortPairs):\n",
               (int)uniq.size(), M);
        for (int i = 0; i < (int)uniq.size(); ++i) {
            printf("%2d) id=%d  dist=%.6f  sim=%.6f\n",
                   i+1, uniq[i].id, uniq[i].key, 1.f - uniq[i].key);
        }

        // Re-check the best hit with the double-precision host dot product.
        if (!uniq.empty()) {
            int bestId = uniq[0].id;
            std::vector<float> qhost(qvec.begin(), qvec.end());
            std::vector<float> dv(DIM);
            for (int i = 0; i < DIM; ++i)
                dv[i] = h_data[(size_t)bestId * DIM + i];
            double dotChk = dot_host(qhost.data(), dv.data());
            printf("[DEBUG] host check best id=%d  dot=%.9f  dist=%.9f\n",
                   bestId, dotChk, 1.0 - dotChk);
        }

        free(d_keys_out);
        free(d_vals_out);
        free(d_selDist);
    }

    end = clock();
    time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
    elapse = elapse + time_taken;
    printf("Function (in C++) average time is %f milliseconds to execute an array size %d \n", time_taken, N);

    // 11) Release the working buffers (GPU: cudaFree).
    free(d_outIdx);
    free(d_outCount);
    free(d_tmpCentDist);
    free(d_centroids);
    free(d_q);
    free(d_data);
    free(d_assign);
    free(d_sums);
    free(d_counts);
    free(d_listOffsets);
    free(d_listIds);

    return 0;
}



Overwriting C_ann_3.cpp


In [3]:
%%bash
g++ -std=c++11 C_ann_3.cpp -o C_ann_3 -lm

In [4]:
%%bash
./C_ann_3

Params: N=16384  K=1024  nprobe=4  TOPK=5  DIM=64  KMEANS_ITERS=15
[DEBUG #1] host dot(q, data[0])=0.927471359  dist=0.072528641
[BUILD] K-Means: iters=15
K-means (in C++) average time is 57194.134000 milliseconds to execute an array size 16384 
[BUILD] Non-empty clusters: 1024 / 1024
Candidates from chosen centers: M=105 (of N=16384)
Top-5 among 105 candidates (CUB SortPairs):
 1) id=0  dist=0.072529  sim=0.927471
 2) id=15967  dist=0.177058  sim=0.822942
 3) id=1427  dist=0.216366  sim=0.783634
 4) id=3831  dist=0.226154  sim=0.773846
 5) id=10211  dist=0.246999  sim=0.753001
[DEBUG] host check best id=0  dot=0.927471359  dist=0.072528641
Function (in C++) average time is 4.311000 milliseconds to execute an array size 16384 
