ingonyama-zk · vhnatyk · Nov 16, 2023 · Oct 11, 2023 · Oct 16, 2023 · Oct 16, 2023
diff --git a/.github/workflows/main-build.yml b/.github/workflows/main-build.yml
@@ -63,9 +63,11 @@ jobs:
         cuda: '12.0.0'
         method: 'network'
         # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
-        sub-packages: '["cudart", "nvcc", "thrust"]'
+        sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
     - name: Build Rust Targets
       if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      env:
+        CUDA_PATH: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}
       run: cargo build --release --verbose
 
   build-golang-linux:

diff --git a/.github/workflows/main-format.yml b/.github/workflows/main-format.yml
@@ -37,6 +37,5 @@ jobs:
     - name: Checkout
       uses: actions/checkout@v3
     - name: Check clang-format
-      run: |
-        if [[ $(find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi
-
+      run: unformatted_files=$(find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file >&2); if [[ $unformatted_files ]]; then echo $unformatted_files; echo "Please run clang-format"; exit 1; fi
+
diff --git a/Cargo.toml b/Cargo.toml
@@ -39,6 +39,7 @@ rand = "*" #TODO: move rand and ark dependencies to dev once random scalar/point
 
 [build-dependencies]
 cc = { version = "1.0", features = ["parallel"] }
+cmake = "0.1.50"
 
 [dev-dependencies]
 "criterion" = "0.4.0"

diff --git a/README.md b/README.md
@@ -176,6 +176,8 @@ The script does the following:
 - Creates a file with the curve name in ``src/curves`` with the relevant objects for the curve. 
 - Creates a test file with the curve name in ``src``. 
 
+Also files from ``./icicle/curves/<curve_name>/supported_operations.cu`` should be added individually to ``add_library`` section of [``./icicle/CMakeLists.txt``][CMAKELISTS]
+
 Testing the new curve could be done by running the tests in ``tests_curve_name`` (e.g. ``tests_bls12_381``).
 
 ## Docker
@@ -207,6 +209,12 @@ If you are changing code, please make sure to change your [git hooks path][HOOKS
 git config core.hooksPath ./scripts/hooks
 ```
 
+In case `clang-format` is missing on your system, you can install it  using the following command:
+
+```sh
+sudo apt install clang-format
+```
+
 This will ensure our custom hooks are run and will make it easier to follow our coding guidelines.
 
 ### Hall of Fame
@@ -241,6 +249,7 @@ See [LICENSE-MIT][LMIT] for details.
 [googletest]: https://github.com/google/googletest/
 [HOOKS_DOCS]: https://git-scm.com/docs/githooks
 [HOOKS_PATH]: ./scripts/hooks/
+[CMAKELISTS]: https://github.com/ingonyama-zk/icicle/blob/f0e6b465611227b858ec4590f4de5432e892748d/icicle/CMakeLists.txt#L28
 [GOOGLE_COLAB_ICICLE]: https://github.com/gkigiermo/rust-cuda-colab
 [GRANT_PROGRAM]: https://docs.google.com/forms/d/e/1FAIpQLSc967TnNwxZZ4akejcSi4KOUmGrEc68ZZV-FHLfo8KnP1wbpg/viewform
 

diff --git a/build.rs b/build.rs
@@ -1,31 +1,57 @@
-use std::env;
+use std::env::{self, var};
+
+use cmake::Config;
 
 fn main() {
-    //TODO: check cargo features selected
-    //TODO: can conflict/duplicate with make ?
+    let cargo_dir = var("CARGO_MANIFEST_DIR").unwrap();
+    let profile = var("PROFILE").unwrap();
+
+    let target_output_dir = format!("{}/target/{}", cargo_dir, profile);
+    let build_output_dir = format!("{}/build", target_output_dir);
 
     println!("cargo:rerun-if-env-changed=CXXFLAGS");
     println!("cargo:rerun-if-changed=./icicle");
+    println!("cargo:rerun-if-changed=./target/{}", profile); // without this it ignores manual changes to build folder
 
-    let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
-    let stream_type = env::var("DEFAULT_STREAM").unwrap_or(String::from("legacy"));
-
-    let mut arch = String::from("-arch=");
-    arch.push_str(&arch_type);
-    let mut stream = String::from("-default-stream=");
-    stream.push_str(&stream_type);
+    let mut cmake = Config::new("./icicle");
+    cmake
+        .define("BUILD_TESTS", "OFF")
+        .out_dir(&target_output_dir)
+        .build_target("icicle");
 
-    let mut nvcc = cc::Build::new();
+    let target_profile: &str = if profile == "release" {
+       "Release"
+    } else {
+        "Debug"
+    };
 
-    println!("Compiling icicle library using arch: {}", &arch);
+    cmake.define("CMAKE_BUILD_TYPE", target_profile);
 
     if cfg!(feature = "g2") {
-        nvcc.define("G2_DEFINED", None);
+        cmake.define("G2_DEFINED", "");
+    }
+
+    cmake.build();
+
+    if cfg!(unix) {
+        if let Ok(cuda_path) = var("CUDA_HOME") {
+            println!("cargo:rustc-link-search=native={}/lib64", cuda_path);
+        } else {
+            println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
+        }
+    } else if cfg!(windows) {
+        let build_output_dir_cmake = format!("{}/{}", build_output_dir, target_profile);
+
+        println!("cargo:rustc-link-search={}", &build_output_dir_cmake);
+    }
+
+    println!("cargo:rustc-link-search={}", &build_output_dir);
+    println!("cargo:rustc-link-search={}", &target_output_dir);
+    println!("cargo:rustc-link-lib=ingo_icicle");
+    println!("cargo:rustc-link-lib=dylib=cuda");
+    println!("cargo:rustc-link-lib=dylib=cudart");
+
+    if cfg!(unix) {
+        println!("cargo:rustc-link-lib=dylib=stdc++");
     }
-    nvcc.cuda(true);
-    nvcc.debug(false);
-    nvcc.flag(&arch);
-    nvcc.flag(&stream);
-    nvcc.files(["./icicle/curves/index.cu"]);
-    nvcc.compile("ingo_icicle"); //TODO: extension??
 }
diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt
@@ -5,44 +5,109 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CUDA_STANDARD 17)
 set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
 set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+
 # add the target cuda architectures
 # each additional architecture increases the compilation time and output file size
-if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
-    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
+  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
 else()
-    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
-endif ()
+  find_program(_nvidia_smi "nvidia-smi")
+
+  if(_nvidia_smi)
+    set(DETECT_GPU_COUNT_NVIDIA_SMI 0)
+
+    # execute nvidia-smi -L to get a short list of GPUs available
+    exec_program(${_nvidia_smi_path} ARGS -L
+      OUTPUT_VARIABLE _nvidia_smi_out
+      RETURN_VALUE _nvidia_smi_ret)
+
+    # process the stdout of nvidia-smi
+    if(_nvidia_smi_ret EQUAL 0)
+      # convert string with newlines to list of strings
+      string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
+
+      foreach(_line ${_nvidia_smi_out})
+        if(_line MATCHES "^GPU [0-9]+:")
+          math(EXPR DETECT_GPU_COUNT_NVIDIA_SMI "${DETECT_GPU_COUNT_NVIDIA_SMI}+1")
+
+          # the UUID is not very useful for the user, remove it
+          string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
+
+          if(NOT _gpu_info STREQUAL "")
+            list(APPEND DETECT_GPU_INFO "${_gpu_info}")
+          endif()
+        endif()
+      endforeach()
+
+      check_num_gpu_info(${DETECT_GPU_COUNT_NVIDIA_SMI} DETECT_GPU_INFO)
+      set(DETECT_GPU_COUNT ${DETECT_GPU_COUNT_NVIDIA_SMI})
+    endif()
+  endif()
+
+  # ##
+  if(DETECT_GPU_COUNT GREATER 0)
+    set(CMAKE_CUDA_ARCHITECTURES native) # do native
+  else()
+    # no GPUs found, like on Github CI runners
+    set(CMAKE_CUDA_ARCHITECTURES 50) # some safe value
+  endif()
+endif()
+
 project(icicle LANGUAGES CUDA CXX)
 
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
 
-include(FetchContent)
-FetchContent_Declare(
-  googletest
-  URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip
-)
-# For Windows: Prevent overriding the parent project's compiler/linker settings
+option(BUILD_TESTS "Build tests" OFF)
 
-# boosting lib
-include_directories("/home/miner/include/boost_1_80_0")
+if(NOT BUILD_TESTS)
+  add_library(
+    icicle STATIC
+    curves/bn254/lde.cu
+    curves/bn254/msm.cu
+    curves/bn254/projective.cu
+    curves/bn254/ve_mod_mult.cu
+    curves/bls12_377/lde.cu
+    curves/bls12_377/msm.cu
+    curves/bls12_377/projective.cu
+    curves/bls12_377/ve_mod_mult.cu
+    curves/bls12_381/lde.cu
+    curves/bls12_381/msm.cu
+    curves/bls12_381/projective.cu
+    curves/bls12_381/ve_mod_mult.cu
+    curves/bls12_381/poseidon.cu
+  )
+
+  set_target_properties(icicle PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+  set_target_properties(icicle PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+  set_target_properties(icicle PROPERTIES OUTPUT_NAME "ingo_icicle")
+  target_compile_options(icicle PRIVATE -c)
+
+else()
+  include(FetchContent)
+  FetchContent_Declare(
+    googletest
+    URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip
+  )
 
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-FetchContent_MakeAvailable(googletest)
+  # For Windows: Prevent overriding the parent project's compiler/linker settings
+  set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+  FetchContent_MakeAvailable(googletest)
 
-enable_testing()
+  enable_testing()
 
-add_executable(
-  primitives_test
-  primitives/test.cu
-)
-target_link_libraries(
-  primitives_test
-  GTest::gtest_main
-)
+  add_executable(
+    primitives_test
+    primitives/test.cu
+  )
+  target_link_libraries(
+    primitives_test
+    GTest::gtest_main
+  )
 
-include(GoogleTest)
-set_target_properties(primitives_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+  include(GoogleTest)
+  set_target_properties(primitives_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
-gtest_discover_tests(primitives_test)
+  gtest_discover_tests(primitives_test)
+endif()
diff --git a/icicle/README.md b/icicle/README.md
@@ -10,7 +10,7 @@ mkdir -p build; cmake -S . -B build; cmake --build build; cd build && ctest; cd
 
 Before proceeding, make sure the following software installed:
 
-1. CMake at least version 3.16, which can be downloaded from [cmake.org](https://cmake.org/files/)
+1. CMake at least version 3.18, which can be downloaded from [cmake.org](https://cmake.org/files/)
    It is recommended to have the latest version installed.
 2. [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu) version 12.0 or newer.
 3. GCC - version 9 or newer recommended.

diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu
@@ -151,9 +151,8 @@ __global__ void add_ones_kernel(A* points, S* scalars, P* results, const unsigne
   results[tid] = sum;
 }
 
-template <typename S>
-__global__ void
-find_cutoff_kernel(unsigned* v, unsigned size, unsigned cutoff, unsigned run_length, S* fake_param, unsigned* result)
+__global__ __forceinline__ void
+find_cutoff_kernel(unsigned* v, unsigned size, unsigned cutoff, unsigned run_length, unsigned* result)
 {
   unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   const unsigned nof_threads = (size + run_length - 1) / run_length;
@@ -168,9 +167,8 @@ find_cutoff_kernel(unsigned* v, unsigned size, unsigned cutoff, unsigned run_len
   if (tid == 0 && v[size - 1] > cutoff) { result[0] = size; }
 }
 
-template <typename S>
-__global__ void find_max_size(
-  unsigned* bucket_sizes, unsigned* single_bucket_indices, unsigned c, S* fake_param, unsigned* largest_bucket_size)
+__global__ __forceinline__ void
+find_max_size(unsigned* bucket_sizes, unsigned* single_bucket_indices, unsigned c, unsigned* largest_bucket_size)
 {
   for (int i = 0;; i++) {
     if (single_bucket_indices[i] & ((1 << c) - 1)) {
@@ -528,14 +526,14 @@ void bucket_method_msm(
   NUM_THREADS = min(1 << 5, cutoff_nof_runs);
   NUM_BLOCKS = (cutoff_nof_runs + NUM_THREADS - 1) / NUM_THREADS;
   find_cutoff_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
-    sorted_bucket_sizes, h_nof_buckets_to_compute, bucket_th, cutoff_run_length, d_scalars, nof_large_buckets);
+    sorted_bucket_sizes, h_nof_buckets_to_compute, bucket_th, cutoff_run_length, nof_large_buckets);
 
   unsigned h_nof_large_buckets;
   cudaMemcpyAsync(&h_nof_large_buckets, nof_large_buckets, sizeof(unsigned), cudaMemcpyDeviceToHost, stream);
 
   unsigned* max_res;
   cudaMallocAsync(&max_res, sizeof(unsigned) * 2, stream);
-  find_max_size<<<1, 1, 0, stream>>>(sorted_bucket_sizes, sorted_single_bucket_indices, c, d_scalars, max_res);
+  find_max_size<<<1, 1, 0, stream>>>(sorted_bucket_sizes, sorted_single_bucket_indices, c, max_res);
 
   unsigned h_max_res[2];
   cudaMemcpyAsync(h_max_res, max_res, sizeof(unsigned) * 2, cudaMemcpyDeviceToHost, stream);
@@ -967,7 +965,7 @@ void reference_msm(S* scalars, A* a_points, unsigned size)
   std::cout << P::to_affine(res) << std::endl;
 }
 
-unsigned get_optimal_c(const unsigned size)
+unsigned inline get_optimal_c(const unsigned size)
 {
   if (size < 17) return 1;
   // return 17;

diff --git a/icicle/utils/error_handler.cuh b/icicle/utils/error_handler.cuh
@@ -3,7 +3,7 @@
 
 #define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
 template <typename T>
-void check(T err, const char* const func, const char* const file, const int line)
+void inline check(T err, const char* const func, const char* const file, const int line)
 {
   if (err != cudaSuccess) {
     std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
@@ -12,7 +12,7 @@ void check(T err, const char* const func, const char* const file, const int line
 }
 
 #define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
-void checkLast(const char* const file, const int line)
+void inline checkLast(const char* const file, const int line)
 {
   cudaError_t err{cudaGetLastError()};
   if (err != cudaSuccess) {
@@ -22,7 +22,7 @@ void checkLast(const char* const file, const int line)
 }
 
 #define CHECK_SYNC_DEVICE_ERROR() syncDevice(__FILE__, __LINE__)
-void syncDevice(const char* const file, const int line)
+void inline syncDevice(const char* const file, const int line)
 {
   cudaError_t err{cudaDeviceSynchronize()};
   if (err != cudaSuccess) {

diff --git a/scripts/hooks/pre-push b/scripts/hooks/pre-push
@@ -3,9 +3,13 @@
 status=0
 # Run clang-format on CUDA, C, and CPP files
 # clang-format writes to stderr in dry-run mode. In order to capture the output to detect if there are changes needed we redirect stderr to stdin
-if [[ $(find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]];
+# to print list of files 
+unformatted_files=$(find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1)
+
+if [[ $unformatted_files ]];
 then
     echo "🚨 There are files in Icicle Core that need formatting."
+    echo $unformatted_files
     echo "Please format all .c, .cpp, .h, .cu, .cuh files using the following command:"
     echo "find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format -i -style=file"
     status=1