diff --git a/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml b/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml
index fc9bb53659442..1f57b4c6d2ba2 100644
--- a/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml
+++ b/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml
@@ -15,6 +15,14 @@ on:
required: false
type: boolean
default: false
+ use_vcpkg:
+ required: false
+ type: boolean
+ default: true
+ enable_wasm_threads:
+ required: false
+ type: boolean
+ default: true
build_jsep:
required: false
type: boolean
@@ -29,7 +37,7 @@ jobs:
runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
env:
buildArch: x64
- common_build_args: --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --config ${{ inputs.build_config }} --skip_submodule_sync --build_wasm --enable_wasm_simd --enable_wasm_threads ${{ inputs.extra_build_args }}
+ common_build_args: --parallel ${{ inputs.use_vcpkg == true && '--use_vcpkg --use_vcpkg_ms_internal_asset_cache' || '' }} --config ${{ inputs.build_config }} --skip_submodule_sync --build_wasm --enable_wasm_simd ${{ inputs.enable_wasm_threads == true && '--enable_wasm_threads' || '' }} ${{ inputs.extra_build_args }}
steps:
- name: Checkout code
diff --git a/.github/workflows/web.yml b/.github/workflows/web.yml
index 8f922ef26cd7e..0133e4994e5e9 100644
--- a/.github/workflows/web.yml
+++ b/.github/workflows/web.yml
@@ -52,6 +52,16 @@ jobs:
build_jsep: true
build_webgpu: true
+ wasm_Release_static_library:
+ needs: precheck
+ uses: ./.github/workflows/linux-wasm-ci-build-and-test-workflow.yml
+ with:
+ build_config: Release
+ extra_build_args: "--skip_tests --enable_wasm_api_exception_catching --disable_rtti --build_wasm_static_lib"
+ use_vcpkg: false
+ enable_wasm_threads: false
+ skip_publish: true
+
web_Debug:
needs:
- precheck
diff --git a/README.md b/README.md
index f1817282b61a0..019bc8291354e 100644
--- a/README.md
+++ b/README.md
@@ -20,26 +20,6 @@
- ONNX Runtime Inferencing: [microsoft/onnxruntime-inference-examples](https://github.com/microsoft/onnxruntime-inference-examples)
- ONNX Runtime Training: [microsoft/onnxruntime-training-examples](https://github.com/microsoft/onnxruntime-training-examples)
-## Builtin Pipeline Status
-
-|System|Inference|Training|
-|---|---|---|
-|Windows|[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=9)<br>[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=218)<br>[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=47)<br>[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=228)||
-|Linux|[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=11)<br>[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=64)<br>[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12)<br>[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=45)<br>[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=55)|[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86)<br>[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84)|
-|Mac|[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=13)||
-|Android|[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=53)||
-|iOS|[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=134)||
-|Web|[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=161)||
-|Other|[](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=187&repoName=microsoft%2Fonnxruntime)||
-
-This project is tested with [BrowserStack](https://www.browserstack.com/home).
-
-## Third-party Pipeline Status
-
-|System|Inference|Training|
-|---|---|---|
-|Linux|[](https://github.com/Ascend/onnxruntime/actions/workflows/build-and-test.yaml)||
-
## Releases
The current release and past releases can be found here: https://github.com/microsoft/onnxruntime/releases.
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 416ed5e49f25a..47bfa3f312eec 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -372,6 +372,7 @@ if (onnxruntime_USE_ROCM)
if (HIPIFY_PERL_PATH-NOTFOUND)
MESSAGE(FATAL_ERROR "hipify-perl not found")
endif()
+ MESSAGE("HIPIFY PATH:"${HIPIFY_PERL_PATH}/hipify-perl)
set(onnxruntime_HIPIFY_PERL ${HIPIFY_PERL_PATH}/hipify-perl)
endif()
diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake
index 8f5ef15c53ef2..6647312e99d8f 100644
--- a/cmake/adjust_global_compile_flags.cmake
+++ b/cmake/adjust_global_compile_flags.cmake
@@ -4,6 +4,8 @@ if (ANDROID)
# Build shared libraries with support for 16 KB ELF alignment
# https://source.android.com/docs/core/architecture/16kb-page-size/16kb#build-lib-16kb-alignment
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,max-page-size=16384")
+ # Also apply to MODULE libraries (like libonnxruntime4j_jni.so)
+ set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-z,max-page-size=16384")
endif()
# Enable space optimization for gcc/clang
diff --git a/cmake/deps.txt b/cmake/deps.txt
index 6e045f6dcdc9d..2df433b0353c6 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -9,9 +9,6 @@
#since the file contains a version string: "lts_20230802". However, the file is for debugging purposes only and would
#not affect built binaries.
#
-# NOTE: You must run deps_update_and_upload.py and generate_cgmanifest.py when ready to test your changes in a CI.
-# See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29
-#
abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20240722.0.zip;36ee53eb1466fb6e593fc5c286680de31f8a494a
coremltools;https://github.com/apple/coremltools/archive/refs/tags/7.1.zip;f1bab0f30966f2e217d8e01207d518f230a1641a
cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0
@@ -29,7 +26,7 @@ flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip
fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494
fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1
google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.5.zip;cd47d3d272faf353600c8cc2fdec2b52d6f69177
-googletest;https://github.com/google/googletest/archive/refs/tags/v1.15.0.zip;9d2d0af8d77ac726ea55d44a8fa727ec98311349
+googletest;https://github.com/google/googletest/archive/refs/tags/v1.17.0.zip;f638fa0e724760e2ba07ff8cfba32cd644e1ce28
#xnnpack 2024.09.04
googlexnnpack;https://github.com/google/XNNPACK/archive/fe98e0b93565382648129271381c14d6205255e3.zip;14f61dcf17cec2cde34ba2dcf61d6f24bf6059f3
json;https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.zip;5e88795165cc8590138d1f47ce94ee567b85b4d6
@@ -37,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36
microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
-onnx;https://github.com/onnx/onnx/archive/7fc2b81a275223f5b02a522d9d2649837542a7be.zip;555338a12903941bb45f57540476244f9ffee17b
+onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.18.0.zip;f156d032a3af91b66d554e11158b33ca77bbb1f2
# Use the latest commit of 10.9-GA
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
@@ -58,6 +55,6 @@ cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.9.2.zip;b7f8dc4a8
extensions;https://github.com/microsoft/onnxruntime-extensions/archive/c24b7bab0c12f53da76d0c31b03b9f0f8ec8f3b4.zip;239063aee4946a9af147b473a4c3da78ba7413b4
composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557
directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.7.0.zip;d0753d8d5b39947ca0729d7773cb84653a129eb1
+cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96
dawn;https://github.com/google/dawn/archive/4cb1f9be152a4fa6bb695c08cd707ab078a1e2fb.zip;de39336b7715f53c14eec61072293b85cc73b691
kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.4.0.tar.gz;22d3b57b54a61c194ab256ff11b0353a3b220244
diff --git a/cmake/external/cudnn_frontend.cmake b/cmake/external/cudnn_frontend.cmake
index 8642607fa0ca0..d89ab0f669f35 100644
--- a/cmake/external/cudnn_frontend.cmake
+++ b/cmake/external/cudnn_frontend.cmake
@@ -6,8 +6,10 @@ onnxruntime_fetchcontent_declare(
EXCLUDE_FROM_ALL
)
+set(CUDNN_FRONTEND_SKIP_JSON_LIB OFF CACHE BOOL "" FORCE)
set(CUDNN_FRONTEND_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
-set(CUDNN_FRONTEND_BUILD_UNIT_TESTS OFF CACHE BOOL "" FORCE)
+set(CUDNN_FRONTEND_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS OFF CACHE BOOL "" FORCE)
set(CUDNN_PATH ${onnxruntime_CUDNN_HOME})
+
onnxruntime_fetchcontent_makeavailable(cudnn_frontend)
diff --git a/cmake/external/onnx b/cmake/external/onnx
index 7fc2b81a27522..e709452ef2bbc 160000
--- a/cmake/external/onnx
+++ b/cmake/external/onnx
@@ -1 +1 @@
-Subproject commit 7fc2b81a275223f5b02a522d9d2649837542a7be
+Subproject commit e709452ef2bbc1d113faf678c24e6d3467696e83
diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake
index e629df4843109..1e26eede8a66f 100644
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@@ -169,10 +169,6 @@ if(APPLE)
target_link_libraries(onnxruntime_common PRIVATE "-framework Foundation")
endif()
-if(MSVC)
- target_link_libraries(onnxruntime_common PRIVATE dxcore.lib)
-endif()
-
if(MSVC)
if(onnxruntime_target_platform STREQUAL "ARM64")
set(ARM64 TRUE)
diff --git a/cmake/onnxruntime_kernel_explorer.cmake b/cmake/onnxruntime_kernel_explorer.cmake
index 62a6d45088052..65a20c4229290 100644
--- a/cmake/onnxruntime_kernel_explorer.cmake
+++ b/cmake/onnxruntime_kernel_explorer.cmake
@@ -64,7 +64,7 @@ elseif (onnxruntime_USE_ROCM)
)
auto_set_source_files_hip_language(${kernel_explorer_kernel_srcs} ${kernel_explorer_rocm_kernel_srcs})
target_sources(kernel_explorer PRIVATE ${kernel_explorer_rocm_kernel_srcs})
- target_compile_definitions(kernel_explorer PRIVATE __HIP_PLATFORM_AMD__=1 __HIP_PLATFORM_HCC__=1 HIPBLAS_V2)
+ target_compile_definitions(kernel_explorer PRIVATE __HIP_PLATFORM_AMD__=1 __HIP_PLATFORM_HCC__=1 HIPBLAS)
if (onnxruntime_USE_COMPOSABLE_KERNEL)
target_compile_definitions(kernel_explorer PRIVATE USE_COMPOSABLE_KERNEL)
if (onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE)
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 3279a17f8cd5e..f8f5546ae9465 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -281,6 +281,9 @@ function(setup_kleidiai)
${MLAS_SRC_DIR}/kai_ukernel_interface.cpp
)
target_link_libraries(onnxruntime_mlas PRIVATE kleidiai)
+
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai)
+ set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES} PARENT_SCOPE)
endfunction()
if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
diff --git a/cmake/onnxruntime_providers_dml.cmake b/cmake/onnxruntime_providers_dml.cmake
index 62136c5c568d7..ac4f0103ea323 100644
--- a/cmake/onnxruntime_providers_dml.cmake
+++ b/cmake/onnxruntime_providers_dml.cmake
@@ -59,7 +59,7 @@
if (GDK_PLATFORM STREQUAL Scarlett)
target_link_libraries(onnxruntime_providers_dml PRIVATE ${gdk_dx_libs})
else()
- target_link_libraries(onnxruntime_providers_dml PRIVATE dxguid.lib d3d12.lib dxgi.lib dxcore.lib)
+ target_link_libraries(onnxruntime_providers_dml PRIVATE dxguid.lib d3d12.lib dxgi.lib)
endif()
target_link_libraries(onnxruntime_providers_dml PRIVATE delayimp.lib)
diff --git a/cmake/onnxruntime_providers_rocm.cmake b/cmake/onnxruntime_providers_rocm.cmake
index 108b8b46deb27..03f1e288f4d0d 100644
--- a/cmake/onnxruntime_providers_rocm.cmake
+++ b/cmake/onnxruntime_providers_rocm.cmake
@@ -154,7 +154,7 @@
set_target_properties(onnxruntime_providers_rocm PROPERTIES LINKER_LANGUAGE CXX)
set_target_properties(onnxruntime_providers_rocm PROPERTIES FOLDER "ONNXRuntime")
- target_compile_definitions(onnxruntime_providers_rocm PRIVATE HIPBLAS_V2)
+ target_compile_definitions(onnxruntime_providers_rocm PRIVATE HIPBLAS)
if (onnxruntime_ENABLE_TRAINING)
target_include_directories(onnxruntime_providers_rocm PRIVATE ${ORTTRAINING_ROOT} ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining ${MPI_CXX_INCLUDE_DIRS})
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index f6eac2c24eca2..5639b295f0787 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -189,7 +189,10 @@ set(onnxruntime_pybind11_state_static_providers
if(onnxruntime_BUILD_QNN_EP_STATIC_LIB)
list(APPEND onnxruntime_pybind11_state_static_providers PRIVATE onnxruntime_providers_qnn)
endif()
-
+if(WIN32)
+ # onnxruntime_pybind11_state is a DLL
+ target_sources(onnxruntime_pybind11_state PRIVATE "${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc")
+endif()
target_link_libraries(onnxruntime_pybind11_state PRIVATE
onnxruntime_session
${onnxruntime_libs}
@@ -1064,12 +1067,6 @@ if (onnxruntime_USE_QNN)
${QNN_LIB_FILES}
$/onnxruntime/capi/
)
- add_custom_command(
- TARGET onnxruntime_pybind11_state POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E copy
- $
- $/onnxruntime/capi/
- )
if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf")
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index c8de91d6c6eb6..26ef7970fa2b6 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -1334,6 +1334,14 @@ endif()
# shared lib
if (onnxruntime_BUILD_SHARED_LIB)
+ if(WIN32)
+ AddTest(DYN
+ TARGET onnxruntime_shared_lib_dlopen_test
+ SOURCES ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/dlopen_main.cc
+ LIBS onnxruntime
+ DEPENDS ${all_dependencies}
+ )
+ endif()
onnxruntime_add_static_library(onnxruntime_mocked_allocator ${TEST_SRC_DIR}/util/test_allocator.cc)
target_include_directories(onnxruntime_mocked_allocator PUBLIC ${TEST_SRC_DIR}/util/include)
target_link_libraries(onnxruntime_mocked_allocator PRIVATE ${GSL_TARGET})
diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake
index f00292fade52d..c0b6efb0eb75d 100644
--- a/cmake/onnxruntime_webassembly.cmake
+++ b/cmake/onnxruntime_webassembly.cmake
@@ -503,58 +503,60 @@ jsepDownload:_pp_")
set_target_properties(onnxruntime_webassembly PROPERTIES OUTPUT_NAME ${target_name} SUFFIX ".mjs")
- #
- # The following POST_BUILD script is a workaround for enabling:
- # - using onnxruntime-web with Multi-threading enabled when import from CDN
- # - using onnxruntime-web when consumed in some frameworks like Vite
- #
- # In the use case mentioned above, the file name of the script may be changed. So we need to replace the line:
- # `new Worker(new URL("ort-wasm-*.mjs", import.meta.url),`
- # with
- # `new Worker(new URL(import.meta.url),`
- #
- # This behavior is introduced in https://github.com/emscripten-core/emscripten/pull/22165. Since it's unlikely to be
- # reverted, and there is no config to disable this behavior, we have to use a post-build script to workaround it.
- #
-
- # Generate a script to do the post-build work
- file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/wasm_post_build.js "
- const fs = require('fs');
- const path = require('path');
-
- // node wasm_post_build.js
- const mjsFilePath = process.argv[2];
- let contents = fs.readFileSync(mjsFilePath).toString();
-
- const regex = 'new Worker\\\\(new URL\\\\(\".+?\", ?import\\\\.meta\\\\.url\\\\),';
- const matches = [...contents.matchAll(new RegExp(regex, 'g'))];
- if (matches.length !== 1) {
- throw new Error(
- `Unexpected number of matches for \"${regex}\" in \"${filepath}\": ${matches.length}.`,
+ if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS)
+ #
+ # The following POST_BUILD script is a workaround for enabling:
+ # - using onnxruntime-web with Multi-threading enabled when import from CDN
+ # - using onnxruntime-web when consumed in some frameworks like Vite
+ #
+ # In the use case mentioned above, the file name of the script may be changed. So we need to replace the line:
+ # `new Worker(new URL("ort-wasm-*.mjs", import.meta.url),`
+ # with
+ # `new Worker(new URL(import.meta.url),`
+ #
+ # This behavior is introduced in https://github.com/emscripten-core/emscripten/pull/22165. Since it's unlikely to be
+ # reverted, and there is no config to disable this behavior, we have to use a post-build script to workaround it.
+ #
+
+ # Generate a script to do the post-build work
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/wasm_post_build.js "
+ const fs = require('fs');
+ const path = require('path');
+
+ // node wasm_post_build.js
+ const mjsFilePath = process.argv[2];
+ let contents = fs.readFileSync(mjsFilePath).toString();
+
+ const regex = 'new Worker\\\\(new URL\\\\(\".+?\", ?import\\\\.meta\\\\.url\\\\),';
+ const matches = [...contents.matchAll(new RegExp(regex, 'g'))];
+ if (matches.length !== 1) {
+ throw new Error(
+ `Unexpected number of matches for \"\${regex}\" in \"\${mjsFilePath}\": \${matches.length}.`,
+ );
+ }
+
+ // Replace the only occurrence.
+ contents = contents.replace(
+ new RegExp(regex),
+ `new Worker(new URL(import.meta.url),`,
);
- }
- // Replace the only occurrence.
- contents = contents.replace(
- new RegExp(regex),
- `new Worker(new URL(import.meta.url),`,
- );
+ fs.writeFileSync(mjsFilePath, contents);
+ "
+ )
- fs.writeFileSync(mjsFilePath, contents);
- "
- )
+ find_program(NODE_EXECUTABLE node required)
+ if (NOT NODE_EXECUTABLE)
+ message(FATAL_ERROR "Node is required to run the post-build script")
+ endif()
- find_program(NODE_EXECUTABLE node required)
- if (NOT NODE_EXECUTABLE)
- message(FATAL_ERROR "Node is required to run the post-build script")
+ add_custom_command(
+ TARGET onnxruntime_webassembly
+ POST_BUILD
+ # Backup file at $.bak
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different "$" "$.bak"
+ COMMAND ${CMAKE_COMMAND} -E echo "Performing post-process for $"
+ COMMAND ${NODE_EXECUTABLE} "${CMAKE_CURRENT_BINARY_DIR}/wasm_post_build.js" "$"
+ )
endif()
-
- add_custom_command(
- TARGET onnxruntime_webassembly
- POST_BUILD
- # Backup file at $.bak
- COMMAND ${CMAKE_COMMAND} -E copy_if_different "$" "$.bak"
- COMMAND ${CMAKE_COMMAND} -E echo "Performing post-process for $"
- COMMAND ${NODE_EXECUTABLE} "${CMAKE_CURRENT_BINARY_DIR}/wasm_post_build.js" "$"
- )
endif()
diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake
index 0cd6bfa305843..7df4fd0898bde 100644
--- a/cmake/vcpkg-ports/onnx/portfile.cmake
+++ b/cmake/vcpkg-ports/onnx/portfile.cmake
@@ -3,8 +3,8 @@ vcpkg_check_linkage(ONLY_STATIC_LIBRARY)
vcpkg_from_github(
OUT_SOURCE_PATH SOURCE_PATH
REPO onnx/onnx
- REF 7fc2b81a275223f5b02a522d9d2649837542a7be
- SHA512 6911b4e532a7735ef40660dee904877850234a600b39d46a8dab91f6506c6547e3bd10af5d5f0f0abc0c6e7e6e1fc04c0ea307eb9f4aef5c614eaaa50403804d
+ REF "v${VERSION}"
+ SHA512 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d
PATCHES
fix-cmakelists.patch
fix-dependency-protobuf.patch
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs
index 9794d2c184d5d..6e325f7fe9646 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs
@@ -17,6 +17,7 @@ public enum GraphOptimizationLevel
ORT_DISABLE_ALL = 0,
ORT_ENABLE_BASIC = 1,
ORT_ENABLE_EXTENDED = 2,
+ ORT_ENABLE_LAYOUT = 3,
ORT_ENABLE_ALL = 99
}
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 8c1ab002bce67..b657c828fbde1 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -298,6 +298,7 @@ Do not modify directly.*
|||[19, 20]|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int8), tensor(uint8)|
|||[13, 18]|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
|||[10, 12]|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
+|RMSNormalization|*in* X:**T**<br/> *in* scale:**V**<br/> *out* Y:**V**|23+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **V** = tensor(double), tensor(float), tensor(float16)|
|RNN|*in* X:**T**<br/> *in* W:**T**<br/> *in* R:**T**<br/> *in* B:**T**<br/> *in* sequence_lens:**T1**<br/> *in* initial_h:**T**<br/> *out* Y:**T**<br/> *out* Y_h:**T**|22+|**T** = tensor(float)<br/> **T1** = tensor(int32)|
|||[14, 21]|**T** = tensor(float)<br/> **T1** = tensor(int32)|
|||[7, 13]|**T** = tensor(float)<br/> **T1** = tensor(int32)|
@@ -437,7 +438,7 @@ Do not modify directly.*
|||[13, 17]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[2, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|SplitToSequence|*in* input:**T**<br/> *in* split:**I**<br/> *out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)<br/> **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))<br/> **T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(string)|
+|SplitToSequence|*in* input:**T**<br/> *in* split:**I**<br/> *out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)<br/> **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))<br/> **T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(string)|
|Sqrt|*in* X:**T**<br/> *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[6, 12]|**T** = tensor(double), tensor(float)|
|Squeeze|*in* data:**T**<br/> *in* axes:**tensor(int64)**<br/> *out* squeezed:**T**<br/> or<br/> *in* data:**T**<br/> *out* squeezed:**T**|23+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -782,6 +783,7 @@ Do not modify directly.*
|||[19, 20]|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)|
|||[13, 18]|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
|||[10, 12]|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
+|RMSNormalization|*in* X:**T**<br/> *in* scale:**V**<br/> *out* Y:**V**|23+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br/> **V** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|RNN|*in* X:**T**<br/> *in* W:**T**<br/> *in* R:**T**<br/> *in* B:**T**<br/> *in* sequence_lens:**T1**<br/> *in* initial_h:**T**<br/> *out* Y:**T**<br/> *out* Y_h:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
|||[7, 13]|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
|RandomNormal|*out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h
index 15c15c6c143d2..c84d34cfd3cbe 100644
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@@ -101,7 +101,9 @@ class IAllocator {
const OrtMemoryInfo& Info() const { return memory_info_; };
// Each implementation of IAllocator can override and provide their own implementation
- virtual void GetStats(AllocatorStats* /*stats*/) { return; }
+ virtual void GetStats(AllocatorStats* stats) {
+ *stats = {};
+ }
static bool CalcMemSizeForArray(size_t nmemb, size_t size, size_t* out) noexcept {
return CalcMemSizeForArrayWithAlignment(nmemb, size, 0, out);
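With this change, allocators that do not override GetStats() report zeroed statistics instead of leaving the output untouched; allocators that track their own usage can override it to surface those numbers. A minimal sketch, assuming the AllocatorStats field names used below (num_allocs, bytes_in_use, total_allocated_bytes) and the OrtMemoryInfo constructor shown:

```
#include <atomic>
#include <cstdlib>
#include "core/framework/allocator.h"

namespace {

// Sketch only: a counting CPU allocator that overrides GetStats(). Free() does
// not adjust bytes_in_use here, to keep the example short.
class CountingCpuAllocator : public onnxruntime::IAllocator {
 public:
  CountingCpuAllocator()
      : IAllocator(OrtMemoryInfo("CountingCpu", OrtAllocatorType::OrtDeviceAllocator)) {}

  void* Alloc(size_t size) override {
    ++num_allocs_;
    bytes_in_use_ += static_cast<int64_t>(size);
    total_allocated_ += static_cast<int64_t>(size);
    return malloc(size);
  }

  void Free(void* p) override { free(p); }

  void GetStats(onnxruntime::AllocatorStats* stats) override {
    *stats = {};
    stats->num_allocs = num_allocs_;
    stats->bytes_in_use = bytes_in_use_;
    stats->total_allocated_bytes = total_allocated_;
  }

 private:
  std::atomic<int64_t> num_allocs_{0};
  std::atomic<int64_t> bytes_in_use_{0};
  std::atomic<int64_t> total_allocated_{0};
};

}  // namespace
```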
diff --git a/include/onnxruntime/core/optimizer/graph_transformer_level.h b/include/onnxruntime/core/optimizer/graph_transformer_level.h
index 111f38f9ccb6e..3f2126ce494a6 100644
--- a/include/onnxruntime/core/optimizer/graph_transformer_level.h
+++ b/include/onnxruntime/core/optimizer/graph_transformer_level.h
@@ -12,8 +12,9 @@ enum class TransformerLevel : int {
Level1, // basic optimizations
Level2, // extended optimizations
Level3, // layout optimizations
+ Level4, // unsupported datatypes optimizations
// The max level should always be same as the last level.
- MaxLevel = Level3
+ MaxLevel = Level4
};
} // namespace onnxruntime
diff --git a/include/onnxruntime/core/optimizer/graph_transformer_utils.h b/include/onnxruntime/core/optimizer/graph_transformer_utils.h
index 31b0f22340510..6f07ead935f4a 100644
--- a/include/onnxruntime/core/optimizer/graph_transformer_utils.h
+++ b/include/onnxruntime/core/optimizer/graph_transformer_utils.h
@@ -36,7 +36,8 @@ namespace optimizer_utils {
TODO: This is visible for testing at the moment, but we should rather make it private. */
InlinedVector> GenerateRewriteRules(
TransformerLevel level,
- const InlinedHashSet& rules_to_disable = {});
+ const InlinedHashSet& rules_to_disable = {},
+ const bool enable_cast_chain_elimination = false);
/** Given a TransformerLevel, this method generates a name for the rule-based graph transformer of that level. */
std::string GenerateRuleBasedTransformerName(TransformerLevel level);
@@ -45,7 +46,8 @@ std::string GenerateRuleBasedTransformerName(TransformerLevel level);
std::unique_ptr GenerateRuleBasedGraphTransformer(
TransformerLevel level,
const InlinedHashSet& rules_to_disable,
- const InlinedHashSet& compatible_execution_providers);
+ const InlinedHashSet& compatible_execution_providers,
+ const bool enable_cast_chain_elimination = false);
/** Generates all predefined (both rule-based and non-rule-based) transformers for this level.
Any transformers or rewrite rules named in rules_and_transformers_to_disable will be excluded. */
diff --git a/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h b/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
index 0c9095f566fad..11cc6f131dab3 100644
--- a/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
+++ b/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
@@ -8,6 +8,7 @@
* - `kHasUserComputeStream`: Indicates whether a user-provided compute stream is used.
* - `kUserComputeStream`: Specifies the user-provided compute stream.
* - `kMaxWorkspaceSize`: Sets the maximum workspace size for GPU memory allocation.
+ * - `kMaxSharedMemSize`: Sets the maximum amount of shared memory that TensorRT kernels are allowed to use.
* - `kDumpSubgraphs`: Enables or disables dumping of subgraphs for debugging.
* - `kDetailedBuildLog`: Enables or disables detailed build logs for debugging.
* - `kProfilesMinShapes`: Specifies the minimum shapes for profiling.
@@ -24,6 +25,7 @@ constexpr const char* kDeviceId = "device_id";
constexpr const char* kHasUserComputeStream = "has_user_compute_stream";
constexpr const char* kUserComputeStream = "user_compute_stream";
constexpr const char* kMaxWorkspaceSize = "nv_max_workspace_size";
+constexpr const char* kMaxSharedMemSize = "nv_max_shared_mem_size";
constexpr const char* kDumpSubgraphs = "nv_dump_subgraphs";
constexpr const char* kDetailedBuildLog = "nv_detailed_build_log";
constexpr const char* kProfilesMinShapes = "nv_profile_min_shapes";
diff --git a/include/onnxruntime/core/session/environment.h b/include/onnxruntime/core/session/environment.h
index 776045a97cae5..a0053ffd3e3e3 100644
--- a/include/onnxruntime/core/session/environment.h
+++ b/include/onnxruntime/core/session/environment.h
@@ -6,6 +6,8 @@
#include
#include
#include
+#include
+#include
#include "core/common/common.h"
#include "core/common/basic_types.h"
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index a2f518ae09a4b..0892accec40b0 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -340,6 +340,26 @@ typedef struct OrtAllocator {
* those made during session initialization. This allows for separate memory management strategies for these allocations.
*/
void*(ORT_API_CALL* Reserve)(struct OrtAllocator* this_, size_t size); ///< Returns a pointer to an allocated block of `size` bytes
+
+ /**
+ * @brief Function used to get the statistics of the allocator.
+ *
+ * Return a pointer to the OrtKeyValuePairs structure that contains the statistics of the allocator
+ * and the user should call OrtApi::ReleaseKeyValuePairs.
+ * Supported keys are:
+ * - Limit: Bytes limit of the allocator. -1 if no limit is set.
+ * - InUse: Number of bytes in use.
+ * - TotalAllocated: The total number of allocated bytes by the allocator.
+ * - MaxInUse: The maximum bytes in use.
+ * - NumAllocs: Number of allocations.
+ * - NumReserves: Number of reserves. (Number of calls to Reserve() in arena-based allocators)
+ * - NumArenaExtensions: Number of arena extensions (Relevant only for arena based allocators)
+ * - NumArenaShrinkages: Number of arena shrinkages (Relevant only for arena based allocators)
+ * - MaxAllocSize: The max single allocation seen.
+ *
+ * NOTE: If the allocator does not implement this function, the OrtKeyValuePairs instance will be empty.
+ */
+ ORT_API2_STATUS(GetStats, _In_ const struct OrtAllocator* this_, _Outptr_ OrtKeyValuePairs** out);
} OrtAllocator;
typedef void(ORT_API_CALL* OrtLoggingFunction)(
@@ -355,6 +375,7 @@ typedef enum GraphOptimizationLevel {
ORT_DISABLE_ALL = 0,
ORT_ENABLE_BASIC = 1,
ORT_ENABLE_EXTENDED = 2,
+ ORT_ENABLE_LAYOUT = 3,
ORT_ENABLE_ALL = 99
} GraphOptimizationLevel;
@@ -672,6 +693,7 @@ typedef struct OrtTensorRTProviderOptions {
typedef struct OrtMIGraphXProviderOptions {
int device_id; // hip device id.
int migraphx_fp16_enable; // MIGraphX FP16 precision. Default 0 = false, nonzero = true
+ int migraphx_fp8_enable; // MIGraphX FP8 precision. Default 0 = false, nonzero = true
int migraphx_int8_enable; // MIGraphX INT8 precision. Default 0 = false, nonzero = true
int migraphx_use_native_calibration_table; // MIGraphx INT8 cal table. Default 0 = false, noznero = true
const char* migraphx_int8_calibration_table_name; // MIGraphx INT8 calibration table name
@@ -680,6 +702,21 @@ typedef struct OrtMIGraphXProviderOptions {
int migraphx_load_compiled_model; // migraphx int8 cal table. Default 0 = false, noznero = true
const char* migraphx_load_model_path; // migraphx model path name
bool migraphx_exhaustive_tune; // migraphx tuned compile Default = false
+
+ /** \brief MIGraphX memory limit (To use all possible memory pass in maximum size_t)
+ * Defaults to SIZE_MAX.
+ * \note If a ::OrtArenaCfg has been applied, it will override this field
+ */
+ size_t migraphx_mem_limit;
+
+ /** \brief Strategy used to grow the memory arena
+ * 0 = kNextPowerOfTwo
+ * 1 = kSameAsRequested
+ * Defaults to 0.
+ * \note If a ::OrtArenaCfg has been applied, it will override this field
+ */
+ int migraphx_arena_extend_strategy;
+
} OrtMIGraphXProviderOptions;
/** \brief OpenVINO Provider Options
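A short usage sketch for the new MIGraphX fields, assuming the existing Ort::SessionOptions::AppendExecutionProvider_MIGraphX wrapper is available; the values set below mirror the documented defaults:

```
#include <cstdint>
#include <onnxruntime_cxx_api.h>

// Zero-initialize the options struct, then set the fields we care about
// explicitly before appending the MIGraphX execution provider.
Ort::SessionOptions MakeMIGraphXSessionOptions() {
  OrtMIGraphXProviderOptions migx_options{};
  migx_options.device_id = 0;
  migx_options.migraphx_fp16_enable = 1;
  migx_options.migraphx_fp8_enable = 0;             // new: FP8 precision off
  migx_options.migraphx_mem_limit = SIZE_MAX;       // new: no arena memory limit
  migx_options.migraphx_arena_extend_strategy = 0;  // new: kNextPowerOfTwo

  Ort::SessionOptions so;
  so.AppendExecutionProvider_MIGraphX(migx_options);
  return so;
}
```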
@@ -5275,6 +5312,22 @@ struct OrtApi {
* \since Version 1.23
*/
ORT_API2_STATUS(GetTensorSizeInBytes, _In_ const OrtValue* ort_value, _Out_ size_t* size);
+
+ /** \brief Calls OrtAllocator::GetStats function
+ *
+ * Return a pointer to the OrtKeyValuePairs structure that contains the statistics of the allocator
+ * and the user should call OrtApi::ReleaseKeyValuePairs.
+ *
+ * NOTE: If the allocator does not implement this function, the OrtKeyValuePairs instance will be empty.
+ *
+ * \param[in] ort_allocator The allocator to get stats from
+ * \param[out] out A pointer to the OrtKeyValuePairs instance that contains the stats
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.23.
+ */
+ ORT_API2_STATUS(AllocatorGetStats, _In_ const OrtAllocator* ort_allocator, _Outptr_ OrtKeyValuePairs** out);
};
/*
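A minimal sketch of the intended call sequence for the new stats API, using only the entry points named here plus the long-standing GetAllocatorWithDefaultOptions and ReleaseStatus functions:

```
#include <onnxruntime_c_api.h>

// Query the default CPU allocator's statistics through AllocatorGetStats and
// release the returned OrtKeyValuePairs, as the documentation above requires.
void QueryDefaultAllocatorStats() {
  const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  OrtAllocator* allocator = nullptr;
  OrtStatus* status = api->GetAllocatorWithDefaultOptions(&allocator);
  if (status != nullptr) {
    api->ReleaseStatus(status);
    return;
  }

  OrtKeyValuePairs* stats = nullptr;
  status = api->AllocatorGetStats(allocator, &stats);
  if (status == nullptr) {
    // `stats` holds entries under the documented keys (Limit, InUse, TotalAllocated,
    // MaxInUse, NumAllocs, ...), or is empty if the allocator does not implement GetStats.
    api->ReleaseKeyValuePairs(stats);
  } else {
    api->ReleaseStatus(status);
  }
}
```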
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index c7f81264115c6..08e8736e9e591 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -1740,15 +1740,11 @@ struct ConstValueImpl : Base {
size_t GetStringTensorElementLength(size_t element_index) const;
///
- /// Returns the total size of the tensor data in bytes.
+ /// Returns the total size of the tensor data in bytes. Throws an exception if the OrtValue
+ /// does not contain a tensor or if it contains a tensor that contains strings.
+ /// For numeric tensors, this is sizeof(element_type) * total_element_count.
///
/// The total size of the tensor data in bytes
- /// Throws an exception if the OrtValue does not contain a tensor or
- /// if it contains a tensor that contains strings
- ///
- /// For numeric tensors, this is sizeof(element_type) * total_element_count.
- ///
- ///
size_t GetTensorSizeInBytes() const; ///< Wraps OrtApi::GetTensorSizeInBytes
#if !defined(DISABLE_SPARSE_TENSORS)
@@ -2155,6 +2151,12 @@ struct AllocatorImpl : Base {
MemoryAllocation GetAllocation(size_t size);
void Free(void* p);
ConstMemoryInfo GetInfo() const;
+
+ /** \brief Function that returns the statistics of the allocator.
+ *
+ * \return A KeyValuePairs object containing the allocator statistics.
+ */
+ KeyValuePairs GetStats() const;
};
} // namespace detail
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index 6cd52732b923b..25936038ba297 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -243,6 +243,12 @@ inline ConstMemoryInfo AllocatorImpl::GetInfo() const {
return ConstMemoryInfo{out};
}
+template
+inline KeyValuePairs AllocatorImpl::GetStats() const {
+ OrtKeyValuePairs* out;
+ ThrowOnError(GetApi().AllocatorGetStats(this->p_, &out));
+ return KeyValuePairs(out);
+}
} // namespace detail
inline AllocatorWithDefaultOptions::AllocatorWithDefaultOptions() {
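The same query through the C++ wrapper added above; AllocatorWithDefaultOptions derives from AllocatorImpl, so GetStats is available on it (a sketch, assuming nothing beyond what the headers in this change declare):

```
#include <onnxruntime_cxx_api.h>

// The new AllocatorImpl::GetStats wrapper throws on error and returns an
// Ort::KeyValuePairs wrapping the OrtKeyValuePairs produced by the C API.
void DumpDefaultAllocatorStats() {
  Ort::AllocatorWithDefaultOptions allocator;
  Ort::KeyValuePairs stats = allocator.GetStats();
  // `stats` is empty if the underlying allocator does not implement GetStats.
}
```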
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 5497d7c71a393..97e53e6acee5a 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -67,6 +67,10 @@ static const char* const kOrtSessionOptionsEnableQuantQDQCleanup = "session.enab
// GeluApproximation has side effects which may change the inference results. It is disabled by default due to this.
static const char* const kOrtSessionOptionsEnableGeluApproximation = "optimization.enable_gelu_approximation";
+// Enable or disable Cast chain elimination in graph optimization. "0": disable; "1": enable. The default is "0".
+// CastElimination with chain elimination has side effects which may change the inference results. It is disabled by default due to this.
+static const char* const kOrtSessionOptionsEnableCastChainElimination = "optimization.enable_cast_chain_elimination";
+
// This setting controls whether to enable AheadOfTime function inlining.
// AOT function inlining examines the graph and attempts to inline as many locally defined functions in the model
// as possible with the help of enabled execution providers.
@@ -107,6 +111,37 @@ static const char* const kOrtSessionOptionsMemoryOptimizerProbeConfig = "optimiz
// Default is an empty string which means no optimizers are disabled.
static const char* const kOrtSessionOptionsDisableSpecifiedOptimizers = "optimization.disable_specified_optimizers";
+// It controls whether to run graph optimizations in a loop or not.
+//
+// "0": disable. Graph Optimization Loop is disabled.
+// ```
+// Level 2 --> Level 3 --> InsertCastTransforms --> Level 4
+// ^ |
+// | "No Loop" |
+// | |
+// X xxxxxxxxxxx X
+// ```
+// "1": enable. Graph Optimization Loop is enabled, such that, if optimizations at Level 4 are applied then
+// the loop will check for any other valid optimization that can happen.
+// ```
+// Level 2 --> Level 3 --> InsertCastTransforms --> Level 4
+// ^ |
+// | "Loop only depending on Level 4" |
+// | |
+// ---------------------------------------------------
+// ```
+// "2": enable. Graph Optimization Loop is enabled, such that, if optimizations at Level 2 or above are applied then
+// the loop will check for any other valid optimization that can happen.
+// ```
+// Level 2 --> Level 3 --> InsertCastTransforms --> Level 4
+// ^ |
+// | "Loop" |
+// | |
+// ---------------------------------------------------
+// ```
+// Default value is set to "1".
+static const char* const kOrtSessionOptionsGraphOptimizationsLoopLevel = "session.graph_optimizations_loop_level";
+
// Enable or disable using device allocator for allocating initialized tensor memory. "1": enable; "0": disable. The default is "0".
// Using device allocators means the memory allocation is made using malloc/new.
static const char* const kOrtSessionOptionsUseDeviceAllocatorForInitializers = "session.use_device_allocator_for_initializers";
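A sketch of setting the new options from the C++ API; AddConfigEntry and SetGraphOptimizationLevel already exist, and the key strings are the ones defined in this header. The comments restate the behavior documented above:

```
#include "onnxruntime_cxx_api.h"
#include "onnxruntime_session_options_config_keys.h"

// Opt in to the optimization loop and the cast-chain elimination rule.
Ort::SessionOptions MakeTunedSessionOptions() {
  Ort::SessionOptions so;

  // ORT_ENABLE_LAYOUT (new enum value 3) stops after the layout optimizations;
  // ORT_ENABLE_ALL still applies everything, including the new Level 4 passes.
  so.SetGraphOptimizationLevel(ORT_ENABLE_ALL);

  // "1" (default) re-runs the loop only when a Level 4 optimization fired;
  // "2" re-runs it whenever Level 2 or above changed the graph; "0" disables the loop.
  so.AddConfigEntry(kOrtSessionOptionsGraphOptimizationsLoopLevel, "2");

  // Disabled by default because it may change inference results.
  so.AddConfigEntry(kOrtSessionOptionsEnableCastChainElimination, "1");

  return so;
}
```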
diff --git a/java/src/main/java/ai/onnxruntime/OrtSession.java b/java/src/main/java/ai/onnxruntime/OrtSession.java
index f15ad938463a7..a399d5080ca16 100644
--- a/java/src/main/java/ai/onnxruntime/OrtSession.java
+++ b/java/src/main/java/ai/onnxruntime/OrtSession.java
@@ -652,6 +652,8 @@ public enum OptLevel {
* graph.
*/
EXTENDED_OPT(2),
+ /** Applies all the layout optimizations like NCHW and NCHWC to the ONNX graph. */
+ LAYOUT_OPT(3),
/** Applies all available optimizations to the ONNX graph. */
ALL_OPT(99);
diff --git a/java/src/main/native/OrtJniUtil.c b/java/src/main/native/OrtJniUtil.c
index 6a3c279073860..fe19015d642f0 100644
--- a/java/src/main/native/OrtJniUtil.c
+++ b/java/src/main/native/OrtJniUtil.c
@@ -47,6 +47,8 @@ GraphOptimizationLevel convertOptimizationLevel(jint level) {
return ORT_ENABLE_BASIC;
case 2:
return ORT_ENABLE_EXTENDED;
+ case 3:
+ return ORT_ENABLE_LAYOUT;
case 99:
return ORT_ENABLE_ALL;
default:
diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts
index 4ef4891b5b46a..4a670e24aa6b7 100644
--- a/js/common/lib/inference-session.ts
+++ b/js/common/lib/inference-session.ts
@@ -81,7 +81,7 @@ export declare namespace InferenceSession {
*
* This setting is available only in ONNXRuntime (Node.js binding and react-native) or WebAssembly backend
*/
- graphOptimizationLevel?: 'disabled' | 'basic' | 'extended' | 'all';
+ graphOptimizationLevel?: 'disabled' | 'basic' | 'extended' | 'layout' | 'all';
/**
* Whether enable CPU memory arena.
diff --git a/js/node/src/session_options_helper.cc b/js/node/src/session_options_helper.cc
index b189b45556306..7fff751a29186 100644
--- a/js/node/src/session_options_helper.cc
+++ b/js/node/src/session_options_helper.cc
@@ -31,6 +31,7 @@ const std::unordered_map GRAPH_OPT_LEVEL_NA
{"disabled", ORT_DISABLE_ALL},
{"basic", ORT_ENABLE_BASIC},
{"extended", ORT_ENABLE_EXTENDED},
+ {"layout", ORT_ENABLE_LAYOUT},
{"all", ORT_ENABLE_ALL}};
const std::unordered_map EXECUTION_MODE_NAME_TO_ID_MAP = {{"sequential", ORT_SEQUENTIAL},
diff --git a/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/OnnxruntimeModule.java b/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/OnnxruntimeModule.java
index 1be8c22b40da8..496db5a6087e6 100644
--- a/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/OnnxruntimeModule.java
+++ b/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/OnnxruntimeModule.java
@@ -326,6 +326,7 @@ public WritableMap run(String key, ReadableMap input, ReadableArray output, Read
{"disabled", SessionOptions.OptLevel.NO_OPT},
{"basic", SessionOptions.OptLevel.BASIC_OPT},
{"extended", SessionOptions.OptLevel.EXTENDED_OPT},
+ // {"layout", SessionOptions.OptLevel.LAYOUT_OPT},
{"all", SessionOptions.OptLevel.ALL_OPT},
})
.collect(Collectors.toMap(p -> (String)p[0], p -> (SessionOptions.OptLevel)p[1]));
diff --git a/js/react_native/ios/OnnxruntimeModule.mm b/js/react_native/ios/OnnxruntimeModule.mm
index d3527aad6ae38..b1b55075d26bc 100644
--- a/js/react_native/ios/OnnxruntimeModule.mm
+++ b/js/react_native/ios/OnnxruntimeModule.mm
@@ -301,6 +301,7 @@ - (NSDictionary*)run:(NSString*)url
@"disabled" : @(ORT_DISABLE_ALL),
@"basic" : @(ORT_ENABLE_BASIC),
@"extended" : @(ORT_ENABLE_EXTENDED),
+ @"layout" : @(ORT_ENABLE_LAYOUT),
@"all" : @(ORT_ENABLE_ALL)
};
diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts
index cd787379220c1..26d07b4347131 100644
--- a/js/web/lib/wasm/session-options.ts
+++ b/js/web/lib/wasm/session-options.ts
@@ -14,6 +14,8 @@ const getGraphOptimzationLevel = (graphOptimizationLevel: string | unknown): num
return 1;
case 'extended':
return 2;
+ case 'layout':
+ return 3;
case 'all':
return 99;
default:
diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts
index 23eb2f0978feb..f2a28396d7486 100644
--- a/js/web/lib/wasm/wasm-factory.ts
+++ b/js/web/lib/wasm/wasm-factory.ts
@@ -151,7 +151,12 @@ export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promis
const wasmPathOverride = (wasmPathOverrideFlag as URL)?.href ?? wasmPathOverrideFlag;
const wasmBinaryOverride = flags.wasmBinary;
- const [objectUrl, ortWasmFactory] = await importWasmModule(mjsPathOverride, wasmPrefixOverride, numThreads > 1);
+ const [objectUrl, ortWasmFactory] = await importWasmModule(
+ mjsPathOverride,
+ wasmPrefixOverride,
+ numThreads > 1,
+ !!wasmBinaryOverride || !!wasmPathOverride,
+ );
let isTimeout = false;
diff --git a/js/web/lib/wasm/wasm-utils-import.ts b/js/web/lib/wasm/wasm-utils-import.ts
index a8e27f6f334bc..d9180e220c80c 100644
--- a/js/web/lib/wasm/wasm-utils-import.ts
+++ b/js/web/lib/wasm/wasm-utils-import.ts
@@ -234,9 +234,45 @@ export const importWasmModule = async (
urlOverride: string | undefined,
prefixOverride: string | undefined,
isMultiThreaded: boolean,
+ isWasmOverridden: boolean,
): Promise<[undefined | string, EmscriptenModuleFactory]> => {
- if (!urlOverride && !prefixOverride && embeddedWasmModule && scriptSrc && isSameOrigin(scriptSrc)) {
- return [undefined, embeddedWasmModule];
+ //
+ // Check if we should use the embedded module.
+ //
+
+ // To use the embedded module, it should be available, and no URL override or prefix override should be specified.
+ let useEmbeddedModule = embeddedWasmModule && !(urlOverride || prefixOverride);
+ if (useEmbeddedModule) {
+ if (!scriptSrc) {
+ // no URL info available.
+ //
+ // Note: when the embedded module is available, it means the current script is ESM. Usually, in ESM, the
+ // `import.meta.url` is available. But in some cases (eg. Cloudflare Workers), the value of `import.meta.url`
+ // can be `null` or `undefined`. In this case, we can only load the embedded module when:
+ //
+ // 1. The WebAssembly module binary is overridden:
+ // ```js
+ // env.wasm.wasmPaths = undefined; // or not specified
+ // env.wasm.wasmBinary = /* a Uint8Array containing the WebAssembly binary */;
+ // ```
+ //
+ // 2. The ".wasm" only is overridden.
+ // ```js
+ // env.wasm.wasmPaths = { wasm: /* URL of the .wasm file */ };
+ // ```
+ //
+ if (isWasmOverridden && !isMultiThreaded) {
+ useEmbeddedModule = true;
+ } else {
+ throw new Error('cannot determine the script source URL.');
+ }
+ } else {
+ // if the script source is available, we can check if it is from the same origin.
+ useEmbeddedModule = isSameOrigin(scriptSrc);
+ }
+ }
+ if (useEmbeddedModule) {
+ return [undefined, embeddedWasmModule!];
} else {
const wasmModuleFilename = !BUILD_DEFS.DISABLE_JSEP
? 'ort-wasm-simd-threaded.jsep.mjs'
diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts
index f546f58a28bfa..3bfb89164393e 100644
--- a/js/web/script/test-runner-cli-args.ts
+++ b/js/web/script/test-runner-cli-args.ts
@@ -58,7 +58,7 @@ Options:
*** Session Options ***
-u=<...>, --optimized-model-file-path=<...> Specify whether to dump the optimized model.
-o=<...>, --graph-optimization-level=<...> Specify graph optimization level.
- Default is 'all'. Valid values are 'disabled', 'basic', 'extended', 'all'.
+ Default is 'all'. Valid values are 'disabled', 'basic', 'extended', 'layout', 'all'.
-i=<...>, --io-binding=<...> Specify the IO binding testing type. Should be one of the following:
none (default)
gpu-tensor use pre-allocated GPU tensors for inputs and outputs
@@ -195,7 +195,7 @@ export interface TestRunnerCliArgs {
/**
* Specify graph optimization level
*/
- graphOptimizationLevel: 'disabled' | 'basic' | 'extended' | 'all';
+ graphOptimizationLevel: 'disabled' | 'basic' | 'extended' | 'layout' | 'all';
cpuOptions?: InferenceSession.CpuExecutionProviderOption;
cudaOptions?: InferenceSession.CudaExecutionProviderOption;
@@ -480,7 +480,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs
const graphOptimizationLevel = args['graph-optimization-level'] || args.o || 'all';
if (
typeof graphOptimizationLevel !== 'string' ||
- ['disabled', 'basic', 'extended', 'all'].indexOf(graphOptimizationLevel) === -1
+ ['disabled', 'basic', 'extended', 'layout', 'all'].indexOf(graphOptimizationLevel) === -1
) {
throw new Error(`graph optimization level is invalid: ${graphOptimizationLevel}`);
}
diff --git a/objectivec/include/ort_enums.h b/objectivec/include/ort_enums.h
index 78de233972ccf..61a127f1a4b55 100644
--- a/objectivec/include/ort_enums.h
+++ b/objectivec/include/ort_enums.h
@@ -50,6 +50,7 @@ typedef NS_ENUM(int32_t, ORTGraphOptimizationLevel) {
ORTGraphOptimizationLevelNone,
ORTGraphOptimizationLevelBasic,
ORTGraphOptimizationLevelExtended,
+ ORTGraphOptimizationLevelLayout,
ORTGraphOptimizationLevelAll,
};
diff --git a/objectivec/ort_enums.mm b/objectivec/ort_enums.mm
index 60939812df531..5fcbe34e5e8a4 100644
--- a/objectivec/ort_enums.mm
+++ b/objectivec/ort_enums.mm
@@ -68,6 +68,7 @@
{ORTGraphOptimizationLevelNone, ORT_DISABLE_ALL},
{ORTGraphOptimizationLevelBasic, ORT_ENABLE_BASIC},
{ORTGraphOptimizationLevelExtended, ORT_ENABLE_EXTENDED},
+ {ORTGraphOptimizationLevelLayout, ORT_ENABLE_LAYOUT},
{ORTGraphOptimizationLevelAll, ORT_ENABLE_ALL},
};
diff --git a/onnxruntime/contrib_ops/cuda/bert/cudnn_fmha/cudnn_flash_attention.cu b/onnxruntime/contrib_ops/cuda/bert/cudnn_fmha/cudnn_flash_attention.cc
similarity index 99%
rename from onnxruntime/contrib_ops/cuda/bert/cudnn_fmha/cudnn_flash_attention.cu
rename to onnxruntime/contrib_ops/cuda/bert/cudnn_fmha/cudnn_flash_attention.cc
index aabbe4cc7582a..ec5deccf655ff 100644
--- a/onnxruntime/contrib_ops/cuda/bert/cudnn_fmha/cudnn_flash_attention.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/cudnn_fmha/cudnn_flash_attention.cc
@@ -313,8 +313,7 @@ struct BytesHash {
// Use thread local caches because cuDNN execution plans are not guaranteed to be thread safe.
// TODO(tianleiwu): since the key includes sequence lengths, we may want to limit the cache size.
-thread_local
-std::unordered_map, BytesHash > mha_graph_cache;
+thread_local std::unordered_map, BytesHash > mha_graph_cache;
void run(
void* output,
@@ -341,7 +340,6 @@ void run(
cudnnHandle_t handle,
Stream* stream,
AllocatorPtr allocator) {
-
GraphParams params;
params.batch_size = batch_size;
params.num_heads_q = num_heads_q;
diff --git a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu
index 8a17e945df3f3..e6f1798f6ef72 100644
--- a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu
@@ -39,7 +39,13 @@ __global__ void MaskIndexKernelSmall(int sequence_length, const int* mask, int*
// blockIdx.x is b
const int offset = blockIdx.x * sequence_length; // batch strides of sequence_length
+#if CUDA_VERSION >= 12090
+ ::cuda::minimum min;
+#else
+ // Deprecated on CUDA 12.9
cub::Min min;
+#endif
+
int thread_data(sequence_length);
const int idx = offset + threadIdx.x;
@@ -66,7 +72,13 @@ __global__ void MaskIndexKernel(int sequence_length, const int* mask, int* mask_
// blockIdx.x is b
const int offset = blockIdx.x * sequence_length; // batch strides of sequence_length
+#if CUDA_VERSION >= 12090
+ ::cuda::minimum min;
+#else
+ // Deprecated on CUDA 12.9
cub::Min min;
+#endif
+
int thread_data(sequence_length);
for (int i = threadIdx.x; i < sequence_length; i += TPB) {
diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_extensions/gemm_configs.h b/onnxruntime/contrib_ops/cuda/llm/cutlass_extensions/gemm_configs.h
index e48ef3f154883..e7ba5d4b54f05 100644
--- a/onnxruntime/contrib_ops/cuda/llm/cutlass_extensions/gemm_configs.h
+++ b/onnxruntime/contrib_ops/cuda/llm/cutlass_extensions/gemm_configs.h
@@ -376,24 +376,19 @@ struct CutlassGemmConfig {
};
inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& config) {
- // clang-format off
- if (config.is_tma_warp_specialized)
- {
- out << "tile_config_sm90_enum: " << config.getTileConfigAsInt()
- << ", mainloop_schedule_enum: " << int(config.mainloop_schedule)
- << ", epilogue_schedule_enum: " << int(config.epilogue_schedule)
- << ", cluster_shape_enum: " << int(config.cluster_shape)
- << ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false");
- }
- else
- {
- out << "tile_config_enum: " << config.getTileConfigAsInt()
- << ", split_k_style_enum: " << int(config.split_k_style)
- << ", split_k_factor: " << config.split_k_factor
- << ", stages: " << config.stages
- << ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false");
- }
- // clang-format on
+ if (config.is_tma_warp_specialized) {
+ out << "tile_config_sm90_enum: " << config.getTileConfigAsInt()
+ << ", mainloop_schedule_enum: " << int(config.mainloop_schedule)
+ << ", epilogue_schedule_enum: " << int(config.epilogue_schedule)
+ << ", cluster_shape_enum: " << int(config.cluster_shape)
+ << ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false");
+ } else {
+ out << "tile_config_enum: " << config.getTileConfigAsInt()
+ << ", split_k_style_enum: " << int(config.split_k_style)
+ << ", split_k_factor: " << config.split_k_factor
+ << ", stages: " << config.stages
+ << ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false");
+ }
return out;
}
diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_preprocessors.cc b/onnxruntime/contrib_ops/cuda/llm/cutlass_preprocessors.cc
deleted file mode 100644
index 50ee944161538..0000000000000
--- a/onnxruntime/contrib_ops/cuda/llm/cutlass_preprocessors.cc
+++ /dev/null
@@ -1,687 +0,0 @@
-/*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "contrib_ops/cuda/llm/cutlass_preprocessors.h"
-
-#include
-
-#include "core/common/common.h"
-#include "contrib_ops/cuda/llm/common/cuda_runtime_utils.h"
-#include "contrib_ops/cuda/llm/common/logger.h"
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#endif
-
-#include "cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h"
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic pop
-#endif
-
-using namespace onnxruntime::llm::common;
-
-namespace onnxruntime::llm {
-namespace kernels {
-namespace cutlass_kernels {
-
-struct LayoutDetails {
- enum class Layout {
- UNKNOWN,
- ROW_MAJOR,
- COLUMN_MAJOR
- };
-
- Layout layoutB = Layout::UNKNOWN;
- int rows_per_column_tile = 1;
- int columns_interleaved = 1;
-
- bool uses_imma_ldsm = false;
-};
-
-template
-struct getLayoutDetails {
-};
-
-template <>
-struct getLayoutDetails {
- LayoutDetails operator()() {
- LayoutDetails layout_details;
- layout_details.layoutB = LayoutDetails::Layout::ROW_MAJOR;
- return layout_details;
- }
-};
-
-template <>
-struct getLayoutDetails {
- LayoutDetails operator()() {
- LayoutDetails layout_details;
- layout_details.layoutB = LayoutDetails::Layout::COLUMN_MAJOR;
- return layout_details;
- }
-};
-
-template
-struct getLayoutDetails> {
- LayoutDetails operator()() {
- LayoutDetails layout_details;
- layout_details.layoutB = LayoutDetails::Layout::COLUMN_MAJOR;
- layout_details.rows_per_column_tile = RowsPerTile;
- layout_details.columns_interleaved = ColumnsInterleaved;
- return layout_details;
- }
-};
-
-template
-LayoutDetails getLayoutDetailsForArchAndQuantType() {
- using CompileTraits = cutlass::gemm::kernel::LayoutDetailsB;
- using LayoutB = typename CompileTraits::Layout;
- using MmaOperator = typename CompileTraits::Operator;
- LayoutDetails details = getLayoutDetails()();
- details.uses_imma_ldsm = std::is_same::value;
- return details;
-}
-
-template
-LayoutDetails getLayoutDetailsForArch(QuantType quant_type) {
- LayoutDetails details;
- switch (quant_type) {
- case QuantType::W8_A16:
- details = getLayoutDetailsForArchAndQuantType();
- break;
- case QuantType::W4_A16:
- details = getLayoutDetailsForArchAndQuantType();
- break;
- case QuantType::W4_AFP8:
- details = getLayoutDetailsForArchAndQuantType();
- break;
- default:
- ORT_THROW("Unsupported quantization type");
- }
- return details;
-}
-
-LayoutDetails getLayoutDetailsForTransform(QuantType quant_type, int arch) {
- if (arch >= 75 && arch < 80) {
- return getLayoutDetailsForArch(quant_type);
- } else if (arch >= 80 && arch < 90) {
- return getLayoutDetailsForArch(quant_type);
- } else if (arch >= 90 && arch < 100) {
- return getLayoutDetailsForArch(quant_type);
- } else if (arch >= 100) {
- return getLayoutDetailsForArch(quant_type);
- } else {
- ORT_THROW("Unsupported Arch");
- return LayoutDetails();
- }
-}
-
-// Permutes the rows of B in a way that is compatible with Turing+ architectures.
-//
-// Throws an error for other architectures.
-// The data is permuted such that:
-// For W8_A16, each group of 16 rows is permuted using the map below:
-// 0 1 8 9 2 3 10 11 4 5 12 13 6 7 14 15
-// For W4_A16, each group of 32 rows is permuted using the map below:
-// 0 1 8 9 16 17 24 25 2 3 10 11 18 19 26 27 4 5 12 13 20 21 28 29 6 7 14 15 22 23 30 31
-// For W4_A8, see the map in the code. The idea is similar to above.
-// The goal of this permutation is to ensure data ends up in the correct threads after
-// we execute LDSM. It counteracts the effect of the data being of different widths.
-// For more information about the expected layouts, see the MMA section in the PTX docs.
-std::vector<int> get_permutation_map(QuantType quant_type) {
- if (quant_type == QuantType::W8_A16) {
- return {0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
- } else if (quant_type == QuantType::W4_A16) {
- return {0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15,
- 22, 23, 30, 31};
- } else if (quant_type == QuantType::W4_AFP8) {
- return {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23, 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15,
- 28, 29, 30, 31};
- } else {
- ORT_THROW("Invalid quantization type for LDSM permutation");
- }
-}
-
-void permute_B_rows_for_mixed_gemm(int8_t* permuted_quantized_tensor, int8_t const* quantized_tensor,
- std::vector<size_t> const& shape, QuantType quant_type) {
- ORT_LLM_LOG_TRACE(__PRETTY_FUNCTION__);
- // We only want to run this step for weight only quant.
- std::vector<int> row_permutation = get_permutation_map(quant_type);
-
- ORT_ENFORCE(shape.size() == 2 || shape.size() == 3, "Shape must be 2-D or 3-D");
- const size_t num_experts = shape.size() == 2 ? 1 : shape[0];
- const size_t num_rows = shape.size() == 2 ? shape[0] : shape[1];
- const size_t num_cols = shape.size() == 2 ? shape[1] : shape[2];
-
- int const BITS_PER_ELT = get_weight_quant_bits(quant_type);
- int const K = 16 / BITS_PER_ELT;
-
- uint32_t const* input_byte_ptr = reinterpret_cast<uint32_t const*>(quantized_tensor);
- uint32_t* output_byte_ptr = reinterpret_cast<uint32_t*>(permuted_quantized_tensor);
-
- int MMA_SHAPE_N = 8;
- int B_ROWS_PER_MMA = 8 * K;
- int const elts_in_int32 = 32 / BITS_PER_ELT;
-
- int const num_vec_cols = num_cols / elts_in_int32;
-
- ORT_ENFORCE(num_rows % B_ROWS_PER_MMA == 0,
- "Invalid shape for quantized tensor. Number of rows of quantized matrix must be a multiple of ",
- B_ROWS_PER_MMA);
- ORT_ENFORCE(num_cols % MMA_SHAPE_N == 0,
- "Invalid shape for quantized tensor. On turing/Ampere, the number of cols must be a multiple of ",
- MMA_SHAPE_N);
-
- ORT_ENFORCE(size_t(B_ROWS_PER_MMA) == row_permutation.size(), "Unexpected number of LDSM rows permuted.");
-
- for (int expert = 0; expert < static_cast<int>(num_experts); ++expert) {
- const int64_t matrix_offset = expert * int64_t(num_rows) * int64_t(num_vec_cols);
- for (int base_row = 0; base_row < static_cast<int>(num_rows); base_row += B_ROWS_PER_MMA) {
- for (int tile_row = 0; tile_row < B_ROWS_PER_MMA; ++tile_row) {
- for (int write_col = 0; write_col < num_vec_cols; ++write_col) {
- int const write_row = base_row + tile_row;
- int const tile_read_row = row_permutation[tile_row];
- int const read_row = base_row + tile_read_row;
- int const read_col = write_col;
-
- const int64_t read_offset = matrix_offset + int64_t(read_row) * num_vec_cols + read_col;
- const int64_t write_offset = matrix_offset + int64_t(write_row) * num_vec_cols + write_col;
-
- output_byte_ptr[write_offset] = input_byte_ptr[read_offset];
- }
- }
- }
- }
-}
-
-// We need to use this transpose to correctly handle packed int4 and int8 data
-// The reason this code is relatively complex is that the "trivial" loops took a substantial
-// amount of time to transpose leading to long preprocessing times. This seemed to be a big
-// issue for relatively large models.
-template <QuantType quant_type>
-void subbyte_transpose_impl(
- int8_t* transposed_quantized_tensor, int8_t const* quantized_tensor, std::vector<size_t> const& shape) {
- ORT_LLM_LOG_TRACE(__PRETTY_FUNCTION__);
- constexpr int bits_per_elt = get_weight_quant_bits(quant_type);
-
- ORT_ENFORCE(shape.size() == 2 || shape.size() == 3, "Shape must be 2-D or 3-D");
- const size_t num_experts = shape.size() == 2 ? 1 : shape[0];
- const size_t num_rows = shape.size() == 2 ? shape[0] : shape[1];
- const size_t num_cols = shape.size() == 2 ? shape[1] : shape[2];
-
- const size_t col_bytes = num_cols * bits_per_elt / 8;
- const size_t col_bytes_trans = num_rows * bits_per_elt / 8;
-
- uint8_t const* input_byte_ptr = reinterpret_cast<uint8_t const*>(quantized_tensor);
- uint8_t* output_byte_ptr = reinterpret_cast<uint8_t*>(transposed_quantized_tensor);
-
- static constexpr int ELTS_PER_BYTE = 8 / bits_per_elt;
-
- static constexpr int M_TILE_L1 = 64;
- static constexpr int N_TILE_L1 = M_TILE_L1 / ELTS_PER_BYTE;
- uint8_t cache_buf[M_TILE_L1][N_TILE_L1];
-
- static constexpr int VECTOR_WIDTH = std::min(32, N_TILE_L1);
-
- // We assume the dims are a multiple of vector width. Our kernels only handle dims which are multiples
- // of 64 for weight-only quantization. As a result, this seemed like a reasonable tradeoff because it
- // allows GCC to emit vector instructions.
- ORT_ENFORCE(!(col_bytes_trans % VECTOR_WIDTH) && !(col_bytes % VECTOR_WIDTH),
- "Number of bytes for rows and cols must be a multiple of ", VECTOR_WIDTH, ". However, num_rows_bytes = ",
- col_bytes_trans, " and num_col_bytes = ", col_bytes);
-
- for (size_t expert = 0; expert < num_experts; ++expert) {
- const size_t matrix_offset = expert * num_rows * col_bytes;
- for (size_t row_tile_start = 0; row_tile_start < num_rows; row_tile_start += M_TILE_L1) {
- for (size_t col_tile_start_byte = 0; col_tile_start_byte < col_bytes; col_tile_start_byte += N_TILE_L1) {
- int const row_limit = std::min(row_tile_start + M_TILE_L1, num_rows);
- int const col_limit = std::min(col_tile_start_byte + N_TILE_L1, col_bytes);
-
- for (int ii = 0; ii < M_TILE_L1; ++ii) {
- int const row = row_tile_start + ii;
-
- for (int jj = 0; jj < N_TILE_L1; jj += VECTOR_WIDTH) {
- int const col = col_tile_start_byte + jj;
-
- const size_t logical_src_offset = matrix_offset + row * col_bytes + col;
-
- if (row < row_limit && col < col_limit) {
- for (int v = 0; v < VECTOR_WIDTH; ++v) {
- cache_buf[ii][jj + v] = input_byte_ptr[logical_src_offset + v];
- }
- }
- }
- }
-
- if constexpr (bits_per_elt == 8) {
- for (int ii = 0; ii < M_TILE_L1; ++ii) {
- for (int jj = ii + 1; jj < N_TILE_L1; ++jj) {
- std::swap(cache_buf[ii][jj], cache_buf[jj][ii]);
- }
- }
- } else if constexpr (bits_per_elt == 4) {
- for (int ii = 0; ii < M_TILE_L1; ++ii) {
- // Using M_TILE_L1 here is deliberate since we assume that the cache tile
- // is square in the number of elements (not necessarily the number of bytes).
- for (int jj = ii + 1; jj < M_TILE_L1; ++jj) {
- int const ii_byte = ii / ELTS_PER_BYTE;
- int const ii_bit_offset = ii % ELTS_PER_BYTE;
-
- int const jj_byte = jj / ELTS_PER_BYTE;
- int const jj_bit_offset = jj % ELTS_PER_BYTE;
-
- uint8_t src_elt = 0xF & (cache_buf[ii][jj_byte] >> (4 * jj_bit_offset));
- uint8_t tgt_elt = 0xF & (cache_buf[jj][ii_byte] >> (4 * ii_bit_offset));
-
- cache_buf[ii][jj_byte] &= (0xF0 >> (4 * jj_bit_offset));
- cache_buf[jj][ii_byte] &= (0xF0 >> (4 * ii_bit_offset));
-
- cache_buf[ii][jj_byte] |= (tgt_elt << (4 * jj_bit_offset));
- cache_buf[jj][ii_byte] |= (src_elt << (4 * ii_bit_offset));
- }
- }
- } else {
- ORT_THROW("Unsupported quantization type.");
- }
-
- const size_t row_tile_start_trans = col_tile_start_byte * ELTS_PER_BYTE;
- const size_t col_tile_start_byte_trans = row_tile_start / ELTS_PER_BYTE;
-
- int const row_limit_trans = std::min(row_tile_start_trans + M_TILE_L1, num_cols);
- int const col_limit_trans = std::min(col_tile_start_byte_trans + N_TILE_L1, col_bytes_trans);
-
- for (int ii = 0; ii < M_TILE_L1; ++ii) {
- int const row = row_tile_start_trans + ii;
- for (int jj = 0; jj < N_TILE_L1; jj += VECTOR_WIDTH) {
- int const col = col_tile_start_byte_trans + jj;
-
- const size_t logical_tgt_offset = matrix_offset + row * col_bytes_trans + col;
-
- if (row < row_limit_trans && col < col_limit_trans) {
- for (int v = 0; v < VECTOR_WIDTH; ++v) {
- output_byte_ptr[logical_tgt_offset + v] = cache_buf[ii][jj + v];
- }
- }
- }
- }
- }
- }
- }
-}
-
-void subbyte_transpose(int8_t* transposed_quantized_tensor, int8_t const* quantized_tensor,
- std::vector<size_t> const& shape, QuantType quant_type) {
- ORT_LLM_LOG_TRACE(__PRETTY_FUNCTION__);
-
- if (quant_type == QuantType::W8_A16) {
- subbyte_transpose_impl<QuantType::W8_A16>(transposed_quantized_tensor, quantized_tensor, shape);
- } else if (quant_type == QuantType::W4_A16) {
- subbyte_transpose_impl<QuantType::W4_A16>(transposed_quantized_tensor, quantized_tensor, shape);
- } else if (quant_type == QuantType::W4_AFP8) {
- subbyte_transpose_impl<QuantType::W4_AFP8>(transposed_quantized_tensor, quantized_tensor, shape);
- } else {
- ORT_THROW("Invalid quant_type");
- }
-}
-
-void add_bias_and_interleave_int8s_inplace(int8_t* int8_tensor, const size_t num_elts) {
- for (size_t ii = 0; ii < num_elts; ++ii) {
- int8_tensor[ii] = int8_t(int(int8_tensor[ii]) + 128);
- }
-
- // Step 2 will transform the layout of a 32-bit register in CUDA in order to match the int4 layout. This has no
- // performance benefit and is purely so that int4 and int8 have the same layout.
- // Pictorially, this does the following:
- // bit 32 0
- // [elt_3 elt_2 elt_1 elt_0] (each elt occupies 8 bits)
- //
- // And it will rearrange the output 32 bit register to be the following:
- // bit 32 0
- // [elt_3 elt_1 elt_2 elt_0] (each elt occupies 8 bits)
-
- ORT_ENFORCE(num_elts % 4 == 0, "Dimensions of int8 tensor must be a multiple of 4 for register relayout");
- for (size_t base = 0; base < num_elts; base += 4) {
- std::swap(int8_tensor[base + 1], int8_tensor[base + 2]);
- }
-}
-
-void add_bias_and_interleave_int4s_inplace(int8_t* packed_int4_tensor, const size_t num_elts) {
- size_t const num_bytes = num_elts / 2;
-
- // Step 1 will be to transform all the int4s to unsigned in order to make the dequantize take as little
- // instructions as possible in the CUDA code.
- for (size_t ii = 0; ii < num_bytes; ++ii) {
- int8_t transformed_packed_int4s = 0;
- int8_t transformed_first_elt = (int8_t(packed_int4_tensor[ii] << 4) >> 4) + 8; // The double shift here is to ensure sign extension
- int8_t transformed_second_elt = (packed_int4_tensor[ii] >> 4) + 8;
-
- ORT_ENFORCE(
- transformed_first_elt >= 0 && transformed_first_elt <= 15, "Illegal result for int4 transform (first elt)");
- ORT_ENFORCE(transformed_second_elt >= 0 && transformed_second_elt <= 15,
- "Illegal result for int4 transform (second elt)");
-
- // We don't need to mask in these ops since everything should be in the range 0-15
- transformed_packed_int4s |= transformed_first_elt;
- transformed_packed_int4s |= (transformed_second_elt << 4);
- packed_int4_tensor[ii] = transformed_packed_int4s;
- }
-
- // Step 2 will transform the layout of a 32-bit register in CUDA in order to minimize the number of shift & logical
- // instructions That are needed to extract the int4s in the GEMM main loop. Pictorially, the loop below will do the
- // following: Take as input a 32 bit register with layout: bit 32 0
- // [elt_7 elt_6 elt_5 elt_4 elt_3 elt_2 elt_1 elt_0] (each elt occupies 4 bits)
- //
- // And it will rearrange the output 32 bit register to be the following:
- // bit 32 0
- // [elt_7 elt_5 elt_3 elt_1 elt_6 elt_4 elt_2 elt_0] (each elt occupies 4 bits)
-
- ORT_ENFORCE(num_bytes % 4 == 0, "Dimensions of int4 tensor must be a multiple of 8 for register relayout");
- const size_t num_registers = num_bytes / 4;
-
- uint32_t* register_ptr = reinterpret_cast<uint32_t*>(packed_int4_tensor);
- for (size_t ii = 0; ii < num_registers; ++ii) {
- const uint32_t current_register = register_ptr[ii];
- uint32_t transformed_register = 0;
-
- for (int dest_idx = 0; dest_idx < 8; ++dest_idx) {
- int const src_idx = dest_idx < 4 ? 2 * dest_idx : 2 * (dest_idx - 4) + 1;
- int const src_shift = 4 * src_idx;
- int const dest_shift = 4 * dest_idx;
-
- const uint32_t src_bits = (current_register >> src_shift) & 0xF;
- transformed_register |= (src_bits << dest_shift);
- }
- register_ptr[ii] = transformed_register;
- }
-}
-
-void add_bias_and_interleave_quantized_tensor_inplace(int8_t* tensor, const size_t num_elts, QuantType quant_type) {
- ORT_LLM_LOG_TRACE(__PRETTY_FUNCTION__);
- if (quant_type == QuantType::W8_A16) {
- add_bias_and_interleave_int8s_inplace(tensor, num_elts);
- } else if (quant_type == QuantType::W4_A16 || quant_type == QuantType::W4_AFP8) {
- // W4_AFP8 uses the same preprocessor as W4_A16 because the FP8 data must
- // be converted to FP16 before the scales can be applied using CUDA cores.
- // As a result, we still want to permute the data so that it is well aligned
- // for conversion to FP16.
- add_bias_and_interleave_int4s_inplace(tensor, num_elts);
- } else {
- ORT_THROW("Invalid quantization type for interleaving.");
- }
-}
-
-void interleave_column_major_tensor(int8_t* interleaved_quantized_tensor, int8_t const* quantized_tensor,
- std::vector<size_t> const& shape, QuantType quant_type, LayoutDetails details) {
- ORT_LLM_LOG_TRACE(__PRETTY_FUNCTION__);
-
- ORT_ENFORCE(shape.size() == 2 || shape.size() == 3, "Shape must be 2-D or 3-D");
- const size_t num_experts = shape.size() == 2 ? 1 : shape[0];
- const size_t num_rows = shape.size() == 2 ? shape[0] : shape[1];
- const size_t num_cols = shape.size() == 2 ? shape[1] : shape[2];
-
- int const BITS_PER_ELT = get_weight_quant_bits(quant_type);
- int const elts_in_int32 = 32 / BITS_PER_ELT;
-
- int const rows_per_tile = details.rows_per_column_tile;
-
- ORT_ENFORCE(!(num_rows % elts_in_int32),
- "The number of rows must be a multiple of ", elts_in_int32, " but the number of rows is ", num_rows);
-
- uint32_t const* input_byte_ptr = reinterpret_cast<uint32_t const*>(quantized_tensor);
- uint32_t* output_byte_ptr = reinterpret_cast<uint32_t*>(interleaved_quantized_tensor);
-
- ORT_ENFORCE(!(num_rows % rows_per_tile),
- "The number of rows must be a multiple of ", rows_per_tile, " but the number of rows is ", num_rows);
-
- int const num_vec_rows = num_rows / elts_in_int32;
- int const vec_rows_per_tile = rows_per_tile / elts_in_int32;
- int const interleave = details.columns_interleaved;
-
- for (int expert = 0; expert < static_cast<int>(num_experts); ++expert) {
- const int64_t matrix_offset = expert * int64_t(num_vec_rows) * int64_t(num_cols);
- for (int64_t read_col = 0; read_col < static_cast<int64_t>(num_cols); ++read_col) {
- const int64_t write_col = read_col / interleave;
- for (int base_vec_row = 0; base_vec_row < num_vec_rows; base_vec_row += vec_rows_per_tile) {
- for (int vec_read_row = base_vec_row;
- vec_read_row < std::min(num_vec_rows, base_vec_row + vec_rows_per_tile); ++vec_read_row) {
- const int64_t vec_write_row = interleave * base_vec_row + vec_rows_per_tile * (read_col % interleave) + vec_read_row % vec_rows_per_tile;
-
- const int64_t read_offset = matrix_offset + read_col * num_vec_rows + vec_read_row;
- const int64_t write_offset = matrix_offset + int64_t(write_col) * num_vec_rows * interleave + vec_write_row;
- output_byte_ptr[write_offset] = input_byte_ptr[read_offset];
- }
- }
- }
- }
-}
-
-void preprocess_weights_for_mixed_gemm(int8_t* preprocessed_quantized_weight, int8_t const* row_major_quantized_weight,
- std::vector<size_t> const& shape, QuantType quant_type, bool force_interleave) {
- int arch = getSMVersion();
- if (force_interleave && arch >= 90) {
- // Workaround for MOE which doesn't have specialized Hopper/Blackwell kernels yet
- arch = 80;
- }
- // Force use sm80 kernel for GB20x.
- if (arch >= 100) {
- arch = 80;
- }
- LayoutDetails details = getLayoutDetailsForTransform(quant_type, arch);
-
- ORT_ENFORCE(shape.size() == 2 || shape.size() == 3, "Shape must be 2-D or 3-D");
-
- size_t num_elts = 1;
- for (auto const& dim : shape) {
- num_elts *= dim;
- }
-
- const size_t num_bytes = num_elts * get_weight_quant_bits(quant_type) / 8;
-
- std::vector<int8_t> src_buf(num_bytes);
- std::vector<int8_t> dst_buf(num_bytes);
- std::copy(row_major_quantized_weight, row_major_quantized_weight + num_bytes, src_buf.begin());
-
- // Works on row major data, so issue this permutation first.
- if (details.uses_imma_ldsm) {
- permute_B_rows_for_mixed_gemm(dst_buf.data(), src_buf.data(), shape, quant_type);
- src_buf.swap(dst_buf);
- }
-
- if (details.layoutB == LayoutDetails::Layout::COLUMN_MAJOR) {
- subbyte_transpose(dst_buf.data(), src_buf.data(), shape, quant_type);
- src_buf.swap(dst_buf);
- }
-
- if (details.columns_interleaved > 1 && arch != 90) {
- interleave_column_major_tensor(dst_buf.data(), src_buf.data(), shape, quant_type, details);
- src_buf.swap(dst_buf);
- }
-
- add_bias_and_interleave_quantized_tensor_inplace(src_buf.data(), num_elts, quant_type);
- std::copy(src_buf.begin(), src_buf.end(), preprocessed_quantized_weight);
-}
-
-/*
- Arguments:
- input_weight_ptr - the weight tensor to be quantized. Must be 2-D or 3-D and of type FP16.
-
- quant_type - the type of the output quantization weight.
-
- This function does symmetric quantization on 2-D or 3-D tensors. It uses the full int range and assumes the
- zero-point is zero and will automatically construct the scales.
-
- It always quantizes the last axis of the tensor. For 3-D tensors, it operates in "batched" mode where the tensor is
- viewed as a stack of matrices and a scale is produced for each column of every matrix.
-
-Outputs
- processed_quantized_weight - quantized AND processed weight for GEMM. This MUST be used with the CUTLASS GEMM
- unprocessed_quantized_weight - quantized but unprocessed weights. Useful for reference checking.
- scale_ptr - scales for the quantized weight.
-
- Note that the returned quantized_weights will be preprocessed in a way to accelerate the mixed type GEMM. The data
- layout may not make sense if printed.
-
- Shapes:
- quant_type == int8:
- If weight is a [m,n] matrix, quantized_weights will have shape [m,n] and scales of shape [n]
- If weight is a [b,m,n] tensor, unprocessed_quantized_weight will have shape [b,m,n] and scales of shape [b,n]
- quant_type == int4:
- If weight is a [m,n] matrix, quantized_weights will have shape [m, ceil(n/2)] and scales of shape [n]
- If weight is a [b,m,n] tensor, unprocessed_quantized_weight will have shape [b,m, ceil(n/2)] and scales of shape
- [b,n]
-
- The quantized_weight will be of type torch.int8 and have two int4 values packed in a single byte. This is the
- reason for halving the shape. At the time of writing this code, there was not an elegant way to handle this kind
- of batched quantization using torch's quantized tensors (to the best of the author's knowledge). Scale tensors
- must have a dimension of 1, which breaks the semantics we need for batched weights.
- */
-
-template <typename ComputeType, typename WeightType>
-void symmetric_quantize(int8_t* processed_quantized_weight, int8_t* unprocessed_quantized_weight,
- ComputeType* scale_ptr, WeightType const* input_weight_ptr, std::vector<size_t> const& shape, QuantType quant_type,
- bool force_interleave) {
- ORT_ENFORCE(processed_quantized_weight, "Processed quantized tensor is NULL");
- ORT_ENFORCE(scale_ptr, "Scale output pointer is NULL");
- ORT_ENFORCE(input_weight_ptr, "Input weight pointer is NULL");
-
- ORT_ENFORCE(shape.size() == 2 || shape.size() == 3, "Shape must be 2-D or 3-D");
- const size_t num_experts = shape.size() == 2 ? 1 : shape[0];
- const size_t num_rows = shape.size() == 2 ? shape[0] : shape[1];
- const size_t num_cols = shape.size() == 2 ? shape[1] : shape[2];
-
- int const bits_in_type = get_weight_quant_bits(quant_type);
- int const bytes_per_out_col = num_cols * bits_in_type / 8;
-
- int const bits_per_weigtht_element = get_weight_quant_bits(quant_type);
-
- std::vector<int8_t> weight_buf;
- if (unprocessed_quantized_weight == nullptr) {
- weight_buf.resize(num_experts * num_rows * num_cols);
- unprocessed_quantized_weight = weight_buf.data();
- }
-
- int const input_mat_size = num_rows * num_cols;
- int const quantized_mat_size = num_rows * bytes_per_out_col;
- float const quant_range_scale = 1.f / float(1 << (bits_in_type - 1));
-
- std::vector<float> per_col_max(num_cols);
-
- for (int expert = 0; expert < static_cast<int>(num_experts); ++expert) {
- WeightType const* current_weight = input_weight_ptr + expert * input_mat_size;
- int8_t* current_quantized_weight = unprocessed_quantized_weight + expert * quantized_mat_size;
-
- // First we find the per column max for this expert weight.
- for (size_t jj = 0; jj < num_cols; ++jj) {
- per_col_max[jj] = 0.f;
- }
-
- for (size_t ii = 0; ii < num_rows; ++ii) {
- WeightType const* current_weight_row = current_weight + ii * num_cols;
- for (size_t jj = 0; jj < num_cols; ++jj) {
- per_col_max[jj] = std::max(per_col_max[jj], std::abs(float(current_weight_row[jj])));
- }
- }
-
- // Then, we construct the scales
- ComputeType* current_scales = scale_ptr + expert * num_cols;
- for (size_t jj = 0; jj < num_cols; ++jj) {
- per_col_max[jj] *= quant_range_scale;
- current_scales[jj] = ComputeType(per_col_max[jj]);
- }
-
- // Finally, construct the weights.
- for (size_t ii = 0; ii < num_rows; ++ii) {
- int8_t* current_quantized_weight_row = current_quantized_weight + ii * bytes_per_out_col;
- WeightType const* current_weight_row = current_weight + ii * num_cols;
- for (int jj = 0; jj < bytes_per_out_col; ++jj) {
- if (bits_per_weigtht_element == 8) {
- float const col_scale = per_col_max[jj];
- float const weight_elt = float(current_weight_row[jj]);
- float const scaled_weight = (col_scale != 0.0f) ? round(weight_elt / col_scale) : 0.0f;
- const int8_t clipped_weight = int8_t(std::max(-128.f, std::min(127.f, scaled_weight)));
- current_quantized_weight_row[jj] = clipped_weight;
- } else if (bits_per_weigtht_element == 4) {
- // We will pack two int4 elements per iteration of the inner loop.
- int8_t packed_int4s = 0;
- for (int packed_idx = 0; packed_idx < 2; ++packed_idx) {
- int const input_idx = 2 * jj + packed_idx;
- if (input_idx < static_cast<int>(num_cols)) {
- float const col_scale = per_col_max[input_idx];
- float const weight_elt = float(current_weight_row[input_idx]);
- float const scaled_weight = (col_scale != 0.0f) ? round(weight_elt / col_scale) : 0.0f;
- int int_weight = int(scaled_weight);
- const int8_t clipped_weight = std::max(-8, std::min(7, int_weight));
-
- // Kill the sign extension bits (hence 0x0F mask) then shift to upper bits
- // if packing the second int4 and or the bits into the final result.
- packed_int4s |= ((clipped_weight & 0x0F) << (4 * packed_idx));
- }
- }
- current_quantized_weight_row[jj] = packed_int4s;
- } else {
- ORT_THROW("Unsupported quantization type");
- }
- }
- }
- }
-
- preprocess_weights_for_mixed_gemm(
- processed_quantized_weight, unprocessed_quantized_weight, shape, quant_type, force_interleave);
-}
-
-template void symmetric_quantize<half, float>(
- int8_t*, int8_t*, half*, float const*, std::vector<size_t> const&, QuantType, bool);
-
-template void symmetric_quantize<half, half>(
- int8_t*, int8_t*, half*, half const*, std::vector<size_t> const&, QuantType, bool);
-
-template void symmetric_quantize<__nv_bfloat16, __nv_bfloat16>(
- int8_t*, int8_t*, __nv_bfloat16*, __nv_bfloat16 const*, std::vector<size_t> const&, QuantType, bool);
-
-template void symmetric_quantize<__nv_bfloat16, float>(
- int8_t*, int8_t*, __nv_bfloat16*, float const*, std::vector<size_t> const&, QuantType, bool);
-
-template <typename ComputeType, typename WeightType>
-void symmetric_quantize(int8_t* processed_quantized_weight, ComputeType* scale_ptr, WeightType const* input_weight_ptr,
- std::vector<size_t> const& shape, QuantType quant_type, bool force_interleave) {
- symmetric_quantize(
- processed_quantized_weight, nullptr, scale_ptr, input_weight_ptr, shape, quant_type, force_interleave);
-}
-
-template void symmetric_quantize<float, float>(
- int8_t*, float*, float const*, std::vector<size_t> const&, QuantType, bool);
-
-template void symmetric_quantize<half, float>(
- int8_t*, half*, float const*, std::vector<size_t> const&, QuantType, bool);
-
-template void symmetric_quantize<half, half>(int8_t*, half*, half const*, std::vector<size_t> const&, QuantType, bool);
-
-template void symmetric_quantize<__nv_bfloat16, __nv_bfloat16>(
- int8_t*, __nv_bfloat16*, __nv_bfloat16 const*, std::vector<size_t> const&, QuantType, bool);
-
-template void symmetric_quantize<__nv_bfloat16, half>(
- int8_t*, __nv_bfloat16*, half const*, std::vector<size_t> const&, QuantType, bool);
-
-template void symmetric_quantize<half, __nv_bfloat16>(
- int8_t*, half*, __nv_bfloat16 const*, std::vector<size_t> const&, QuantType, bool);
-
-template void symmetric_quantize<__nv_bfloat16, float>(
- int8_t*, __nv_bfloat16*, float const*, std::vector<size_t> const&, QuantType, bool);
-
-} // namespace cutlass_kernels
-} // namespace kernels
-} // namespace onnxruntime::llm
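
For orientation, the removed symmetric_quantize computes one scale per output column (per-column max divided by 2^(bits-1)) and then rounds and clamps each element against that scale, exactly as its doc comment describes. A minimal standalone sketch of the int8 path follows; the function name and plain-float interface are illustrative and are not part of the removed API:

// Sketch only: per-column symmetric int8 quantization mirroring the removed routine.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

inline void quantize_columns_int8(const float* w, size_t rows, size_t cols,
                                  std::vector<float>& scales, std::vector<int8_t>& q) {
  scales.assign(cols, 0.f);
  q.assign(rows * cols, 0);
  // Per-column absolute maximum.
  for (size_t i = 0; i < rows; ++i) {
    for (size_t j = 0; j < cols; ++j) {
      scales[j] = std::max(scales[j], std::abs(w[i * cols + j]));
    }
  }
  // Full symmetric range: map the column max onto 2^(8-1) = 128.
  for (size_t j = 0; j < cols; ++j) {
    scales[j] /= 128.f;
  }
  // Round, then clamp to [-128, 127].
  for (size_t i = 0; i < rows; ++i) {
    for (size_t j = 0; j < cols; ++j) {
      const float s = scales[j];
      const float v = (s != 0.f) ? std::round(w[i * cols + j] / s) : 0.f;
      q[i * cols + j] = static_cast<int8_t>(std::max(-128.f, std::min(127.f, v)));
    }
  }
}
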
diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_preprocessors.h b/onnxruntime/contrib_ops/cuda/llm/cutlass_preprocessors.h
deleted file mode 100644
index 3e83852228e24..0000000000000
--- a/onnxruntime/contrib_ops/cuda/llm/cutlass_preprocessors.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-#include <vector>
-
-#include "core/common/common.h"
-
-namespace onnxruntime::llm {
-namespace kernels {
-namespace cutlass_kernels {
-
-enum class QuantType {
- W8_A16,
- W4_A16,
- W4_AFP8
-};
-
-constexpr int get_weight_quant_bits(QuantType quant_type) {
- switch (quant_type) {
- case QuantType::W8_A16:
- return 8;
- case QuantType::W4_A16:
- return 4;
- case QuantType::W4_AFP8:
- return 4;
- default:
- ORT_THROW("Invalid quant_type");
- return -1;
- }
-}
-
-// Shapes here can be 2 or 3D. 2-D shapes are [num_rows, num_cols]
-// 3-D shapes are [num_experts, num_rows, num_cols]
-void permute_B_rows_for_mixed_gemm(int8_t* permuted_quantized_tensor, int8_t const* quantized_tensor,
- std::vector<size_t> const& shape, QuantType quant_type);
-
-void subbyte_transpose(int8_t* transposed_quantized_tensor, int8_t const* quantized_tensor,
- std::vector<size_t> const& shape, QuantType quant_type);
-
-void add_bias_and_interleave_quantized_tensor_inplace(int8_t* tensor, const size_t num_elts, QuantType quant_type);
-
-void preprocess_weights_for_mixed_gemm(int8_t* preprocessed_quantized_weight, int8_t const* row_major_quantized_weight,
- std::vector<size_t> const& shape, QuantType quant_type, bool force_interleave = false);
-
-template <typename ComputeType, typename WeightType>
-void symmetric_quantize(int8_t* processed_quantized_weight, ComputeType* scale_ptr, WeightType const* input_weight_ptr,
- std::vector<size_t> const& shape, QuantType quant_type, bool force_interleave);
-
-// This is exposed so that we can write tests that use the processed weights for CUTLASS but the unprocessed weight
-// to implement a simple reference implementation.
-template <typename ComputeType, typename WeightType>
-void symmetric_quantize(int8_t* processed_quantized_weight, int8_t* unprocessed_quantized_weight,
- ComputeType* scale_ptr, WeightType const* input_weight_ptr, std::vector<size_t> const& shape, QuantType quant_type,
- bool force_interleave);
-
-} // namespace cutlass_kernels
-} // namespace kernels
-} // namespace onnxruntime::llm
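
For reference, a hedged sketch of how a caller would have driven the removed declarations to quantize an fp16 [k, n] weight to packed int4 with per-column scales; k, n, and weights_fp16 are placeholder variables, and buffer sizing follows the header's get_weight_quant_bits contract:

// Illustrative usage of the removed API (not ORT code as-is).
std::vector<size_t> shape = {k, n};
const size_t packed_bytes = k * n * get_weight_quant_bits(QuantType::W4_A16) / 8;  // two int4 values per byte
std::vector<int8_t> processed(packed_bytes);
std::vector<half> scales(n);
symmetric_quantize<half, half>(processed.data(), scales.data(), weights_fp16,
                               shape, QuantType::W4_A16, /*force_interleave=*/false);
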
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int4_gemm_scaleonly.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int4_gemm_scaleonly.cu
new file mode 100644
index 0000000000000..de834db4b7440
--- /dev/null
+++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int4_gemm_scaleonly.cu
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace onnxruntime::llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_BF16
+template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t, cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>;
+#endif
+} // namespace cutlass_kernels
+} // namespace kernels
+} // namespace onnxruntime::llm
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int8_gemm_scaleonly.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int8_gemm_scaleonly.cu
new file mode 100644
index 0000000000000..97c71615ce54d
--- /dev/null
+++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int8_gemm_scaleonly.cu
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace onnxruntime::llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_BF16
+template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t, cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>;
+#endif
+} // namespace cutlass_kernels
+} // namespace kernels
+} // namespace onnxruntime::llm
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int4_gemm_scaleonly.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int4_gemm_scaleonly.cu
new file mode 100644
index 0000000000000..5905f48b9b479
--- /dev/null
+++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int4_gemm_scaleonly.cu
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace onnxruntime::llm {
+namespace kernels {
+namespace cutlass_kernels {
+template class CutlassFpAIntBGemmRunner<half, cutlass::uint4b_t, cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>;
+} // namespace cutlass_kernels
+} // namespace kernels
+} // namespace onnxruntime::llm
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int8_gemm_scaleonly.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int8_gemm_scaleonly.cu
new file mode 100644
index 0000000000000..aa3e984ab2945
--- /dev/null
+++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int8_gemm_scaleonly.cu
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace onnxruntime::llm {
+namespace kernels {
+namespace cutlass_kernels {
+template class CutlassFpAIntBGemmRunner<half, uint8_t, cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>;
+} // namespace cutlass_kernels
+} // namespace kernels
+} // namespace onnxruntime::llm
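
Each of the new *_scaleonly.cu files adds exactly one explicit instantiation, presumably so the CUTLASS compile cost of every (activation type, weight type, quant-op) combination stays isolated in its own translation unit. A hedged sketch of the runner type a kernel would name to hit the fp16 x int4, scale-only path; the alias name is illustrative:

// Illustrative: the runner matching the instantiation in fp16_int4_gemm_scaleonly.cu.
using ScaleOnlyFp16Int4Runner = onnxruntime::llm::kernels::cutlass_kernels::CutlassFpAIntBGemmRunner<
    half, cutlass::uint4b_t, cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>;
ScaleOnlyFp16Int4Runner runner;  // constructed and invoked like the existing scale-and-zeros runners
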
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_1.generated.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_1.generated.cu
index 468d53f336e55..ba513a831b432 100644
--- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_1.generated.cu
+++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_1.generated.cu
@@ -70,6 +70,69 @@ half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGe
);
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<32>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<64>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
@@ -133,6 +196,69 @@ half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGe
);
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<32>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<64>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
cute::Shape, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
@@ -196,6 +322,69 @@ __nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::
);
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<32>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<64>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, uint8_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
cute::Shape, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
@@ -259,6 +448,69 @@ __nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::
);
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, uint8_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const uint8_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, uint8_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<32>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const uint8_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, uint8_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<64>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const uint8_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, uint8_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const uint8_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, uint8_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const uint8_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, uint8_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const uint8_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, uint8_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized> (
+const __nv_bfloat16*, const uint8_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
} // namespace cutlass_kernels
} // namespace kernels
} // namespace onnxruntime::llm
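
The instantiation lines appended above lost their leading template arguments during extraction. For orientation, one of them has roughly the following shape: the activation/weight types, quant op, and kernel/epilogue schedules come from the surrounding lines, while the leading CTA tile extent shown here is only a placeholder:

// Approximate shape of one generated sm90 instantiation; the first tile extent is illustrative.
template void sm90_generic_mixed_gemm_kernelLauncher<half, cutlass::uint4b_t, half, half, half,
    cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
    cute::Shape<cute::Int<64>, cute::Int<16>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>,
    cutlass::gemm::KernelTmaWarpSpecializedPingpong, cutlass::epilogue::TmaWarpSpecialized>(
    const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
    half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*);
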
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_2.generated.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_2.generated.cu
index 0156c83840b09..6c6318fa6c589 100644
--- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_2.generated.cu
+++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_2.generated.cu
@@ -133,6 +133,132 @@ half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGe
);
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<32>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<32>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<64>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<64>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const cutlass::uint4b_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
@@ -259,6 +385,132 @@ half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGe
);
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<32>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<32>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<64>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<64>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<128>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher, cute::Int<256>, cute::Int<64>>, cute::Shape, cute::Int<2>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const half*, const uint8_t*, const half*, const half*, const half*, const float,
+half*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
cute::Shape, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
@@ -385,6 +637,132 @@ __nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::
);
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<16>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<32>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<32>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<64>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape, cute::Int<64>, cute::Int<64>>, cute::Shape, cute::Int<1>, cute::Int<1>>,
+cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative> (
+const __nv_bfloat16*, const cutlass::uint4b_t*, const __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, const float,
+__nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
+);
+
+
+template void sm90_generic_mixed_gemm_kernelLauncher<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16,
+cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY, onnxruntime::llm::cutlass_extensions::EpilogueOpBias,
+cute::Shape