From 47133318497f742d525059618adb0e7e92a345da Mon Sep 17 00:00:00 2001 From: Hien To Date: Tue, 5 Mar 2024 22:16:02 +0700 Subject: [PATCH 01/33] Rebase to rel branch --- 3rdparty/cutlass | 2 +- cpp/CMakeLists.txt | 15 +- cpp/tensorrt_llm/CMakeLists.txt | 4 + cpp/tensorrt_llm/nitro/CMakeLists.txt | 49 +++++ cpp/tensorrt_llm/nitro/install_deps.sh | 3 + cpp/tensorrt_llm/nitro/main.cc | 191 ++++++++++++++++++ .../nitro/nitro_deps/CMakeLists.txt | 108 ++++++++++ 7 files changed, 364 insertions(+), 8 deletions(-) create mode 100644 cpp/tensorrt_llm/nitro/CMakeLists.txt create mode 100644 cpp/tensorrt_llm/nitro/install_deps.sh create mode 100644 cpp/tensorrt_llm/nitro/main.cc create mode 100644 cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt diff --git a/3rdparty/cutlass b/3rdparty/cutlass index 39c6a83f231..8236f30675b 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit 39c6a83f231d6db2bc6b9c251e7add77d68cbfb4 +Subproject commit 8236f30675bbe98f81d11c05764b77bfcb25b8cc diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6ef4b374a4f..dc5e3f0b477 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -29,9 +29,10 @@ project(tensorrt_llm LANGUAGES CXX) # Build options option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON) option(BUILD_PYBIND "Build Python bindings for C++ runtime and batch manager" - ON) -option(BUILD_TESTS "Build Google tests" ON) -option(BUILD_BENCHMARKS "Build benchmarks" ON) + OFF) +option(BUILD_TESTS "Build Google tests" OFF) +option(BUILD_BENCHMARKS "Build benchmarks" OFF) +option(BUILD_NITRO "Build nitro" ON) option(NVTX_DISABLE "Disable all NVTX features" ON) option(WARNING_IS_ERROR "Treat all warnings as errors" OFF) option(FAST_BUILD "Skip compiling some kernels to accelerate compiling" OFF) @@ -129,9 +130,9 @@ endif() # Initialize CMAKE_CUDA_ARCHITECTURES before enabling CUDA if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8") - set(CMAKE_CUDA_ARCHITECTURES 70-real 80-real 86-real 89-real 90-real) + set(CMAKE_CUDA_ARCHITECTURES 89-real) else() - set(CMAKE_CUDA_ARCHITECTURES 70-real 80-real 86-real) + set(CMAKE_CUDA_ARCHITECTURES 89-real) endif() endif() @@ -177,8 +178,8 @@ include_directories( ${3RDPARTY_DIR}/json/include) # TRT dependencies -set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR}) -set_ifndef(TRT_INCLUDE_DIR /usr/include/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu) +set_ifndef(TRT_LIB_DIR /usr/local/tensorrt/lib) +set_ifndef(TRT_INCLUDE_DIR /usr/local/tensorrt/include) set(TRT_LIB nvinfer) find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR}) diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index bcbf107e04a..29583f0f6c9 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -188,3 +188,7 @@ if(BUILD_PYBIND) endif() add_subdirectory(plugins) + +if(BUILD_NITRO) + add_subdirectory(nitro) +endif() \ No newline at end of file diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt new file mode 100644 index 00000000000..ebb6073e485 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & +# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. 
You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# C++17 +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/build_deps/_install) + +message(STATUS "Current Source Directory NITRO: ${CMAKE_CURRENT_SOURCE_DIR}") + +# Enable pkg-config support in CMake +find_package(PkgConfig REQUIRED) + +# Use pkg-config to find the SentencePiece library +pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece) + + +include_directories(${PROJECT_SOURCE_DIR}/include ${SENTENCEPIECE_INCLUDE_DIRS}) + +link_directories(${SENTENCEPIECE_LIBRARY_DIRS}) + +set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..") + +add_custom_target(nitro_proj) + +set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts) +add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts) + +add_executable(nitro main.cc) + +target_link_libraries( + nitro PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts ${SENTENCEPIECE_LIBRARIES}) + +target_compile_features(nitro PRIVATE cxx_std_17) +target_compile_definitions(nitro PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}") + +add_dependencies(nitro_proj nitro) diff --git a/cpp/tensorrt_llm/nitro/install_deps.sh b/cpp/tensorrt_llm/nitro/install_deps.sh new file mode 100644 index 00000000000..30de5afa4e1 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/install_deps.sh @@ -0,0 +1,3 @@ +cmake -S ./nitro_deps -B ./build_deps/nitro_deps +make -C ./build_deps/nitro_deps -j 10 +rm -rf ./build_deps/nitro_deps \ No newline at end of file diff --git a/cpp/tensorrt_llm/nitro/main.cc b/cpp/tensorrt_llm/nitro/main.cc new file mode 100644 index 00000000000..efa387fec3a --- /dev/null +++ b/cpp/tensorrt_llm/nitro/main.cc @@ -0,0 +1,191 @@ +#include "sentencepiece_processor.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/plugins/api/tllmPlugin.h" +#include "tensorrt_llm/runtime/gptJsonConfig.h" +#include "tensorrt_llm/runtime/gptSession.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/memoryCounters.h" +#include "tensorrt_llm/runtime/tllmLogger.h" +#include +#include +#include +#include +#include + +using namespace tensorrt_llm::runtime; + +namespace tc = tensorrt_llm::common; +namespace trt = nvinfer1; + +class Tokenizer +{ +private: + sentencepiece::SentencePieceProcessor processor; + + void replaceSubstring(std::string& base, const std::string& from, const std::string& to) + { + size_t start_pos = 0; + while ((start_pos = base.find(from, start_pos)) != std::string::npos) + { + base.replace(start_pos, from.length(), to); + start_pos += to.length(); + } + } + +public: + Tokenizer(const std::string& modelPath) + { + auto status = processor.Load(modelPath); + if (!status.ok()) + { + std::cerr << status.ToString() << std::endl; + } + } + + std::string decodeWithSpace(const int id) + { + std::string text = processor.IdToPiece(id); + replaceSubstring(text, "▁", " "); + return text; + } + + std::vector encode(const std::string& input) + { + std::vector ids; + processor.Encode(input, &ids); + return ids; + } +}; + +namespace +{ +void runBenchmark() +{ + 
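// Demo setup: load the SentencePiece tokenizer model from the working
+    // directory and encode a fixed prompt; both are hard-coded placeholders.
+   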
Tokenizer nitro_tokenizer("./tokenizer.model"); + std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); + + // Fixed settings + const std::string modelName = "mistral"; + const std::filesystem::path engineDir = "/app/mistral_engine_2/"; + const int batchSize = 1; + const int inputLen = text_input.size(); + const std::vector inOutLen = {inputLen, 500}; // input_length, output_length + + // Logger setup + auto logger = std::make_shared(); + logger->setLevel(nvinfer1::ILogger::Severity::kINFO); + + initTrtLlmPlugins(logger.get()); + + // Load model configuration + std::filesystem::path jsonFileName = engineDir / "config.json"; + auto const json = GptJsonConfig::parse(jsonFileName); + auto const modelConfig = json.getModelConfig(); + auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); + auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); + auto const dtype = modelConfig.getDataType(); + + GptSession::Config sessionConfig{1, 1, 1}; + sessionConfig.maxBatchSize = batchSize; + sessionConfig.maxBeamWidth = 4; // Fixed for simplicity + sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1]; + sessionConfig.cudaGraphMode = false; // Fixed for simplicity + + SamplingConfig samplingConfig{1}; // Fixed for simplicity + samplingConfig.temperature = std::vector{0.0f}; + samplingConfig.randomSeed = std::vector{static_cast(42ull)}; + samplingConfig.topK = std::vector{40}; + samplingConfig.topP = std::vector{0.0f}; + samplingConfig.minLength = std::vector{inOutLen[1]}; + samplingConfig.repetitionPenalty = std::vector{1.3f}; + + // Initialize session + GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; + // Generate random input IDs within the model's vocabulary range + const int vocabSize = modelConfig.getVocabSize(); + std::vector inputIdsHost = text_input; + + std::cout << "Start Nitro testing session: " << std::endl; + // for (auto& id : inputIdsHost) + // { + // id = rand() % vocabSize; // Random token ID within vocabulary range + // std::cout << id << std::endl; + // } + // // Simplified benchmarking process for a single run + // Note: This example does not include input data preparation or output handling for brevity + + // Input preparation + auto& bufferManager = session.getBufferManager(); + GenerationInput::TensorPtr inputIds + = bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); + + std::vector inputLengthsHost(batchSize, inOutLen[0]); + GenerationInput::TensorPtr inputLengths + = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); + + bool inputPacked = modelConfig.usePackedInput(); + + GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked}; + + GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), + bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + + // Define the callback to stream each generated token + generationOutput.onTokenGenerated = [&bufferManager, inOutLen, &nitro_tokenizer]( + GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) + { + if (!finished) + { + // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens + int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape + // Copy output IDs from GPU to host for printing + 
std::vector outputIdsHost(outputLength); + bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); + + // Find the last non-zero value in the output IDs starting from the end of the input sequence + int lastNonZeroIndex = -1; + for (int i = outputLength - 1; i >= inOutLen[0]; --i) + { + if (outputIdsHost[i] != 0) + { + lastNonZeroIndex = i; + break; // Stop at the first non-zero token found from the end + } + } + + // Directly print the last non-zero value if found, without using 'step' + if (lastNonZeroIndex != -1) + { + int outTok = outputIdsHost[lastNonZeroIndex]; + if (outTok == 13) + { + std::cout << "\n"; + } + else + { + std::cout << nitro_tokenizer.decodeWithSpace(outTok); + } + } + } + }; + + session.generate(generationOutput, generationInput, samplingConfig); + bufferManager.getStream().synchronize(); +} + +} // namespace + +int main() +{ + try + { + runBenchmark(); + } + catch (const std::exception& e) + { + std::cerr << "Error: " << e.what() << std::endl; + return 1; + } + return 0; +} \ No newline at end of file diff --git a/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt b/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt new file mode 100644 index 00000000000..c097fcb4b37 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt @@ -0,0 +1,108 @@ +cmake_minimum_required(VERSION 3.22) # Required for FetchContent + +project(MyProject) + +include(ExternalProject) + +# Define variables +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(THIRD_PARTY_INSTALL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../build_deps/_install) +#if(NOT THIRD_PARTY_INSTALL_PATH ) +# message(FATAL_ERROR "TRITON_THIRD_PARTY_INSTALL_PREFIX must be set") +#endif() # TRITON_THIRD_PARTY_INSTALL_PREFIX +# To force the find_package to look for .a inside self installed version +#set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +#set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +#set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) +# +# Add the external project +set(ZLIB_USE_STATIC_LIBS OFF) +find_package(ZLIB) +if(NOT ZLIB_FOUND) + set(ZLIB_USE_STATIC_LIBS ON) + ExternalProject_Add( + zlib + GIT_REPOSITORY https://github.com/madler/zlib.git + GIT_TAG v1.2.11 + CMAKE_ARGS + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} + ) +endif() + +ExternalProject_Add( + brotli + GIT_REPOSITORY https://github.com/google/brotli + GIT_TAG v1.1.0 + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=Release + -DBUILD_SHARED_LIBS=OFF + -DSHARE_INSTALL_PREFIX=${CMAKE_CURRENT_SOURCE_DIR}/share + -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} +) + +ExternalProject_Add( + jsoncpp + GIT_REPOSITORY https://github.com/open-source-parsers/jsoncpp + GIT_TAG 1.9.5 + CMAKE_ARGS + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} +) + +ExternalProject_Add( + c-ares + GIT_REPOSITORY https://github.com/c-ares/c-ares + GIT_TAG cares-1_26_0 + CMAKE_ARGS + -DCARES_SHARED=OFF + -DCARES_STATIC=ON + -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} +) + +ExternalProject_Add( + drogon + GIT_REPOSITORY https://github.com/drogonframework/drogon + GIT_TAG v1.9.2 + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=release + -DOPENSSL_USE_STATIC_LIBS=TRUE + -DZLIB_USE_STATIC_LIBS=${ZLIB_USE_STATIC_LIBS} + -DBUILD_ORM=OFF + -DBUILD_YAML_CONFIG=OFF + -DBUILD_EXAMPLES=OFF + -DBUILD_CTL=OFF + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON + -DBUILD_BROTLI=ON + -DCMAKE_PREFIX_PATH=${THIRD_PARTY_INSTALL_PATH} + # -DCMAKE_FIND_ROOT_PATH=${THIRD_PARTY_INSTALL_PATH} # To set the dir (that will be used to force the look for .a) + 
-DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} +) + +ExternalProject_Add( + sentencepiece + GIT_REPOSITORY https://github.com/google/sentencepiece + GIT_TAG v0.2.0 + CMAKE_ARGS + -DSPM_ENABLE_SHARED=OFF + -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} +) + +# Fix trantor cmakelists to link c-ares on Windows +if(WIN32) + set(TRANTOR_CMAKE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/../build_deps/nitro_deps/drogon-prefix/src/drogon/trantor/CMakeLists.txt) + ExternalProject_Add_Step(drogon trantor_custom_target + COMMAND ${CMAKE_COMMAND} -E echo add_definitions(-DCARES_STATICLIB) >> ${TRANTOR_CMAKE_FILE} + DEPENDEES download + ) +endif() + +include_directories(${THIRD_PARTY_INSTALL_PATH}/include) +link_directories(${THIRD_PARTY_INSTALL_PATH}/lib) +# Optionally link or add dependencies to your targets +add_dependencies(drogon c-ares jsoncpp brotli) + +if(ZLIB_USE_STATIC_LIBS) + add_dependencies(drogon zlib) +endif() +# target_link_libraries( ...) \ No newline at end of file From 44a60b3a005959754be99cde21f9f012ed5a9f27 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Wed, 6 Mar 2024 07:38:49 +0000 Subject: [PATCH 02/33] only build release --- cpp/CMakeLists.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index dc5e3f0b477..37adf9dd9f3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -17,6 +17,7 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_BUILD_TYPE Release) include(CheckLanguage) include(cmake/modules/set_ifndef.cmake) @@ -45,12 +46,7 @@ else() message(STATUS "NVTX is enabled") endif() -if(EXISTS - "${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/batch_manager/CMakeLists.txt") - set(BUILD_BATCH_MANAGER_DEFAULT ON) -else() - set(BUILD_BATCH_MANAGER_DEFAULT OFF) -endif() +set(BUILD_BATCH_MANAGER_DEFAULT OFF) option(BUILD_BATCH_MANAGER "Build batch manager from source" ${BUILD_BATCH_MANAGER_DEFAULT}) From e015bcf849cd1fdea897c0cbdb3898741c674a48 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Wed, 6 Mar 2024 07:39:35 +0000 Subject: [PATCH 03/33] upgrade nitro --- cpp/tensorrt_llm/nitro/CMakeLists.txt | 53 ++++- .../nitro/controllers/tensorrtllm.cc | 78 +++++++ .../nitro/controllers/tensorrtllm.h | 133 ++++++++++++ cpp/tensorrt_llm/nitro/install_deps.sh | 2 +- cpp/tensorrt_llm/nitro/main.cc | 200 +----------------- .../nitro/nitro_deps/CMakeLists.txt | 2 +- cpp/tensorrt_llm/nitro/test.cc | 181 ++++++++++++++++ 7 files changed, 456 insertions(+), 193 deletions(-) create mode 100644 cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc create mode 100644 cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h mode change 100644 => 100755 cpp/tensorrt_llm/nitro/install_deps.sh create mode 100644 cpp/tensorrt_llm/nitro/test.cc diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt index ebb6073e485..6aac914bbb5 100644 --- a/cpp/tensorrt_llm/nitro/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -13,15 +13,38 @@ # License for the specific language governing permissions and limitations under # the License. 
 # C++17
+# Nitro init
+include(CheckIncludeFileCXX)
+
+check_include_file_cxx(any HAS_ANY)
+check_include_file_cxx(string_view HAS_STRING_VIEW)
+check_include_file_cxx(coroutine HAS_COROUTINE)
+if(HAS_ANY
+   AND HAS_STRING_VIEW
+   AND HAS_COROUTINE)
+  set(CMAKE_CXX_STANDARD 20)
+elseif(HAS_ANY AND HAS_STRING_VIEW)
+  set(CMAKE_CXX_STANDARD 17)
+else()
+  set(CMAKE_CXX_STANDARD 14)
+endif()
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/build_deps/_install)

 message(STATUS "Current Source Directory NITRO: ${CMAKE_CURRENT_SOURCE_DIR}")
+message(STATUS "Current Cmake Prefix Path of NITRO: ${CMAKE_PREFIX_PATH}")
+
+set(OPENSSL_USE_STATIC_LIBS TRUE)
+
 # Enable pkg-config support in CMake
 find_package(PkgConfig REQUIRED)
+find_package(Drogon CONFIG REQUIRED)

 # Use pkg-config to find the SentencePiece library
 pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece)
@@ -38,12 +61,40 @@ add_custom_target(nitro_proj)
 set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts)
 add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)

+# main
 add_executable(nitro main.cc)

 target_link_libraries(
-  nitro PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts ${SENTENCEPIECE_LIBRARIES})
+  nitro
+  PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece
+  PRIVATE Drogon::Drogon ${CMAKE_THREAD_LIBS_INIT})

 target_compile_features(nitro PRIVATE cxx_std_17)
 target_compile_definitions(nitro PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")
+
+aux_source_directory(controllers CTL_SRC)
+aux_source_directory(common COMMON_SRC)
+aux_source_directory(context CONTEXT_SRC)
+aux_source_directory(models MODEL_SRC)
+
+target_include_directories(nitro PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+# ${CMAKE_CURRENT_SOURCE_DIR}/models)
+target_sources(nitro PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC})
+
+# test
+add_executable(test test.cc)
+
+target_link_libraries(
+  test PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece)
+
+target_compile_features(test PRIVATE cxx_std_17)
+target_compile_definitions(test PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")
+
 add_dependencies(nitro_proj nitro)
diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc
new file mode 100644
index 00000000000..79c0ad43ef5
--- /dev/null
+++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc
@@ -0,0 +1,78 @@
+#include "tensorrtllm.h"
+#include <iostream>
+#include <vector>
+
+void tensorrtllm::chat_completion(
+    const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)>&& callback) const
+{
+    std::vector<int> text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n ");
+    const int inputLen = text_input.size();
+    const std::vector<int> inOutLen = {inputLen, 500}; // input_length, output_length
+
+    const int batchSize = 1;
+
+    std::vector<int> inputIdsHost = text_input;
+
+    std::cout << "Start Nitro testing session: " << std::endl;
+    // Input preparation
+    auto& bufferManager = gptSession->getBufferManager();
+    GenerationInput::TensorPtr inputIds
+        = bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU);
+
+    std::vector<int> inputLengthsHost(batchSize, inOutLen[0]);
+    GenerationInput::TensorPtr inputLengths
+        = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU);
+
+    bool inputPacked = modelConfig->usePackedInput();
+
+    GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked};
+
+    GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32),
+        bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)};
+    // Define the callback to stream each generated token
+    generationOutput.onTokenGenerated = [&bufferManager, inOutLen, this, &generationOutput](
+                                            GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished)
+    {
+        if (!finished)
+        {
+            // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens
+            int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape
+            // Copy output IDs from GPU to host for printing
+            std::vector<int32_t> outputIdsHost(outputLength);
+            bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU);
+            // Find the last non-zero value in the output IDs starting from the end of the input sequence
+            int lastNonZeroIndex = -1;
+            for (int i = outputLength - 1; i >= inOutLen[0]; --i)
+            {
+                if (outputIdsHost[i] != 0)
+                {
+                    lastNonZeroIndex = i;
+                    break; // Stop at the first non-zero token found from the end
+                }
+            }
+
+            // Directly print the last non-zero value if found, without using 'step'
+            if (lastNonZeroIndex != -1)
+            {
+                int outTok = outputIdsHost[lastNonZeroIndex];
+                if (outTok == 13)
+                {
+                    std::cout<<"\n" <<std::flush;
+                }
+                else
+                {
+                    std::cout<< this->nitro_tokenizer.decodeWithSpace(outTok) <<std::flush;
+                }
+            }
+        }
+    };
+
+    gptSession->generate(generationOutput, generationInput, samplingConfig);
+
+    bufferManager.getStream().synchronize();
+
+    LOG_INFO << "Hello world";
+    return;
+};
+
+// Add definition of your processing function here
diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
new file mode 100644
index 00000000000..7f5b0c15a03
--- /dev/null
+++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
@@ -0,0 +1,133 @@
+#pragma once
+
+#include "sentencepiece_processor.h"
+#include <drogon/HttpController.h>
+
+#include "sentencepiece_processor.h"
+#include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/memoryUtils.h"
+#include "tensorrt_llm/plugins/api/tllmPlugin.h"
+#include "tensorrt_llm/runtime/gptJsonConfig.h"
+#include "tensorrt_llm/runtime/gptModelConfig.h"
+#include "tensorrt_llm/runtime/gptSession.h"
+#include "tensorrt_llm/runtime/iTensor.h"
+#include "tensorrt_llm/runtime/memoryCounters.h"
+#include "tensorrt_llm/runtime/samplingConfig.h"
+#include "tensorrt_llm/runtime/tllmLogger.h"
+#include "thread"
+#include <cstdint>
+#include <filesystem>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace drogon;
+
+using namespace tensorrt_llm::runtime;
+
+class Tokenizer
+{
+private:
+    sentencepiece::SentencePieceProcessor processor;
+
+    void replaceSubstring(std::string& base, const std::string& from, const std::string& to) const
+    {
+        size_t start_pos = 0;
+        while ((start_pos = base.find(from, start_pos)) != std::string::npos)
+        {
+            base.replace(start_pos, from.length(), to);
+            start_pos += to.length();
+        }
+    }
+
+public:
+    Tokenizer(const std::string& modelPath)
+    {
+        auto status = processor.Load(modelPath);
+        if (!status.ok())
+        {
+            std::cerr << status.ToString() << std::endl;
+        }
+        LOG_INFO << "Successfully loaded the tokenizer";
+    }
+
+    std::string decodeWithSpace(const int id) const
+    {
+        std::string text = processor.IdToPiece(id);
+        replaceSubstring(text, "▁", " ");
+        return text;
+    }
+
+    std::vector<int> encode(const std::string& input) const
+    {
+        std::vector<int> ids;
+        processor.Encode(input, &ids);
+        return ids;
+    }
+};
+
+class tensorrtllm : public drogon::HttpController<tensorrtllm>
+{
+public:
+    tensorrtllm()
+    {
+        std::vector<int> text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n ");
+        const int inputLen = text_input.size();
+        const std::vector<int> inOutLen = {inputLen, 500}; // input_length, output_length
+
+        logger = std::make_shared<TllmLogger>();
+        logger->setLevel(nvinfer1::ILogger::Severity::kINFO);
+        // Fixed settings
+        const std::string modelName = "mistral";
+        const std::filesystem::path engineDir = "/app/mistral_engine_2/";
+        const int batchSize = 1;
+        initTrtLlmPlugins(logger.get());
+        // Load model configuration
+        std::filesystem::path jsonFileName = engineDir / "config.json";
+        auto const json = GptJsonConfig::parse(jsonFileName);
+        auto config = json.getModelConfig();
+        modelConfig = std::make_unique<GptModelConfig>(config);
+        auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism());
+        auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName);
+        auto const dtype = modelConfig->getDataType();
+
+        // Set GptSession config
+        sessionConfig.maxBatchSize = batchSize;
+        sessionConfig.maxBeamWidth = 4; // Fixed for simplicity
+        sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1];
+        sessionConfig.cudaGraphMode = false; // Fixed for simplicity
+
+        // Set sampling config
+        samplingConfig.temperature = std::vector<float>{0.0f};
+        samplingConfig.randomSeed = std::vector<uint64_t>{static_cast<uint64_t>(42ull)};
+        samplingConfig.topK = std::vector<SizeType>{40};
+        samplingConfig.topP = std::vector<float>{0.0f};
+        samplingConfig.minLength = std::vector<SizeType>{inOutLen[1]};
+        samplingConfig.repetitionPenalty = std::vector<float>{1.3f};
+
+        gptSession
+            = std::make_unique<GptSession>(sessionConfig, *modelConfig, worldConfig, enginePath.string(), logger);
+    };
+
+    METHOD_LIST_BEGIN
+    // use METHOD_ADD to add your custom processing function here;
+    // METHOD_ADD(tensorrtllm::get, "/{2}/{1}", Get); // path is /tensorrtllm/{arg2}/{arg1}
+    // METHOD_ADD(tensorrtllm::your_method_name, "/{1}/{2}/list", Get); // path is /tensorrtllm/{arg1}/{arg2}/list
+    ADD_METHOD_TO(tensorrtllm::chat_completion, "/testing", Get); // path is
+                                                                  // /absolute/path/{arg1}/{arg2}/list
+
+    METHOD_LIST_END
+    // your declaration of processing function maybe like this:
+    // void get(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)> &&callback, int p1, std::string
+    // p2);
+    void chat_completion(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)>&& callback) const;
+
+private:
+    GptSession::Config sessionConfig{1, 1, 1};
+    SamplingConfig samplingConfig{1};
+    std::unique_ptr<GptModelConfig> modelConfig;
+    Tokenizer nitro_tokenizer{"./tokenizer.model"};
+    std::unique_ptr<GptSession> gptSession;
+    std::shared_ptr<TllmLogger> logger;
+};
diff --git a/cpp/tensorrt_llm/nitro/install_deps.sh b/cpp/tensorrt_llm/nitro/install_deps.sh
old mode 100644
new mode 100755
index 30de5afa4e1..d43257aa08e
--- a/cpp/tensorrt_llm/nitro/install_deps.sh
+++ b/cpp/tensorrt_llm/nitro/install_deps.sh
@@ -1,3 +1,3 @@
 cmake -S ./nitro_deps -B ./build_deps/nitro_deps
 make -C ./build_deps/nitro_deps -j 10
-rm -rf ./build_deps/nitro_deps
\ No newline at end of file
+rm -rf ./build_deps/nitro_deps
diff --git a/cpp/tensorrt_llm/nitro/main.cc b/cpp/tensorrt_llm/nitro/main.cc
index efa387fec3a..97c7ddba686 100644
--- a/cpp/tensorrt_llm/nitro/main.cc
+++ b/cpp/tensorrt_llm/nitro/main.cc
@@ -1,191 +1,11 @@
-#include "sentencepiece_processor.h" -#include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" -#include "tensorrt_llm/plugins/api/tllmPlugin.h" -#include "tensorrt_llm/runtime/gptJsonConfig.h" -#include "tensorrt_llm/runtime/gptSession.h" -#include "tensorrt_llm/runtime/iTensor.h" -#include
"tensorrt_llm/runtime/memoryCounters.h" -#include "tensorrt_llm/runtime/tllmLogger.h" -#include -#include -#include -#include -#include - -using namespace tensorrt_llm::runtime; - -namespace tc = tensorrt_llm::common; -namespace trt = nvinfer1; - -class Tokenizer -{ -private: - sentencepiece::SentencePieceProcessor processor; - - void replaceSubstring(std::string& base, const std::string& from, const std::string& to) - { - size_t start_pos = 0; - while ((start_pos = base.find(from, start_pos)) != std::string::npos) - { - base.replace(start_pos, from.length(), to); - start_pos += to.length(); - } - } - -public: - Tokenizer(const std::string& modelPath) - { - auto status = processor.Load(modelPath); - if (!status.ok()) - { - std::cerr << status.ToString() << std::endl; - } - } - - std::string decodeWithSpace(const int id) - { - std::string text = processor.IdToPiece(id); - replaceSubstring(text, "▁", " "); - return text; - } - - std::vector encode(const std::string& input) - { - std::vector ids; - processor.Encode(input, &ids); - return ids; - } -}; - -namespace -{ -void runBenchmark() -{ - Tokenizer nitro_tokenizer("./tokenizer.model"); - std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); - - // Fixed settings - const std::string modelName = "mistral"; - const std::filesystem::path engineDir = "/app/mistral_engine_2/"; - const int batchSize = 1; - const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 500}; // input_length, output_length - - // Logger setup - auto logger = std::make_shared(); - logger->setLevel(nvinfer1::ILogger::Severity::kINFO); - - initTrtLlmPlugins(logger.get()); - - // Load model configuration - std::filesystem::path jsonFileName = engineDir / "config.json"; - auto const json = GptJsonConfig::parse(jsonFileName); - auto const modelConfig = json.getModelConfig(); - auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); - auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); - auto const dtype = modelConfig.getDataType(); - - GptSession::Config sessionConfig{1, 1, 1}; - sessionConfig.maxBatchSize = batchSize; - sessionConfig.maxBeamWidth = 4; // Fixed for simplicity - sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1]; - sessionConfig.cudaGraphMode = false; // Fixed for simplicity - - SamplingConfig samplingConfig{1}; // Fixed for simplicity - samplingConfig.temperature = std::vector{0.0f}; - samplingConfig.randomSeed = std::vector{static_cast(42ull)}; - samplingConfig.topK = std::vector{40}; - samplingConfig.topP = std::vector{0.0f}; - samplingConfig.minLength = std::vector{inOutLen[1]}; - samplingConfig.repetitionPenalty = std::vector{1.3f}; - - // Initialize session - GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; - // Generate random input IDs within the model's vocabulary range - const int vocabSize = modelConfig.getVocabSize(); - std::vector inputIdsHost = text_input; - - std::cout << "Start Nitro testing session: " << std::endl; - // for (auto& id : inputIdsHost) - // { - // id = rand() % vocabSize; // Random token ID within vocabulary range - // std::cout << id << std::endl; - // } - // // Simplified benchmarking process for a single run - // Note: This example does not include input data preparation or output handling for brevity - - // Input preparation - auto& bufferManager = session.getBufferManager(); - GenerationInput::TensorPtr inputIds - = 
bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); - - std::vector inputLengthsHost(batchSize, inOutLen[0]); - GenerationInput::TensorPtr inputLengths - = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); - - bool inputPacked = modelConfig.usePackedInput(); - - GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked}; - - GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), - bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; - - // Define the callback to stream each generated token - generationOutput.onTokenGenerated = [&bufferManager, inOutLen, &nitro_tokenizer]( - GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) - { - if (!finished) - { - // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens - int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape - // Copy output IDs from GPU to host for printing - std::vector outputIdsHost(outputLength); - bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); - - // Find the last non-zero value in the output IDs starting from the end of the input sequence - int lastNonZeroIndex = -1; - for (int i = outputLength - 1; i >= inOutLen[0]; --i) - { - if (outputIdsHost[i] != 0) - { - lastNonZeroIndex = i; - break; // Stop at the first non-zero token found from the end - } - } - - // Directly print the last non-zero value if found, without using 'step' - if (lastNonZeroIndex != -1) - { - int outTok = outputIdsHost[lastNonZeroIndex]; - if (outTok == 13) - { - std::cout << "\n"; - } - else - { - std::cout << nitro_tokenizer.decodeWithSpace(outTok); - } - } - } - }; - - session.generate(generationOutput, generationInput, samplingConfig); - bufferManager.getStream().synchronize(); -} - -} // namespace - -int main() -{ - try - { - runBenchmark(); - } - catch (const std::exception& e) - { - std::cerr << "Error: " << e.what() << std::endl; - return 1; - } +#include +int main() { + //Set HTTP listener address and port + drogon::app().addListener("0.0.0.0", 5555); + //Load config file + //drogon::app().loadConfigFile("../config.json"); + //drogon::app().loadConfigFile("../config.yaml"); + //Run HTTP framework,the method will block in the internal event loop + drogon::app().run(); return 0; -} \ No newline at end of file +} diff --git a/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt b/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt index c097fcb4b37..cd0d76a719e 100644 --- a/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt @@ -105,4 +105,4 @@ add_dependencies(drogon c-ares jsoncpp brotli) if(ZLIB_USE_STATIC_LIBS) add_dependencies(drogon zlib) endif() -# target_link_libraries( ...) \ No newline at end of file +# target_link_libraries( ...) 
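Note on the rewritten main.cc above: it never names the tensorrtllm controller because Drogon HttpController subclasses self-register their routes (through the METHOD_LIST_BEGIN / ADD_METHOD_TO / METHOD_LIST_END block) at static-initialization time, so linking the controller into the binary is enough for app().run() to serve GET /testing on port 5555. A minimal self-contained sketch of that pattern — the Hello class, the /hello path, and the response body are illustrative, not part of this patch:

#include <drogon/HttpController.h>
#include <drogon/drogon.h>
#include <functional>

// Illustrative only: any HttpController subclass linked into the binary is
// auto-registered, so main() just starts the event loop and routing works.
class Hello : public drogon::HttpController<Hello>
{
public:
    METHOD_LIST_BEGIN
    ADD_METHOD_TO(Hello::greet, "/hello", drogon::Get);
    METHOD_LIST_END

    void greet(const drogon::HttpRequestPtr& req,
        std::function<void(const drogon::HttpResponsePtr&)>&& callback) const
    {
        auto resp = drogon::HttpResponse::newHttpResponse();
        resp->setBody("hello from a self-registered controller");
        callback(resp);
    }
};

int main()
{
    drogon::app().addListener("0.0.0.0", 8080).run();
    return 0;
}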
diff --git a/cpp/tensorrt_llm/nitro/test.cc b/cpp/tensorrt_llm/nitro/test.cc new file mode 100644 index 00000000000..b3e0dd754e6 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/test.cc @@ -0,0 +1,181 @@ +#include "sentencepiece_processor.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/plugins/api/tllmPlugin.h" +#include "tensorrt_llm/runtime/gptJsonConfig.h" +#include "tensorrt_llm/runtime/gptSession.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/memoryCounters.h" +#include "tensorrt_llm/runtime/tllmLogger.h" +#include "thread" +#include +#include +#include +#include +#include +using namespace tensorrt_llm::runtime; + +namespace tc = tensorrt_llm::common; +namespace trt = nvinfer1; + +class Tokenizer +{ +private: + sentencepiece::SentencePieceProcessor processor; + + void replaceSubstring(std::string& base, const std::string& from, const std::string& to) + { + size_t start_pos = 0; + while ((start_pos = base.find(from, start_pos)) != std::string::npos) + { + base.replace(start_pos, from.length(), to); + start_pos += to.length(); + } + } + +public: + Tokenizer(const std::string& modelPath) + { + auto status = processor.Load(modelPath); + if (!status.ok()) + { + std::cerr << status.ToString() << std::endl; + } + } + + std::string decodeWithSpace(const int id) + { + std::string text = processor.IdToPiece(id); + replaceSubstring(text, "▁", " "); + return text; + } + + std::vector encode(const std::string& input) + { + std::vector ids; + processor.Encode(input, &ids); + return ids; + } +}; + +namespace +{ +void runBenchmark() +{ + Tokenizer nitro_tokenizer("./tokenizer.model"); + std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); + + // Fixed settings + const std::string modelName = "mistral"; + const std::filesystem::path engineDir = "/app/mistral_engine_2/"; + const int batchSize = 1; + const int inputLen = text_input.size(); + const std::vector inOutLen = {inputLen, 500}; // input_length, output_length + + // Logger setup + auto logger = std::make_shared(); + logger->setLevel(nvinfer1::ILogger::Severity::kINFO); + + initTrtLlmPlugins(logger.get()); + + // Load model configuration + std::filesystem::path jsonFileName = engineDir / "config.json"; + auto const json = GptJsonConfig::parse(jsonFileName); + auto const modelConfig = json.getModelConfig(); + auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); + auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); + auto const dtype = modelConfig.getDataType(); + + GptSession::Config sessionConfig{1, 1, 1}; + sessionConfig.maxBatchSize = batchSize; + sessionConfig.maxBeamWidth = 4; // Fixed for simplicity + sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1]; + sessionConfig.cudaGraphMode = false; // Fixed for simplicity + + SamplingConfig samplingConfig{1}; // Fixed for simplicity + samplingConfig.temperature = std::vector{0.0f}; + samplingConfig.randomSeed = std::vector{static_cast(42ull)}; + samplingConfig.topK = std::vector{40}; + samplingConfig.topP = std::vector{0.0f}; + samplingConfig.minLength = std::vector{inOutLen[1]}; + samplingConfig.repetitionPenalty = std::vector{1.3f}; + + // Initialize session + GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; + // Generate random input IDs within the model's vocabulary range + std::vector inputIdsHost = text_input; + + std::cout << 
"Start Nitro testing session: " << std::endl; + // Input preparation + auto& bufferManager = session.getBufferManager(); + GenerationInput::TensorPtr inputIds + = bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); + + std::vector inputLengthsHost(batchSize, inOutLen[0]); + GenerationInput::TensorPtr inputLengths + = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); + + bool inputPacked = modelConfig.usePackedInput(); + + GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked}; + + GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), + bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + // Define the callback to stream each generated token + generationOutput.onTokenGenerated = [&bufferManager, inOutLen, &nitro_tokenizer, &generationOutput]( + GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) + { + if (!finished) + { + // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens + int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape + // Copy output IDs from GPU to host for printing + std::vector outputIdsHost(outputLength); + bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); + // Find the last non-zero value in the output IDs starting from the end of the input sequence + int lastNonZeroIndex = -1; + for (int i = outputLength - 1; i >= inOutLen[0]; --i) + { + if (outputIdsHost[i] != 0) + { + lastNonZeroIndex = i; + break; // Stop at the first non-zero token found from the end + } + } + + // Directly print the last non-zero value if found, without using 'step' + if (lastNonZeroIndex != -1) + { + int outTok = outputIdsHost[lastNonZeroIndex]; + if (outTok == 13) + { + std::cout << "\n"; + } + else + { + std::cout << nitro_tokenizer.decodeWithSpace(outTok); + } + } + } + }; + + session.generate(generationOutput, generationInput, samplingConfig); + bufferManager.getStream().synchronize(); +} + +} // namespace + +int main() +{ + try + { + runBenchmark(); + std::this_thread::sleep_for(std::chrono::seconds(10)); + } + catch (const std::exception& e) + { + std::cerr << "Error: " << e.what() << std::endl; + return 1; + } + return 0; +} From 97010fdafd2ea12acc2d9cb055e7536cb848d6b3 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Wed, 6 Mar 2024 07:57:48 +0000 Subject: [PATCH 04/33] better example --- cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc | 8 ++++---- cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index 79c0ad43ef5..db8b24c609a 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -5,9 +5,9 @@ void tensorrtllm::chat_completion( const HttpRequestPtr& req, std::function&& callback) const { - std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); + std::vector text_input = nitro_tokenizer.encode(example_string); const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 500}; // input_length, output_length + const std::vector inOutLen = {inputLen, 1500}; // input_length, output_length const int batchSize = 1; @@ -57,11 +57,11 @@ void tensorrtllm::chat_completion( int 
outTok = outputIdsHost[lastNonZeroIndex]; if (outTok == 13) { - std::cout<<"\n" <nitro_tokenizer.decodeWithSpace(outTok) <nitro_tokenizer.decodeWithSpace(outTok) << std::flush; } } } diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h index 7f5b0c15a03..577a19dccae 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h @@ -72,15 +72,15 @@ class tensorrtllm : public drogon::HttpController public: tensorrtllm() { - std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); + std::vector text_input = nitro_tokenizer.encode(example_string); const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 500}; // input_length, output_length + const std::vector inOutLen = {inputLen, 1500}; // input_length, output_length logger = std::make_shared(); logger->setLevel(nvinfer1::ILogger::Severity::kINFO); // Fixed settings const std::string modelName = "mistral"; - const std::filesystem::path engineDir = "/app/mistral_engine_2/"; + const std::filesystem::path engineDir = "/app/mistral_engine_3/"; const int batchSize = 1; initTrtLlmPlugins(logger.get()); // Load model configuration @@ -130,4 +130,5 @@ class tensorrtllm : public drogon::HttpController Tokenizer nitro_tokenizer{"./tokenizer.model"}; std::unique_ptr gptSession; std::shared_ptr logger; + std::string example_string{"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nPlease tell me a long and sad story<|im_end|>\n<|im_start|>assistant"}; }; From 61f908c25474062e601818dc6c5c238a712d19aa Mon Sep 17 00:00:00 2001 From: automaticcat Date: Thu, 7 Mar 2024 05:17:58 +0000 Subject: [PATCH 05/33] latest demo --- .../nitro/controllers/tensorrtllm.cc | 89 ++++++++++++++----- 1 file changed, 65 insertions(+), 24 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index db8b24c609a..bb4bef870d1 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -1,36 +1,69 @@ #include "tensorrtllm.h" +#include +#include #include +#include #include +#include -void tensorrtllm::chat_completion( - const HttpRequestPtr& req, std::function&& callback) const +void removeZeroes(std::vector& vec) { + vec.erase(std::remove(vec.begin(), vec.end(), 0), vec.end()); +} + +struct inferenceState +{ + int prevPos{0}; + bool isFinished; +}; + +void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function&& callback) +{ + std::shared_ptr inferState = std::make_shared(); + std::vector text_input = nitro_tokenizer.encode(example_string); const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 1500}; // input_length, output_length + const std::vector inOutLen = {inputLen, 2000}; // input_length, output_length const int batchSize = 1; std::vector inputIdsHost = text_input; std::cout << "Start Nitro testing session: " << std::endl; - // Input preparation + auto& bufferManager = gptSession->getBufferManager(); + // Make stopwordlists + + // Your stop word single token "32000" + std::vector stopWordsTokens = {32000, -1, 1, -1}; // Extend with -1 for increased length + + // Tensor creation for stopWordsList + // Assuming the framework allows similar operations for creating custom tensors + // At this point, + // Input preparation GenerationInput::TensorPtr inputIds = bufferManager.copyFrom(inputIdsHost, 
ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); std::vector inputLengthsHost(batchSize, inOutLen[0]); GenerationInput::TensorPtr inputLengths = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); - bool inputPacked = modelConfig->usePackedInput(); - GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked}; + GenerationInput generationInput{0, 32000, inputIds, inputLengths, inputPacked}; + // generationInput.stopWordsList = stopWordsTokensTensor; + + generationInput.stopWordsList = bufferManager.copyFrom(stopWordsTokens, ITensor::makeShape({2,2}), MemoryType::kGPU); + generationInput.stopWordsList->reshape(ITensor::makeShape({1,2,2})); + + LOG_INFO << "here is the shape: " << generationInput.stopWordsList->getShape().d[0]; + + LOG_INFO << "here is the shape: " << generationInput.stopWordsList->getShape().d[1]; GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + // Define the callback to stream each generated token - generationOutput.onTokenGenerated = [&bufferManager, inOutLen, this, &generationOutput]( + generationOutput.onTokenGenerated = [&inferState, &bufferManager, inOutLen, this, &generationOutput]( GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) { if (!finished) @@ -41,29 +74,33 @@ void tensorrtllm::chat_completion( std::vector outputIdsHost(outputLength); bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); // Find the last non-zero value in the output IDs starting from the end of the input sequence - int lastNonZeroIndex = -1; - for (int i = outputLength - 1; i >= inOutLen[0]; --i) + std::vector outputIdsHostDecode(outputIdsHost.begin() + inOutLen[0], outputIdsHost.end()); + removeZeroes(outputIdsHostDecode); + std::string text = nitro_tokenizer.decode(outputIdsHostDecode); + + if (inferState->prevPos > 0 && inferState->prevPos < text.size()) { - if (outputIdsHost[i] != 0) - { - lastNonZeroIndex = i; - break; // Stop at the first non-zero token found from the end - } + // Valid prevPos, proceed with slicing the string from prevPos to the end + std::string stringTok(text.begin() + inferState->prevPos, text.end()); + std::cout << stringTok << std::flush; } - - // Directly print the last non-zero value if found, without using 'step' - if (lastNonZeroIndex != -1) + else if (inferState->prevPos >= text.size()) { - int outTok = outputIdsHost[lastNonZeroIndex]; - if (outTok == 13) - { - std::cout << "\n" << std::flush; - } - else + // prevPos is out of bounds, indicating there might be no new text or an error in logic + // You can handle this case as needed, for example, by logging a warning + inferState->prevPos = text.size(); + } + else + { + // inferState->prevPos is 0 or negative, indicating a potential logic error or initial state + // If there's valid text, you might want to print it all or handle this case specifically + if (!text.empty()) { - std::cout << this->nitro_tokenizer.decodeWithSpace(outTok) << std::flush; + std::cout << text << std::flush; // Optionally print all text if it's the initial state } } + // Update prevPos to the new length of the text for the next iteration + inferState->prevPos = text.size(); } }; @@ -71,6 +108,10 @@ void tensorrtllm::chat_completion( bufferManager.getStream().synchronize(); + auto resp=HttpResponse::newHttpResponse(); + resp->setStatusCode(k200OK); + resp->setBody("Your Page Contents"); + 
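// NOTE: generation above runs to completion before this plain 200 response
+    // is returned; token-by-token streaming arrives in the later "streaming
+    // working checkpoint" patch.
+   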
callback(resp);
     LOG_INFO << "Hello world";
     return;
 };

From 636b6897a005959754be99cde21f9f012ed5a9f27 Mon Sep 17 00:00:00 2001
From: automaticcat
Date: Thu, 7 Mar 2024 05:18:08 +0000
Subject: [PATCH 06/33] latest demo

---
 .../nitro/controllers/tensorrtllm.h           | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
index 577a19dccae..ee7b234d714 100644
--- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
+++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
@@ -1,6 +1,7 @@
 #pragma once

 #include "sentencepiece_processor.h"
+#include
 #include <drogon/HttpController.h>

 #include "sentencepiece_processor.h"
@@ -31,7 +32,7 @@ class Tokenizer
 private:
     sentencepiece::SentencePieceProcessor processor;

-    void replaceSubstring(std::string& base, const std::string& from, const std::string& to) const
+    void replaceSubstring(std::string& base, const std::string& from, const std::string& to)
     {
         size_t start_pos = 0;
         while ((start_pos = base.find(from, start_pos)) != std::string::npos)
@@ -52,14 +53,20 @@ class Tokenizer
         LOG_INFO << "Successfully loaded the tokenizer";
     }

-    std::string decodeWithSpace(const int id) const
+    std::string decodeWithSpace(const int id)
     {
         std::string text = processor.IdToPiece(id);
         replaceSubstring(text, "▁", " ");
         return text;
     }

-    std::vector<int> encode(const std::string& input) const
+    std::string decode(const std::vector<int> ids)
+    {
+        std::string text = processor.DecodeIds(ids);
+        return text;
+    }
+
+    std::vector<int> encode(const std::string& input)
     {
         std::vector<int> ids;
         processor.Encode(input, &ids);
@@ -74,7 +81,7 @@ class tensorrtllm : public drogon::HttpController<tensorrtllm>
 public:
     tensorrtllm()
     {
         std::vector<int> text_input = nitro_tokenizer.encode(example_string);
         const int inputLen = text_input.size();
-        const std::vector<int> inOutLen = {inputLen, 1500}; // input_length, output_length
+        const std::vector<int> inOutLen = {inputLen, 2000}; // input_length, output_length

         logger = std::make_shared<TllmLogger>();
         logger->setLevel(nvinfer1::ILogger::Severity::kINFO);
@@ -94,7 +101,7 @@ class tensorrtllm : public drogon::HttpController<tensorrtllm>

         // Set GptSession config
         sessionConfig.maxBatchSize = batchSize;
-        sessionConfig.maxBeamWidth = 4; // Fixed for simplicity
+        sessionConfig.maxBeamWidth = 1; // Fixed for simplicity
         sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1];
         sessionConfig.cudaGraphMode = false; // Fixed for simplicity
@@ -105,7 +112,6 @@ class tensorrtllm : public drogon::HttpController<tensorrtllm>
         samplingConfig.topP = std::vector<float>{0.0f};
         samplingConfig.minLength = std::vector<SizeType>{inOutLen[1]};
         samplingConfig.repetitionPenalty = std::vector<float>{1.3f};
-
         gptSession
             = std::make_unique<GptSession>(sessionConfig, *modelConfig, worldConfig, enginePath.string(), logger);
     };
@@ -121,14 +127,15 @@ class tensorrtllm : public drogon::HttpController<tensorrtllm>
     // your declaration of processing function maybe like this:
     // void get(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)> &&callback, int p1, std::string
     // p2);
-    void chat_completion(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)>&& callback) const;
+    void chat_completion(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)>&& callback);

 private:
     GptSession::Config sessionConfig{1, 1, 1};
     SamplingConfig samplingConfig{1};
     std::unique_ptr<GptModelConfig> modelConfig;
-    Tokenizer nitro_tokenizer{"./tokenizer.model"};
+    Tokenizer nitro_tokenizer{"./new_chatml_tokenizer.model"};
     std::unique_ptr<GptSession> gptSession;
     std::shared_ptr<TllmLogger> logger;
-    std::string example_string{"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nPlease tell me a long and sad 
story<|im_end|>\n<|im_start|>assistant"}; + std::string example_string{ + "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello there<|im_end|>\n<|im_start|>assistant"}; }; From 03c0892ecc7b935ed3ca1e985d7e8a4d4ceb7dc0 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Thu, 7 Mar 2024 05:46:03 +0000 Subject: [PATCH 07/33] remove redundant include --- cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h index ee7b234d714..11001ac8508 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h @@ -5,17 +5,12 @@ #include #include "sentencepiece_processor.h" -#include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/gptModelConfig.h" #include "tensorrt_llm/runtime/gptSession.h" -#include "tensorrt_llm/runtime/iTensor.h" -#include "tensorrt_llm/runtime/memoryCounters.h" #include "tensorrt_llm/runtime/samplingConfig.h" #include "tensorrt_llm/runtime/tllmLogger.h" -#include "thread" #include #include #include From e537b9ccdc81cd9577d9d79f5545cab5b2bacd8a Mon Sep 17 00:00:00 2001 From: automaticcat Date: Thu, 7 Mar 2024 11:08:19 +0000 Subject: [PATCH 08/33] streaming working checkpoint --- .../nitro/controllers/tensorrtllm.cc | 213 +++++++++++---- .../nitro/controllers/tensorrtllm.h | 31 +-- cpp/tensorrt_llm/nitro/utils/nitro_utils.h | 251 ++++++++++++++++++ 3 files changed, 423 insertions(+), 72 deletions(-) create mode 100644 cpp/tensorrt_llm/nitro/utils/nitro_utils.h diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index bb4bef870d1..6257f5afccd 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -1,69 +1,119 @@ #include "tensorrtllm.h" +#include "tensorrt_llm/runtime/generationInput.h" +#include "tensorrt_llm/runtime/generationOutput.h" +#include "tensorrt_llm/runtime/samplingConfig.h" +#include "utils/nitro_utils.h" +#include +#include #include #include #include +#include #include #include #include -void removeZeroes(std::vector& vec) +void removeId(std::vector& vec, int id) { - vec.erase(std::remove(vec.begin(), vec.end(), 0), vec.end()); + vec.erase(std::remove(vec.begin(), vec.end(), id), vec.end()); } struct inferenceState { int prevPos{0}; bool isFinished; + std::queue textsToStream; + std::mutex queueMutex; // Mutex to protect access to textsToStream }; -void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function&& callback) +// Only support single token stopping point now +std::string create_return_json(const std::string& id, const std::string& model, const std::string& content, + Json::Value finish_reason = Json::Value()) { - std::shared_ptr inferState = std::make_shared(); + Json::Value root; - std::vector text_input = nitro_tokenizer.encode(example_string); - const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 2000}; // input_length, output_length + root["id"] = id; + root["model"] = model; + root["created"] = static_cast(std::time(nullptr)); + root["object"] = "chat.completion.chunk"; - const int batchSize = 1; + Json::Value choicesArray(Json::arrayValue); + Json::Value choice; - std::vector inputIdsHost = 
text_input; + choice["index"] = 0; + Json::Value delta; + delta["content"] = content; + choice["delta"] = delta; + choice["finish_reason"] = finish_reason; - std::cout << "Start Nitro testing session: " << std::endl; + choicesArray.append(choice); + root["choices"] = choicesArray; - auto& bufferManager = gptSession->getBufferManager(); - // Make stopwordlists + Json::StreamWriterBuilder writer; + writer["indentation"] = ""; // This sets the indentation to an empty string, + // producing compact output. + return Json::writeString(writer, root); +} - // Your stop word single token "32000" - std::vector stopWordsTokens = {32000, -1, 1, -1}; // Extend with -1 for increased length +GenerationInput::TensorPtr tensorrtllm::getTensorSingleStopWordList(int stopToken) +{ - // Tensor creation for stopWordsList - // Assuming the framework allows similar operations for creating custom tensors - // At this point, - // Input preparation - GenerationInput::TensorPtr inputIds - = bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); + std::vector stopWordsTokens = {stopToken, -1, 1, -1}; // Extend with -1 for increased length + return gptSession->getBufferManager().copyFrom(stopWordsTokens, ITensor::makeShape({1, 2, 2}), MemoryType::kGPU); +} - std::vector inputLengthsHost(batchSize, inOutLen[0]); +GenerationInput tensorrtllm::createGenerationInput(std::vector inputIdsHost) +{ + int inputLen = inputIdsHost.size(); + std::vector inputLengthsHost(batchSize, inputLen); GenerationInput::TensorPtr inputLengths - = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); - bool inputPacked = modelConfig->usePackedInput(); + = gptSession->getBufferManager().copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); + GenerationInput::TensorPtr inputIds = gptSession->getBufferManager().copyFrom( + inputIdsHost, ITensor::makeShape({batchSize, inputLen}), MemoryType::kGPU); - GenerationInput generationInput{0, 32000, inputIds, inputLengths, inputPacked}; + GenerationInput generationInput{0, 0, inputIds, inputLengths, modelConfig->usePackedInput()}; - // generationInput.stopWordsList = stopWordsTokensTensor; + generationInput.stopWordsList = getTensorSingleStopWordList(32000); + return generationInput; +} - generationInput.stopWordsList = bufferManager.copyFrom(stopWordsTokens, ITensor::makeShape({2,2}), MemoryType::kGPU); - generationInput.stopWordsList->reshape(ITensor::makeShape({1,2,2})); +GenerationOutput tensorrtllm::createGenerationOutput() +{ + GenerationOutput generationOutput{ + gptSession->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), + gptSession->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + return generationOutput; +} - LOG_INFO << "here is the shape: " << generationInput.stopWordsList->getShape().d[0]; - LOG_INFO << "here is the shape: " << generationInput.stopWordsList->getShape().d[1]; - GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), - bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; +void inferenceThread(std::shared_ptr inferState, + std::vector inputIdsHost, + std::function callback, + tensorrtllm* self) +{ + const int inputLen = inputIdsHost.size(); + const int outputLen = 2048 - inputLen; + + // Create sampling config + SamplingConfig samplingConfig{1}; + samplingConfig.temperature = std::vector{0.0f}; + samplingConfig.randomSeed = 
std::vector{static_cast(42ull)}; + samplingConfig.topK = std::vector{40}; + samplingConfig.topP = std::vector{0.0f}; + samplingConfig.minLength = std::vector{outputLen}; + samplingConfig.repetitionPenalty = std::vector{1.3f}; - // Define the callback to stream each generated token - generationOutput.onTokenGenerated = [&inferState, &bufferManager, inOutLen, this, &generationOutput]( + std::cout << "Start Nitro testing session: " << std::endl; + + // Input preparation + + GenerationInput generationInput = self->createGenerationInput(inputIdsHost); + + GenerationOutput generationOutput = self->createGenerationOutput(); + + + // Define the callback to stream each generated token + generationOutput.onTokenGenerated = [&inferState, inputLen, outputLen, self, &generationOutput]( GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) { if (!finished) @@ -72,47 +122,96 @@ void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::functiongetShape().d[2]; // Get the length of output IDs based on the tensor shape // Copy output IDs from GPU to host for printing std::vector outputIdsHost(outputLength); - bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); + self->gptSession->getBufferManager().copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); // Find the last non-zero value in the output IDs starting from the end of the input sequence - std::vector outputIdsHostDecode(outputIdsHost.begin() + inOutLen[0], outputIdsHost.end()); - removeZeroes(outputIdsHostDecode); - std::string text = nitro_tokenizer.decode(outputIdsHostDecode); + std::vector outputIdsHostDecode(outputIdsHost.begin() + inputLen, outputIdsHost.end()); + removeId(outputIdsHostDecode, 0); + removeId(outputIdsHostDecode, 32000); + std::string text = self->nitro_tokenizer.decode(outputIdsHostDecode); if (inferState->prevPos > 0 && inferState->prevPos < text.size()) { // Valid prevPos, proceed with slicing the string from prevPos to the end std::string stringTok(text.begin() + inferState->prevPos, text.end()); - std::cout << stringTok << std::flush; + std::lock_guard guard(inferState->queueMutex); // Protect access with a lock + inferState->textsToStream.push(stringTok); } else if (inferState->prevPos >= text.size()) { - // prevPos is out of bounds, indicating there might be no new text or an error in logic - // You can handle this case as needed, for example, by logging a warning inferState->prevPos = text.size(); } - else - { - // inferState->prevPos is 0 or negative, indicating a potential logic error or initial state - // If there's valid text, you might want to print it all or handle this case specifically - if (!text.empty()) - { - std::cout << text << std::flush; // Optionally print all text if it's the initial state - } - } - // Update prevPos to the new length of the text for the next iteration inferState->prevPos = text.size(); } }; + // The rest of the logic inside the `chat_completion` remains unchanged... 
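    // How the streaming handoff works: onTokenGenerated fires once per generation
    // step. Each call decodes the full output so far, slices off the new suffix
    // past inferState->prevPos, and pushes that fragment onto the mutex-guarded
    // textsToStream queue. The chunked-content provider in chat_completion drains
    // the queue from the HTTP thread, so queueMutex is the only synchronization
    // point between this inference worker thread and the response stream.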
+ // After finishing the setup, call the inference logic + self->gptSession->generate(generationOutput, generationInput, samplingConfig); +} + + +void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function&& callback) +{ + std::shared_ptr inferState = std::make_shared(); + + std::vector inputIdsHost = nitro_tokenizer.encode(example_string); + const int inputLen = inputIdsHost.size(); + const int outputLen = 2048 - inputLen; - gptSession->generate(generationOutput, generationInput, samplingConfig); + // Create sampling config + SamplingConfig samplingConfig{1}; + samplingConfig.temperature = std::vector{0.0f}; + samplingConfig.randomSeed = std::vector{static_cast(42ull)}; + samplingConfig.topK = std::vector{40}; + samplingConfig.topP = std::vector{0.0f}; + samplingConfig.minLength = std::vector{outputLen}; + samplingConfig.repetitionPenalty = std::vector{1.3f}; - bufferManager.getStream().synchronize(); + std::cout << "Start Nitro testing session: " << std::endl; + + // Input preparation + + std::thread infThread(inferenceThread, inferState, inputIdsHost, callback, this); + infThread.detach(); // Detach the thread to allow it to run independently + + + auto chunked_content_provider = [inferState](char* pBuffer, std::size_t nBuffSize) -> std::size_t + { + std::cout << "EMPTY"; + if (!pBuffer) + { + LOG_INFO << "Connection closed or buffer is null. Reset context"; + return 0; // Indicate no more data to send + } + + while (true) // Continuously check if the queue is not empty + { + std::unique_lock lock(inferState->queueMutex); // Lock the queue for exclusive access + if (!inferState->textsToStream.empty()) + { + + std::string rawText = inferState->textsToStream.front(); + const std::string textToStream + = "data: " + create_return_json(nitro_utils::generate_random_string(20), "_", rawText) + "\n\n"; + inferState->textsToStream.pop(); + lock.unlock(); // Unlock as soon as possible + + // Ensure we do not exceed the buffer size. Truncate if necessary. 
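                // Caveat: textToStream has already been popped from the queue, so if
                // it were ever larger than nBuffSize the truncated tail would simply
                // be dropped and the client would see a malformed "data:" frame.
                // Single-token deltas make that unlikely in practice, but a robust
                // version would stash the remainder for the next invocation.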
+ std::size_t bytesToWrite = std::min(nBuffSize, textToStream.size()); + + // Copy the text to the provided buffer + std::memcpy(pBuffer, textToStream.data(), bytesToWrite); + return bytesToWrite; // Return the number of bytes written to the buffer + } + else + { + // If the queue is empty, release the lock and wait before trying again + lock.unlock(); + } + } + }; - auto resp=HttpResponse::newHttpResponse(); - resp->setStatusCode(k200OK); - resp->setBody("Your Page Contents"); - callback(resp); - LOG_INFO << "Hello world"; + auto streamResponse = nitro_utils::nitroStreamResponse(chunked_content_provider); + callback(streamResponse); return; }; diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h index 11001ac8508..9a4c9b7dbc0 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h @@ -6,6 +6,8 @@ #include "sentencepiece_processor.h" #include "tensorrt_llm/plugins/api/tllmPlugin.h" +#include "tensorrt_llm/runtime/generationInput.h" +#include "tensorrt_llm/runtime/generationOutput.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/gptModelConfig.h" #include "tensorrt_llm/runtime/gptSession.h" @@ -83,7 +85,6 @@ class tensorrtllm : public drogon::HttpController // Fixed settings const std::string modelName = "mistral"; const std::filesystem::path engineDir = "/app/mistral_engine_3/"; - const int batchSize = 1; initTrtLlmPlugins(logger.get()); // Load model configuration std::filesystem::path jsonFileName = engineDir / "config.json"; @@ -94,19 +95,13 @@ class tensorrtllm : public drogon::HttpController auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); auto const dtype = modelConfig->getDataType(); - // Set gptsessionconfig + // Currently doing fixed session config sessionConfig.maxBatchSize = batchSize; sessionConfig.maxBeamWidth = 1; // Fixed for simplicity - sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1]; - sessionConfig.cudaGraphMode = false; // Fixed for simplicity - - // Set smapling config - samplingConfig.temperature = std::vector{0.0f}; - samplingConfig.randomSeed = std::vector{static_cast(42ull)}; - samplingConfig.topK = std::vector{40}; - samplingConfig.topP = std::vector{0.0f}; - samplingConfig.minLength = std::vector{inOutLen[1]}; - samplingConfig.repetitionPenalty = std::vector{1.3f}; + sessionConfig.maxSequenceLength = 2048; + sessionConfig.cudaGraphMode = true; // Fixed for simplicity + + // Init gptSession gptSession = std::make_unique(sessionConfig, *modelConfig, worldConfig, enginePath.string(), logger); }; @@ -124,13 +119,19 @@ class tensorrtllm : public drogon::HttpController // p2); void chat_completion(const HttpRequestPtr& req, std::function&& callback); + std::unique_ptr gptSession; + GenerationInput::TensorPtr getTensorSingleStopWordList(int stopToken); + GenerationInput createGenerationInput(std::vector inputIds); + GenerationOutput createGenerationOutput(); + Tokenizer nitro_tokenizer{"./new_chatml_tokenizer.model"}; + private: GptSession::Config sessionConfig{1, 1, 1}; SamplingConfig samplingConfig{1}; std::unique_ptr modelConfig; - Tokenizer nitro_tokenizer{"./new_chatml_tokenizer.model"}; - std::unique_ptr gptSession; std::shared_ptr logger; std::string example_string{ - "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello there<|im_end|>\n<|im_start|>assistant"}; + "<|im_start|>system\nYou are a helpful 
assistant<|im_end|>\n<|im_start|>user\nPlease write a long and sad " + "story<|im_end|>\n<|im_start|>assistant"}; + int batchSize = 1; }; diff --git a/cpp/tensorrt_llm/nitro/utils/nitro_utils.h b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h new file mode 100644 index 00000000000..c5dda96eb66 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h @@ -0,0 +1,251 @@ +#pragma once +#include "cstdio" +#include "random" +#include "string" +#include +#include +#include +#include +#include +#include +#include +#include +// Include platform-specific headers +#ifdef _WIN32 +#include +#include +#else +#include +#endif + +namespace nitro_utils { + +inline std::string models_folder = "./models"; + +inline std::string extractBase64(const std::string &input) { + std::regex pattern("base64,(.*)"); + std::smatch match; + + if (std::regex_search(input, match, pattern)) { + std::string base64_data = match[1]; + base64_data = base64_data.substr(0, base64_data.length() - 1); + return base64_data; + } + + return ""; +} + +// Helper function to encode data to Base64 +inline std::string base64Encode(const std::vector &data) { + static const char encodingTable[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + std::string encodedData; + int i = 0; + int j = 0; + unsigned char array3[3]; + unsigned char array4[4]; + + for (unsigned char c : data) { + array3[i++] = c; + if (i == 3) { + array4[0] = (array3[0] & 0xfc) >> 2; + array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); + array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); + array4[3] = array3[2] & 0x3f; + + for (i = 0; i < 4; i++) + encodedData += encodingTable[array4[i]]; + i = 0; + } + } + + if (i) { + for (j = i; j < 3; j++) + array3[j] = '\0'; + + array4[0] = (array3[0] & 0xfc) >> 2; + array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); + array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); + + for (j = 0; j < i + 1; j++) + encodedData += encodingTable[array4[j]]; + + while (i++ < 3) + encodedData += '='; + } + + return encodedData; +} + +// Function to load an image and convert it to Base64 +inline std::string imageToBase64(const std::string &imagePath) { + std::ifstream imageFile(imagePath, std::ios::binary); + if (!imageFile.is_open()) { + throw std::runtime_error("Could not open the image file."); + } + + std::vector buffer(std::istreambuf_iterator(imageFile), + {}); + return base64Encode(buffer); +} + +// Helper function to generate a unique filename +inline std::string generateUniqueFilename(const std::string &prefix, + const std::string &extension) { + // Get current time as a timestamp + auto now = std::chrono::system_clock::now(); + auto now_ms = std::chrono::time_point_cast(now); + auto epoch = now_ms.time_since_epoch(); + auto value = std::chrono::duration_cast(epoch); + + // Generate a random number + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(1000, 9999); + + std::stringstream ss; + ss << prefix << value.count() << "_" << dis(gen) << extension; + return ss.str(); +} + +inline void +processLocalImage(const std::string &localPath, + std::function callback) { + try { + std::string base64Image = imageToBase64(localPath); + callback(base64Image); // Invoke the callback with the Base64 string + } catch (const std::exception &e) { + std::cerr << "Error during processing: " << e.what() << std::endl; + } +} + +inline std::vector listFilesInDir(const std::string &path) { + std::vector files; + +#ifdef _WIN32 + // 
Windows-specific code + WIN32_FIND_DATA findFileData; + HANDLE hFind = FindFirstFile((path + "\\*").c_str(), &findFileData); + + if (hFind != INVALID_HANDLE_VALUE) { + do { + if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) { + files.push_back(findFileData.cFileName); + } + } while (FindNextFile(hFind, &findFileData) != 0); + FindClose(hFind); + } +#else + // POSIX-specific code (Linux, Unix, MacOS) + DIR *dir; + struct dirent *ent; + + if ((dir = opendir(path.c_str())) != NULL) { + while ((ent = readdir(dir)) != NULL) { + if (ent->d_type == DT_REG) { // Check if it's a regular file + files.push_back(ent->d_name); + } + } + closedir(dir); + } +#endif + + return files; +} + +inline std::string rtrim(const std::string &str) { + size_t end = str.find_last_not_of("\n\t "); + return (end == std::string::npos) ? "" : str.substr(0, end + 1); +} + +inline std::string generate_random_string(std::size_t length) { + const std::string characters = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + std::random_device rd; + std::mt19937 generator(rd()); + + std::uniform_int_distribution<> distribution(0, characters.size() - 1); + + std::string random_string(length, '\0'); + std::generate_n(random_string.begin(), length, + [&]() { return characters[distribution(generator)]; }); + + return random_string; +} + +inline void nitro_logo() { + std::string rainbowColors[] = { + "\033[93m", // Yellow + "\033[94m", // Blue + }; + + std::string resetColor = "\033[0m"; + std::string asciiArt = + " ___ ___ ___ \n" + " /__/ ___ ___ / /\\ / /\\ \n" + " \\ \\:\\ / /\\ / /\\ / /::\\ / /::\\ " + " \n" + " \\ \\:\\ / /:/ / /:/ / /:/\\:\\ / /:/\\:\\ " + " \n" + " _____\\__\\:\\ /__/::\\ / /:/ / /:/ \\:\\ / /:/ " + "\\:\\ \n" + " /__/::::::::\\ \\__\\/\\:\\__ / /::\\ /__/:/ /:/___ /__/:/ " + "\\__\\:\\\n" + " \\ \\:\\~~\\~~\\/ \\ \\:\\/\\ /__/:/\\:\\ \\ \\:\\/:::::/ \\ " + "\\:\\ / /:/\n" + " \\ \\:\\ ~~~ \\__\\::/ \\__\\/ \\:\\ \\ \\::/~~~~ \\ " + "\\:\\ /:/ \n" + " \\ \\:\\ /__/:/ \\ \\:\\ \\ \\:\\ \\ " + "\\:\\/:/ \n" + " \\ \\:\\ \\__\\/ \\__\\/ \\ \\:\\ \\ " + "\\::/ \n" + " \\__\\/ \\__\\/ \\__\\/ " + "\n"; + + int colorIndex = 0; + + for (char c : asciiArt) { + if (c == '\n') { + std::cout << resetColor << c; + colorIndex = 0; + } else { + std::cout << rainbowColors[colorIndex % 2] << c; + colorIndex++; + } + } + + std::cout << resetColor; // Reset color at the endreturn; +} + +inline drogon::HttpResponsePtr nitroHttpResponse() { + auto resp = drogon::HttpResponse::newHttpResponse(); +#ifdef ALLOW_ALL_CORS + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); +#endif + return resp; +} + +inline drogon::HttpResponsePtr nitroHttpJsonResponse(const Json::Value &data) { + auto resp = drogon::HttpResponse::newHttpJsonResponse(data); +#ifdef ALLOW_ALL_CORS + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); +#endif + return resp; +}; + +inline drogon::HttpResponsePtr nitroStreamResponse( + const std::function &callback, + const std::string &attachmentFileName = "") { + auto resp = drogon::HttpResponse::newStreamResponse( + callback, attachmentFileName, drogon::CT_NONE, "text/event-stream"); +#ifdef ALLOW_ALL_CORS + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); +#endif + return resp; +} + +} // namespace nitro_utils From 07ee3545eab3abcacd5ffb3439a950e2705a5194 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Thu, 7 Mar 2024 14:22:50 +0000 Subject: [PATCH 
09/33] latest demo --- .../nitro/controllers/tensorrtllm.cc | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index 6257f5afccd..5bf68ebf017 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -85,11 +85,8 @@ GenerationOutput tensorrtllm::createGenerationOutput() return generationOutput; } - -void inferenceThread(std::shared_ptr inferState, - std::vector inputIdsHost, - std::function callback, - tensorrtllm* self) +void inferenceThread(std::shared_ptr inferState, std::vector inputIdsHost, + std::function callback, tensorrtllm* self) { const int inputLen = inputIdsHost.size(); const int outputLen = 2048 - inputLen; @@ -111,8 +108,7 @@ void inferenceThread(std::shared_ptr inferState, GenerationOutput generationOutput = self->createGenerationOutput(); - - // Define the callback to stream each generated token + // Define the callback to stream each generated token generationOutput.onTokenGenerated = [&inferState, inputLen, outputLen, self, &generationOutput]( GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) { @@ -141,14 +137,16 @@ void inferenceThread(std::shared_ptr inferState, inferState->prevPos = text.size(); } inferState->prevPos = text.size(); + return; } + std::lock_guard guard(inferState->queueMutex); // Protect access with a lock + inferState->textsToStream.push("[DONE]"); }; // The rest of the logic inside the `chat_completion` remains unchanged... // After finishing the setup, call the inference logic self->gptSession->generate(generationOutput, generationInput, samplingConfig); } - void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function&& callback) { std::shared_ptr inferState = std::make_shared(); @@ -173,16 +171,19 @@ void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function std::size_t { - std::cout << "EMPTY"; if (!pBuffer) { LOG_INFO << "Connection closed or buffer is null. 
Reset context"; return 0; // Indicate no more data to send } + if (inferState->isFinished) + { + return 0; + } + while (true) // Continuously check if the queue is not empty { std::unique_lock lock(inferState->queueMutex); // Lock the queue for exclusive access @@ -190,6 +191,18 @@ void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::functiontextsToStream.front(); + if (rawText == "[DONE]") + { + LOG_INFO << "End of result"; + const std::string str + = "data: " + create_return_json(nitro_utils::generate_random_string(20), "_", "", "stop") + + "\n\n" + "data: [DONE]" + "\n\n"; + + std::size_t nRead = std::min(str.size(), nBuffSize); + memcpy(pBuffer, str.data(), nRead); + inferState->isFinished = true; + return nRead; + } const std::string textToStream = "data: " + create_return_json(nitro_utils::generate_random_string(20), "_", rawText) + "\n\n"; inferState->textsToStream.pop(); From 3a9a5af49729c2c6568666f68ae6be2fed10c69b Mon Sep 17 00:00:00 2001 From: automaticcat Date: Fri, 8 Mar 2024 02:23:59 +0000 Subject: [PATCH 10/33] openai compatible chat --- 3rdparty/cutlass | 2 +- .../nitro/controllers/tensorrtllm.cc | 55 ++++++++++++++++++- .../nitro/controllers/tensorrtllm.h | 16 ++++-- .../nitro/models/chat_completion_request.h | 36 ++++++++++++ 4 files changed, 100 insertions(+), 9 deletions(-) create mode 100644 cpp/tensorrt_llm/nitro/models/chat_completion_request.h diff --git a/3rdparty/cutlass b/3rdparty/cutlass index 8236f30675b..a8f2c80db05 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit 8236f30675bbe98f81d11c05764b77bfcb25b8cc +Subproject commit a8f2c80db0564c74f4efccac71993b971dfc448b diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index 5bf68ebf017..8990b1aed7d 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -1,4 +1,6 @@ #include "tensorrtllm.h" +#include "models/chat_completion_request.h" +#include "nlohmann/json.hpp" #include "tensorrt_llm/runtime/generationInput.h" #include "tensorrt_llm/runtime/generationOutput.h" #include "tensorrt_llm/runtime/samplingConfig.h" @@ -13,6 +15,8 @@ #include #include +using json = nlohmann::json; + void removeId(std::vector& vec, int id) { vec.erase(std::remove(vec.begin(), vec.end(), id), vec.end()); @@ -147,11 +151,58 @@ void inferenceThread(std::shared_ptr inferState, std::vectorgptSession->generate(generationOutput, generationInput, samplingConfig); } -void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function&& callback) +void tensorrtllm::chat_completion( + inferences::ChatCompletionRequest&& completion, std::function&& callback) { + + std::string formatted_input = pre_prompt; + + nlohmann::json data; + + data["stream"] = completion.stream; + data["n_predict"] = completion.max_tokens; + data["top_p"] = completion.top_p; + data["temperature"] = completion.temperature; + data["frequency_penalty"] = completion.frequency_penalty; + data["presence_penalty"] = completion.presence_penalty; + const Json::Value& messages = completion.messages; + + // Format the input from user + for (const auto& message : messages) + { + std::string input_role = message["role"].asString(); + std::string role; + if (input_role == "user") + { + role = user_prompt; + std::string content = message["content"].asString(); + formatted_input += role + content; + } + else if (input_role == "assistant") + { + role = ai_prompt; + std::string content = 
message["content"].asString(); + formatted_input += role + content; + } + else if (input_role == "system") + { + role = system_prompt; + std::string content = message["content"].asString(); + formatted_input = role + content + formatted_input; + } + else + { + role = input_role; + std::string content = message["content"].asString(); + formatted_input += role + content; + } + } + formatted_input += ai_prompt; + // Format the input from user + std::shared_ptr inferState = std::make_shared(); - std::vector inputIdsHost = nitro_tokenizer.encode(example_string); + std::vector inputIdsHost = nitro_tokenizer.encode(formatted_input); const int inputLen = inputIdsHost.size(); const int outputLen = 2048 - inputLen; diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h index 9a4c9b7dbc0..bd770c03b58 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h @@ -1,5 +1,6 @@ #pragma once +#include "drogon/HttpTypes.h" #include "sentencepiece_processor.h" #include #include @@ -20,6 +21,8 @@ #include #include +#include "models/chat_completion_request.h" + using namespace drogon; using namespace tensorrt_llm::runtime; @@ -76,10 +79,6 @@ class tensorrtllm : public drogon::HttpController public: tensorrtllm() { - std::vector text_input = nitro_tokenizer.encode(example_string); - const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 2000}; // input_length, output_length - logger = std::make_shared(); logger->setLevel(nvinfer1::ILogger::Severity::kINFO); // Fixed settings @@ -110,14 +109,15 @@ class tensorrtllm : public drogon::HttpController // use METHOD_ADD to add your custom processing function here; // METHOD_ADD(tensorrtllm::get, "/{2}/{1}", Get); // path is /tensorrtllm/{arg2}/{arg1} // METHOD_ADD(tensorrtllm::your_method_name, "/{1}/{2}/list", Get); // path is /tensorrtllm/{arg1}/{arg2}/list - ADD_METHOD_TO(tensorrtllm::chat_completion, "/testing", Get); // path is + ADD_METHOD_TO(tensorrtllm::chat_completion, "/v1/chat/completions", Post); // path is // /absolute/path/{arg1}/{arg2}/list METHOD_LIST_END // your declaration of processing function maybe like this: // void get(const HttpRequestPtr& req, std::function &&callback, int p1, std::string // p2); - void chat_completion(const HttpRequestPtr& req, std::function&& callback); + void chat_completion( + inferences::ChatCompletionRequest&& completion, std::function&& callback); std::unique_ptr gptSession; GenerationInput::TensorPtr getTensorSingleStopWordList(int stopToken); @@ -133,5 +133,9 @@ class tensorrtllm : public drogon::HttpController std::string example_string{ "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nPlease write a long and sad " "story<|im_end|>\n<|im_start|>assistant"}; + std::string user_prompt{"<|im_end|>\n<|im_start|>user\n"}; + std::string ai_prompt{"<|im_end|>\n<|im_start|>assistant\n"}; + std::string system_prompt{"<|im_start|>system\n"}; + std::string pre_prompt; int batchSize = 1; }; diff --git a/cpp/tensorrt_llm/nitro/models/chat_completion_request.h b/cpp/tensorrt_llm/nitro/models/chat_completion_request.h new file mode 100644 index 00000000000..bd802d67e02 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/models/chat_completion_request.h @@ -0,0 +1,36 @@ +#pragma once +#include + +namespace inferences { +struct ChatCompletionRequest { + bool stream = false; + int max_tokens = 500; + float top_p = 0.95; + float temperature = 0.8; + float frequency_penalty = 
0; + float presence_penalty = 0; + Json::Value stop = Json::Value(Json::arrayValue); + Json::Value messages = Json::Value(Json::arrayValue); +}; +} // namespace inferences + +namespace drogon { +template <> +inline inferences::ChatCompletionRequest fromRequest(const HttpRequest& req) { + auto jsonBody = req.getJsonObject(); + inferences::ChatCompletionRequest completion; + if (jsonBody) { + completion.stream = (*jsonBody).get("stream", false).asBool(); + completion.max_tokens = (*jsonBody).get("max_tokens", 500).asInt(); + completion.top_p = (*jsonBody).get("top_p", 0.95).asFloat(); + completion.temperature = (*jsonBody).get("temperature", 0.8).asFloat(); + completion.frequency_penalty = + (*jsonBody).get("frequency_penalty", 0).asFloat(); + completion.presence_penalty = + (*jsonBody).get("presence_penalty", 0).asFloat(); + completion.messages = (*jsonBody)["messages"]; + completion.stop = (*jsonBody)["stop"]; + } + return completion; +} +} // namespace inferences From 1845d8e4278b9e6a7eef63713156c84a4ac7da58 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Fri, 8 Mar 2024 04:37:04 +0000 Subject: [PATCH 11/33] feat: add splash screen and model load --- .../nitro/controllers/tensorrtllm.cc | 57 ++- .../nitro/controllers/tensorrtllm.h | 40 +- cpp/tensorrt_llm/nitro/main.cc | 76 +++- cpp/tensorrt_llm/nitro/utils/nitro_utils.h | 397 ++++++++++-------- 4 files changed, 356 insertions(+), 214 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index 8990b1aed7d..999d7b18a82 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -16,6 +16,7 @@ #include using json = nlohmann::json; +using namespace inferences; void removeId(std::vector& vec, int id) { @@ -127,7 +128,7 @@ void inferenceThread(std::shared_ptr inferState, std::vector outputIdsHostDecode(outputIdsHost.begin() + inputLen, outputIdsHost.end()); removeId(outputIdsHostDecode, 0); removeId(outputIdsHostDecode, 32000); - std::string text = self->nitro_tokenizer.decode(outputIdsHostDecode); + std::string text = self->nitro_tokenizer->decode(outputIdsHostDecode); if (inferState->prevPos > 0 && inferState->prevPos < text.size()) { @@ -202,7 +203,7 @@ void tensorrtllm::chat_completion( std::shared_ptr inferState = std::make_shared(); - std::vector inputIdsHost = nitro_tokenizer.encode(formatted_input); + std::vector inputIdsHost = nitro_tokenizer->encode(formatted_input); const int inputLen = inputIdsHost.size(); const int outputLen = 2048 - inputLen; @@ -279,4 +280,56 @@ void tensorrtllm::chat_completion( return; }; +void tensorrtllm::loadModel(const HttpRequestPtr& req, std::function&& callback) +{ + const auto& jsonBody = req->getJsonObject(); + + if (!jsonBody) + { + Json::Value jsonResp; + jsonResp["message"] = "Require params!"; + auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); + callback(resp); + return; + } + + const std::filesystem::path engineDir = jsonBody->operator[]("engine_path").asString(); + int ctx_len = jsonBody->get("ctx_len", 2048).asInt(); + + logger = std::make_shared(); + logger->setLevel(nvinfer1::ILogger::Severity::kINFO); + // Fixed settings + const std::string modelName = "mistral"; + initTrtLlmPlugins(logger.get()); + // Load model configuration + std::filesystem::path jsonFileName = engineDir / "config.json"; + std::filesystem::path tokenizerModelName = engineDir / "tokenizer.model"; + + nitro_tokenizer = std::make_unique(tokenizerModelName.string()); + LOG_INFO << 
"Loaded tokenizer"; + + auto const json = GptJsonConfig::parse(jsonFileName); + auto config = json.getModelConfig(); + modelConfig = std::make_unique(config); + auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); + auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); + LOG_INFO << "Engine Path : " << enginePath.string(); + auto const dtype = modelConfig->getDataType(); + + // Currently doing fixed session config + sessionConfig.maxBatchSize = batchSize; + sessionConfig.maxBeamWidth = 1; // Fixed for simplicity + sessionConfig.maxSequenceLength = ctx_len; + sessionConfig.cudaGraphMode = true; // Fixed for simplicity + + // Init gptSession + gptSession = std::make_unique(sessionConfig, *modelConfig, worldConfig, enginePath.string(), logger); + // Model loaded successfully + Json::Value jsonResp; + jsonResp["message"] = "Model loaded successfully"; + auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); + callback(resp); + return; +}; + // Add definition of your processing function here diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h index bd770c03b58..0ecae873d27 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h @@ -74,43 +74,18 @@ class Tokenizer } }; +namespace inferences +{ + class tensorrtllm : public drogon::HttpController { public: - tensorrtllm() - { - logger = std::make_shared(); - logger->setLevel(nvinfer1::ILogger::Severity::kINFO); - // Fixed settings - const std::string modelName = "mistral"; - const std::filesystem::path engineDir = "/app/mistral_engine_3/"; - initTrtLlmPlugins(logger.get()); - // Load model configuration - std::filesystem::path jsonFileName = engineDir / "config.json"; - auto const json = GptJsonConfig::parse(jsonFileName); - auto config = json.getModelConfig(); - modelConfig = std::make_unique(config); - auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); - auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); - auto const dtype = modelConfig->getDataType(); - - // Currently doing fixed session config - sessionConfig.maxBatchSize = batchSize; - sessionConfig.maxBeamWidth = 1; // Fixed for simplicity - sessionConfig.maxSequenceLength = 2048; - sessionConfig.cudaGraphMode = true; // Fixed for simplicity - - // Init gptSession - gptSession - = std::make_unique(sessionConfig, *modelConfig, worldConfig, enginePath.string(), logger); - }; + tensorrtllm(){}; METHOD_LIST_BEGIN // use METHOD_ADD to add your custom processing function here; - // METHOD_ADD(tensorrtllm::get, "/{2}/{1}", Get); // path is /tensorrtllm/{arg2}/{arg1} - // METHOD_ADD(tensorrtllm::your_method_name, "/{1}/{2}/list", Get); // path is /tensorrtllm/{arg1}/{arg2}/list ADD_METHOD_TO(tensorrtllm::chat_completion, "/v1/chat/completions", Post); // path is - // /absolute/path/{arg1}/{arg2}/list + METHOD_ADD(tensorrtllm::loadModel, "loadmodel", Post); METHOD_LIST_END // your declaration of processing function maybe like this: @@ -119,11 +94,12 @@ class tensorrtllm : public drogon::HttpController void chat_completion( inferences::ChatCompletionRequest&& completion, std::function&& callback); + void loadModel(const HttpRequestPtr& req, std::function&& callback); std::unique_ptr gptSession; GenerationInput::TensorPtr getTensorSingleStopWordList(int stopToken); GenerationInput createGenerationInput(std::vector 
inputIds); GenerationOutput createGenerationOutput(); - Tokenizer nitro_tokenizer{"./new_chatml_tokenizer.model"}; + std::unique_ptr nitro_tokenizer; private: GptSession::Config sessionConfig{1, 1, 1}; @@ -139,3 +115,5 @@ class tensorrtllm : public drogon::HttpController std::string pre_prompt; int batchSize = 1; }; + +} // namespace inferences diff --git a/cpp/tensorrt_llm/nitro/main.cc b/cpp/tensorrt_llm/nitro/main.cc index 97c7ddba686..730253f74f3 100644 --- a/cpp/tensorrt_llm/nitro/main.cc +++ b/cpp/tensorrt_llm/nitro/main.cc @@ -1,11 +1,73 @@ +#include "utils/nitro_utils.h" +#include // for PATH_MAX +#include #include -int main() { - //Set HTTP listener address and port - drogon::app().addListener("0.0.0.0", 5555); - //Load config file - //drogon::app().loadConfigFile("../config.json"); - //drogon::app().loadConfigFile("../config.yaml"); - //Run HTTP framework,the method will block in the internal event loop +#include + +#if defined(__APPLE__) && defined(__MACH__) +#include // for dirname() +#include +#elif defined(__linux__) +#include // for dirname() +#include // for readlink() +#elif defined(_WIN32) +#include +#undef max +#else +#error "Unsupported platform!" +#endif + +int main(int argc, char* argv[]) +{ + int thread_num = 1; + std::string host = "127.0.0.1"; + int port = 3928; + std::string uploads_folder_path; + + // Number of nitro threads + if (argc > 1) + { + thread_num = std::atoi(argv[1]); + } + + // Check for host argument + if (argc > 2) + { + host = argv[2]; + } + + // Check for port argument + if (argc > 3) + { + port = std::atoi(argv[3]); // Convert string argument to int + } + + // Uploads folder path + if (argc > 4) + { + uploads_folder_path = argv[4]; + } + + int logical_cores = std::thread::hardware_concurrency(); + int drogon_thread_num = 1; // temporarily set thread num to 1 + nitro_utils::nitro_logo(); +#ifdef NITRO_VERSION + LOG_INFO << "Nitro version: " << NITRO_VERSION; +#else + LOG_INFO << "Nitro version: undefined"; +#endif + LOG_INFO << "Server started, listening at: " << host << ":" << port; + LOG_INFO << "Please load your model"; + drogon::app().addListener(host, port); + drogon::app().setThreadNum(drogon_thread_num); + if (!uploads_folder_path.empty()) + { + LOG_INFO << "Drogon uploads folder is at: " << uploads_folder_path; + drogon::app().setUploadPath(uploads_folder_path); + } + LOG_INFO << "Number of thread is:" << drogon::app().getThreadNum(); + drogon::app().run(); + return 0; } diff --git a/cpp/tensorrt_llm/nitro/utils/nitro_utils.h b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h index c5dda96eb66..628dc5f46b1 100644 --- a/cpp/tensorrt_llm/nitro/utils/nitro_utils.h +++ b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h @@ -12,240 +12,289 @@ #include // Include platform-specific headers #ifdef _WIN32 -#include #include +#include #else #include #endif -namespace nitro_utils { +namespace nitro_utils +{ inline std::string models_folder = "./models"; -inline std::string extractBase64(const std::string &input) { - std::regex pattern("base64,(.*)"); - std::smatch match; +inline std::string extractBase64(const std::string& input) +{ + std::regex pattern("base64,(.*)"); + std::smatch match; - if (std::regex_search(input, match, pattern)) { - std::string base64_data = match[1]; - base64_data = base64_data.substr(0, base64_data.length() - 1); - return base64_data; - } + if (std::regex_search(input, match, pattern)) + { + std::string base64_data = match[1]; + base64_data = base64_data.substr(0, base64_data.length() - 1); + return base64_data; + } - return ""; + 
return ""; } // Helper function to encode data to Base64 -inline std::string base64Encode(const std::vector &data) { - static const char encodingTable[] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - std::string encodedData; - int i = 0; - int j = 0; - unsigned char array3[3]; - unsigned char array4[4]; - - for (unsigned char c : data) { - array3[i++] = c; - if (i == 3) { - array4[0] = (array3[0] & 0xfc) >> 2; - array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); - array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); - array4[3] = array3[2] & 0x3f; - - for (i = 0; i < 4; i++) - encodedData += encodingTable[array4[i]]; - i = 0; +inline std::string base64Encode(const std::vector& data) +{ + static const char encodingTable[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + std::string encodedData; + int i = 0; + int j = 0; + unsigned char array3[3]; + unsigned char array4[4]; + + for (unsigned char c : data) + { + array3[i++] = c; + if (i == 3) + { + array4[0] = (array3[0] & 0xfc) >> 2; + array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); + array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); + array4[3] = array3[2] & 0x3f; + + for (i = 0; i < 4; i++) + encodedData += encodingTable[array4[i]]; + i = 0; + } } - } - if (i) { - for (j = i; j < 3; j++) - array3[j] = '\0'; + if (i) + { + for (j = i; j < 3; j++) + array3[j] = '\0'; - array4[0] = (array3[0] & 0xfc) >> 2; - array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); - array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); + array4[0] = (array3[0] & 0xfc) >> 2; + array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); + array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); - for (j = 0; j < i + 1; j++) - encodedData += encodingTable[array4[j]]; + for (j = 0; j < i + 1; j++) + encodedData += encodingTable[array4[j]]; - while (i++ < 3) - encodedData += '='; - } + while (i++ < 3) + encodedData += '='; + } - return encodedData; + return encodedData; } // Function to load an image and convert it to Base64 -inline std::string imageToBase64(const std::string &imagePath) { - std::ifstream imageFile(imagePath, std::ios::binary); - if (!imageFile.is_open()) { - throw std::runtime_error("Could not open the image file."); - } - - std::vector buffer(std::istreambuf_iterator(imageFile), - {}); - return base64Encode(buffer); +inline std::string imageToBase64(const std::string& imagePath) +{ + std::ifstream imageFile(imagePath, std::ios::binary); + if (!imageFile.is_open()) + { + throw std::runtime_error("Could not open the image file."); + } + + std::vector buffer(std::istreambuf_iterator(imageFile), {}); + return base64Encode(buffer); } // Helper function to generate a unique filename -inline std::string generateUniqueFilename(const std::string &prefix, - const std::string &extension) { - // Get current time as a timestamp - auto now = std::chrono::system_clock::now(); - auto now_ms = std::chrono::time_point_cast(now); - auto epoch = now_ms.time_since_epoch(); - auto value = std::chrono::duration_cast(epoch); - - // Generate a random number - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution<> dis(1000, 9999); - - std::stringstream ss; - ss << prefix << value.count() << "_" << dis(gen) << extension; - return ss.str(); +inline std::string generateUniqueFilename(const std::string& prefix, const std::string& extension) +{ + // Get current time as a timestamp + auto now = 
std::chrono::system_clock::now(); + auto now_ms = std::chrono::time_point_cast(now); + auto epoch = now_ms.time_since_epoch(); + auto value = std::chrono::duration_cast(epoch); + + // Generate a random number + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(1000, 9999); + + std::stringstream ss; + ss << prefix << value.count() << "_" << dis(gen) << extension; + return ss.str(); } -inline void -processLocalImage(const std::string &localPath, - std::function callback) { - try { - std::string base64Image = imageToBase64(localPath); - callback(base64Image); // Invoke the callback with the Base64 string - } catch (const std::exception &e) { - std::cerr << "Error during processing: " << e.what() << std::endl; - } +inline void processLocalImage(const std::string& localPath, std::function callback) +{ + try + { + std::string base64Image = imageToBase64(localPath); + callback(base64Image); // Invoke the callback with the Base64 string + } + catch (const std::exception& e) + { + std::cerr << "Error during processing: " << e.what() << std::endl; + } } -inline std::vector listFilesInDir(const std::string &path) { - std::vector files; +inline std::vector listFilesInDir(const std::string& path) +{ + std::vector files; #ifdef _WIN32 - // Windows-specific code - WIN32_FIND_DATA findFileData; - HANDLE hFind = FindFirstFile((path + "\\*").c_str(), &findFileData); - - if (hFind != INVALID_HANDLE_VALUE) { - do { - if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) { - files.push_back(findFileData.cFileName); - } - } while (FindNextFile(hFind, &findFileData) != 0); - FindClose(hFind); - } + // Windows-specific code + WIN32_FIND_DATA findFileData; + HANDLE hFind = FindFirstFile((path + "\\*").c_str(), &findFileData); + + if (hFind != INVALID_HANDLE_VALUE) + { + do + { + if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) + { + files.push_back(findFileData.cFileName); + } + } while (FindNextFile(hFind, &findFileData) != 0); + FindClose(hFind); + } #else - // POSIX-specific code (Linux, Unix, MacOS) - DIR *dir; - struct dirent *ent; - - if ((dir = opendir(path.c_str())) != NULL) { - while ((ent = readdir(dir)) != NULL) { - if (ent->d_type == DT_REG) { // Check if it's a regular file - files.push_back(ent->d_name); - } + // POSIX-specific code (Linux, Unix, MacOS) + DIR* dir; + struct dirent* ent; + + if ((dir = opendir(path.c_str())) != NULL) + { + while ((ent = readdir(dir)) != NULL) + { + if (ent->d_type == DT_REG) + { // Check if it's a regular file + files.push_back(ent->d_name); + } + } + closedir(dir); } - closedir(dir); - } #endif - return files; + return files; } -inline std::string rtrim(const std::string &str) { - size_t end = str.find_last_not_of("\n\t "); - return (end == std::string::npos) ? "" : str.substr(0, end + 1); +inline std::string rtrim(const std::string& str) +{ + size_t end = str.find_last_not_of("\n\t "); + return (end == std::string::npos) ? 
"" : str.substr(0, end + 1); } -inline std::string generate_random_string(std::size_t length) { - const std::string characters = - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; +inline std::string generate_random_string(std::size_t length) +{ + const std::string characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - std::random_device rd; - std::mt19937 generator(rd()); + std::random_device rd; + std::mt19937 generator(rd()); - std::uniform_int_distribution<> distribution(0, characters.size() - 1); + std::uniform_int_distribution<> distribution(0, characters.size() - 1); - std::string random_string(length, '\0'); - std::generate_n(random_string.begin(), length, - [&]() { return characters[distribution(generator)]; }); + std::string random_string(length, '\0'); + std::generate_n(random_string.begin(), length, [&]() { return characters[distribution(generator)]; }); - return random_string; + return random_string; } -inline void nitro_logo() { - std::string rainbowColors[] = { - "\033[93m", // Yellow - "\033[94m", // Blue - }; - - std::string resetColor = "\033[0m"; - std::string asciiArt = - " ___ ___ ___ \n" - " /__/ ___ ___ / /\\ / /\\ \n" - " \\ \\:\\ / /\\ / /\\ / /::\\ / /::\\ " - " \n" - " \\ \\:\\ / /:/ / /:/ / /:/\\:\\ / /:/\\:\\ " - " \n" - " _____\\__\\:\\ /__/::\\ / /:/ / /:/ \\:\\ / /:/ " - "\\:\\ \n" - " /__/::::::::\\ \\__\\/\\:\\__ / /::\\ /__/:/ /:/___ /__/:/ " - "\\__\\:\\\n" - " \\ \\:\\~~\\~~\\/ \\ \\:\\/\\ /__/:/\\:\\ \\ \\:\\/:::::/ \\ " - "\\:\\ / /:/\n" - " \\ \\:\\ ~~~ \\__\\::/ \\__\\/ \\:\\ \\ \\::/~~~~ \\ " - "\\:\\ /:/ \n" - " \\ \\:\\ /__/:/ \\ \\:\\ \\ \\:\\ \\ " - "\\:\\/:/ \n" - " \\ \\:\\ \\__\\/ \\__\\/ \\ \\:\\ \\ " - "\\::/ \n" - " \\__\\/ \\__\\/ \\__\\/ " - "\n"; - - int colorIndex = 0; - - for (char c : asciiArt) { - if (c == '\n') { - std::cout << resetColor << c; - colorIndex = 0; - } else { - std::cout << rainbowColors[colorIndex % 2] << c; - colorIndex++; +inline void nitro_logo() +{ + std::string rainbowColors[] = { + "\033[93m", // Yellow + "\033[94m", // Blue + }; + + std::string resetColor = "\033[0m"; + std::string asciiArt + = " ___ ___ ___ \n" + " /__/ ___ ___ / /\\ / /\\ \n" + " \\ \\:\\ / /\\ / /\\ / /::\\ / /::\\ " + " \n" + " \\ \\:\\ / /:/ / /:/ / /:/\\:\\ / /:/\\:\\ " + " \n" + " _____\\__\\:\\ /__/::\\ / /:/ / /:/ \\:\\ / /:/ " + "\\:\\ \n" + " /__/::::::::\\ \\__\\/\\:\\__ / /::\\ /__/:/ /:/___ /__/:/ " + "\\__\\:\\\n" + " \\ \\:\\~~\\~~\\/ \\ \\:\\/\\ /__/:/\\:\\ \\ \\:\\/:::::/ \\ " + "\\:\\ / /:/\n" + " \\ \\:\\ ~~~ \\__\\::/ \\__\\/ \\:\\ \\ \\::/~~~~ \\ " + "\\:\\ /:/ \n" + " \\ \\:\\ /__/:/ \\ \\:\\ \\ \\:\\ \\ " + "\\:\\/:/ \n" + " \\ \\:\\ \\__\\/ \\__\\/ \\ \\:\\ \\ " + "\\::/ \n" + " \\__\\/ \\__\\/ \\__\\/ " + "\n"; + + std::string asciiArtRTX = R"( +------------------------ + ____ ______ __ __ ________ __ +___/ __ \__ __/_ |/ / __ __ \__ | / / +__/ /_/ /_/ / _\ / _/ / / /_ |/ / +_/ _, _/_/ / _/ | / /_/ /_ /| / +/_/ |_| /_/ /_/|_| \____/ /_/ |_/ + +)"; + + int colorIndex = 0; + + for (char c : asciiArt) + { + if (c == '\n') + { + std::cout << resetColor << c; + colorIndex = 0; + } + else + { + std::cout << rainbowColors[colorIndex % 2] << c; + colorIndex++; + } + } + + std::cout << resetColor; // Reset color at the endreturn; + + for (char c : asciiArtRTX) + { + if (c == '\n') + { + std::cout << resetColor << c; + colorIndex = 0; + } + else + { + std::cout << "\033[1;32m" << c; // bright blue + colorIndex++; + } } - } - std::cout << resetColor; // Reset color at the endreturn; + 
std::cout << resetColor; // Reset color at the endreturn; } -inline drogon::HttpResponsePtr nitroHttpResponse() { - auto resp = drogon::HttpResponse::newHttpResponse(); +inline drogon::HttpResponsePtr nitroHttpResponse() +{ + auto resp = drogon::HttpResponse::newHttpResponse(); #ifdef ALLOW_ALL_CORS - LOG_INFO << "Respond for all cors!"; - resp->addHeader("Access-Control-Allow-Origin", "*"); + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); #endif - return resp; + return resp; } -inline drogon::HttpResponsePtr nitroHttpJsonResponse(const Json::Value &data) { - auto resp = drogon::HttpResponse::newHttpJsonResponse(data); +inline drogon::HttpResponsePtr nitroHttpJsonResponse(const Json::Value& data) +{ + auto resp = drogon::HttpResponse::newHttpJsonResponse(data); #ifdef ALLOW_ALL_CORS - LOG_INFO << "Respond for all cors!"; - resp->addHeader("Access-Control-Allow-Origin", "*"); + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); #endif - return resp; + return resp; }; inline drogon::HttpResponsePtr nitroStreamResponse( - const std::function &callback, - const std::string &attachmentFileName = "") { - auto resp = drogon::HttpResponse::newStreamResponse( - callback, attachmentFileName, drogon::CT_NONE, "text/event-stream"); + const std::function& callback, const std::string& attachmentFileName = "") +{ + auto resp + = drogon::HttpResponse::newStreamResponse(callback, attachmentFileName, drogon::CT_NONE, "text/event-stream"); #ifdef ALLOW_ALL_CORS - LOG_INFO << "Respond for all cors!"; - resp->addHeader("Access-Control-Allow-Origin", "*"); + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); #endif - return resp; + return resp; } } // namespace nitro_utils From 514806865bd1dcd6a76b43941f833c69ce58ddbb Mon Sep 17 00:00:00 2001 From: automaticcat Date: Fri, 8 Mar 2024 05:28:21 +0000 Subject: [PATCH 12/33] change logo --- cpp/tensorrt_llm/nitro/utils/nitro_utils.h | 33 +++++++--------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/utils/nitro_utils.h b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h index 628dc5f46b1..5e382bd82fe 100644 --- a/cpp/tensorrt_llm/nitro/utils/nitro_utils.h +++ b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h @@ -193,32 +193,19 @@ inline std::string generate_random_string(std::size_t length) inline void nitro_logo() { std::string rainbowColors[] = { - "\033[93m", // Yellow "\033[94m", // Blue }; std::string resetColor = "\033[0m"; - std::string asciiArt - = " ___ ___ ___ \n" - " /__/ ___ ___ / /\\ / /\\ \n" - " \\ \\:\\ / /\\ / /\\ / /::\\ / /::\\ " - " \n" - " \\ \\:\\ / /:/ / /:/ / /:/\\:\\ / /:/\\:\\ " - " \n" - " _____\\__\\:\\ /__/::\\ / /:/ / /:/ \\:\\ / /:/ " - "\\:\\ \n" - " /__/::::::::\\ \\__\\/\\:\\__ / /::\\ /__/:/ /:/___ /__/:/ " - "\\__\\:\\\n" - " \\ \\:\\~~\\~~\\/ \\ \\:\\/\\ /__/:/\\:\\ \\ \\:\\/:::::/ \\ " - "\\:\\ / /:/\n" - " \\ \\:\\ ~~~ \\__\\::/ \\__\\/ \\:\\ \\ \\::/~~~~ \\ " - "\\:\\ /:/ \n" - " \\ \\:\\ /__/:/ \\ \\:\\ \\ \\:\\ \\ " - "\\:\\/:/ \n" - " \\ \\:\\ \\__\\/ \\__\\/ \\ \\:\\ \\ " - "\\::/ \n" - " \\__\\/ \\__\\/ \\__\\/ " - "\n"; + std::string asciiArt = R"( +███╗ ██╗██╗████████╗██████╗ ██████╗ +████╗ ██║██║╚══██╔══╝██╔══██╗██╔═══██╗ +██╔██╗ ██║██║ ██║ ██████╔╝██║ ██║ +██║╚██╗██║██║ ██║ ██╔══██╗██║ ██║ +██║ ╚████║██║ ██║ ██║ ██║╚██████╔╝ +╚═╝ ╚═══╝╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝ + + )"; std::string asciiArtRTX = R"( ------------------------ @@ -241,7 +228,7 @@ _/ _, _/_/ / _/ | / 
/_/ /_ /| / } else { - std::cout << rainbowColors[colorIndex % 2] << c; + std::cout << "\033[94m" << c; colorIndex++; } } From b88bc63fd1d85e392bf4fbe476c25ad8a57f2bb5 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Fri, 8 Mar 2024 06:57:33 +0000 Subject: [PATCH 13/33] remove redundant test --- cpp/tensorrt_llm/nitro/CMakeLists.txt | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt index 6aac914bbb5..419e62b19cb 100644 --- a/cpp/tensorrt_llm/nitro/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -83,18 +83,5 @@ target_include_directories(nitro PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_sources(nitro PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC}) - - -# test -add_executable(test test.cc) - -target_link_libraries( - test PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece ) - -target_compile_features(test PRIVATE cxx_std_17) -target_compile_definitions(test PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}") -# - - add_dependencies(nitro_proj nitro) From 98bef79a5009c3b6eb975482abc46331eb9b9e54 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Fri, 8 Mar 2024 06:57:51 +0000 Subject: [PATCH 14/33] remove test.cc --- cpp/tensorrt_llm/nitro/test.cc | 181 --------------------------------- 1 file changed, 181 deletions(-) delete mode 100644 cpp/tensorrt_llm/nitro/test.cc diff --git a/cpp/tensorrt_llm/nitro/test.cc b/cpp/tensorrt_llm/nitro/test.cc deleted file mode 100644 index b3e0dd754e6..00000000000 --- a/cpp/tensorrt_llm/nitro/test.cc +++ /dev/null @@ -1,181 +0,0 @@ -#include "sentencepiece_processor.h" -#include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" -#include "tensorrt_llm/plugins/api/tllmPlugin.h" -#include "tensorrt_llm/runtime/gptJsonConfig.h" -#include "tensorrt_llm/runtime/gptSession.h" -#include "tensorrt_llm/runtime/iTensor.h" -#include "tensorrt_llm/runtime/memoryCounters.h" -#include "tensorrt_llm/runtime/tllmLogger.h" -#include "thread" -#include -#include -#include -#include -#include -using namespace tensorrt_llm::runtime; - -namespace tc = tensorrt_llm::common; -namespace trt = nvinfer1; - -class Tokenizer -{ -private: - sentencepiece::SentencePieceProcessor processor; - - void replaceSubstring(std::string& base, const std::string& from, const std::string& to) - { - size_t start_pos = 0; - while ((start_pos = base.find(from, start_pos)) != std::string::npos) - { - base.replace(start_pos, from.length(), to); - start_pos += to.length(); - } - } - -public: - Tokenizer(const std::string& modelPath) - { - auto status = processor.Load(modelPath); - if (!status.ok()) - { - std::cerr << status.ToString() << std::endl; - } - } - - std::string decodeWithSpace(const int id) - { - std::string text = processor.IdToPiece(id); - replaceSubstring(text, "▁", " "); - return text; - } - - std::vector encode(const std::string& input) - { - std::vector ids; - processor.Encode(input, &ids); - return ids; - } -}; - -namespace -{ -void runBenchmark() -{ - Tokenizer nitro_tokenizer("./tokenizer.model"); - std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); - - // Fixed settings - const std::string modelName = "mistral"; - const std::filesystem::path engineDir = "/app/mistral_engine_2/"; - const int batchSize = 1; - const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 500}; // input_length, output_length - - // Logger setup - auto logger = std::make_shared(); - 
logger->setLevel(nvinfer1::ILogger::Severity::kINFO); - - initTrtLlmPlugins(logger.get()); - - // Load model configuration - std::filesystem::path jsonFileName = engineDir / "config.json"; - auto const json = GptJsonConfig::parse(jsonFileName); - auto const modelConfig = json.getModelConfig(); - auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); - auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); - auto const dtype = modelConfig.getDataType(); - - GptSession::Config sessionConfig{1, 1, 1}; - sessionConfig.maxBatchSize = batchSize; - sessionConfig.maxBeamWidth = 4; // Fixed for simplicity - sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1]; - sessionConfig.cudaGraphMode = false; // Fixed for simplicity - - SamplingConfig samplingConfig{1}; // Fixed for simplicity - samplingConfig.temperature = std::vector{0.0f}; - samplingConfig.randomSeed = std::vector{static_cast(42ull)}; - samplingConfig.topK = std::vector{40}; - samplingConfig.topP = std::vector{0.0f}; - samplingConfig.minLength = std::vector{inOutLen[1]}; - samplingConfig.repetitionPenalty = std::vector{1.3f}; - - // Initialize session - GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; - // Generate random input IDs within the model's vocabulary range - std::vector inputIdsHost = text_input; - - std::cout << "Start Nitro testing session: " << std::endl; - // Input preparation - auto& bufferManager = session.getBufferManager(); - GenerationInput::TensorPtr inputIds - = bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); - - std::vector inputLengthsHost(batchSize, inOutLen[0]); - GenerationInput::TensorPtr inputLengths - = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); - - bool inputPacked = modelConfig.usePackedInput(); - - GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked}; - - GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), - bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; - // Define the callback to stream each generated token - generationOutput.onTokenGenerated = [&bufferManager, inOutLen, &nitro_tokenizer, &generationOutput]( - GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) - { - if (!finished) - { - // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens - int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape - // Copy output IDs from GPU to host for printing - std::vector outputIdsHost(outputLength); - bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); - // Find the last non-zero value in the output IDs starting from the end of the input sequence - int lastNonZeroIndex = -1; - for (int i = outputLength - 1; i >= inOutLen[0]; --i) - { - if (outputIdsHost[i] != 0) - { - lastNonZeroIndex = i; - break; // Stop at the first non-zero token found from the end - } - } - - // Directly print the last non-zero value if found, without using 'step' - if (lastNonZeroIndex != -1) - { - int outTok = outputIdsHost[lastNonZeroIndex]; - if (outTok == 13) - { - std::cout << "\n"; - } - else - { - std::cout << nitro_tokenizer.decodeWithSpace(outTok); - } - } - } - }; - - session.generate(generationOutput, generationInput, samplingConfig); - bufferManager.getStream().synchronize(); 
-} - -} // namespace - -int main() -{ - try - { - runBenchmark(); - std::this_thread::sleep_for(std::chrono::seconds(10)); - } - catch (const std::exception& e) - { - std::cerr << "Error: " << e.what() << std::endl; - return 1; - } - return 0; -} From 0500a5ee2cfbed0178a26bbf112499a259007141 Mon Sep 17 00:00:00 2001 From: hiro Date: Fri, 8 Mar 2024 15:05:08 +0700 Subject: [PATCH 15/33] Add Dockerfile and update cmakelist --- BUILD_ENGINE_MODEL.md | 0 BUILD_NITRO.md | 0 Dockerfile.nitro.windows | 237 +++++++++++++++++++++++++ Dockerfile.tensorrt-llm-python.windows | 233 ++++++++++++++++++++++++ cpp/tensorrt_llm/nitro/CMakeLists.txt | 8 +- 5 files changed, 477 insertions(+), 1 deletion(-) create mode 100644 BUILD_ENGINE_MODEL.md create mode 100644 BUILD_NITRO.md create mode 100644 Dockerfile.nitro.windows create mode 100644 Dockerfile.tensorrt-llm-python.windows diff --git a/BUILD_ENGINE_MODEL.md b/BUILD_ENGINE_MODEL.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/BUILD_NITRO.md b/BUILD_NITRO.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/Dockerfile.nitro.windows b/Dockerfile.nitro.windows new file mode 100644 index 00000000000..0503267a19f --- /dev/null +++ b/Dockerfile.nitro.windows @@ -0,0 +1,237 @@ +# Use the Windows Server Core 2019 image. +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 + +# Use the Windows Server Core 2019 image. +FROM mcr.microsoft.com/windows/servercore:ltsc2019 + +# Restore the default Windows shell for correct batch processing. +# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. 
+# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl.exe -SL --output vs_buildtools.exe https://delta.jan.ai/windows-container-dependencies/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. + && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... 
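+# (The Chocolatey install script is assumed to honor the chocolateyVersion
+# environment variable, which is what the ENV pin below relies on.)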
+ +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX + +#RUN powershell -Command \ +# $ErrorActionPreference = 'Stop'; \ +# curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ +# --output "cuda_11_installer.exe"; \ +# Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \ +# Remove-Item cuda_11_installer.exe -Force + +# The above command-line installation method installs NVTX headers at +# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\ +# CMake can't find this location for some reason. +# Instead, we just copy the older NvToolsExt version to where CMake expects. +# This assumes NvToolsExt was installed on the host machine using the +# CUDA 11.8 GUI installer and copied to the build context +COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] + +# ----------------------------------------------------------------------------- + +# Create a working directory +WORKDIR "C:\\\\workspace" + +# ----------------------------------------------------------------------------- + +# Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/tensorrt-9.2.0.5.windows10.x86_64.cuda-12.2.llm.beta.zip \ + --output TensorRT-9.2.0.5.zip; \ + Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \ + Remove-Item TensorRT-9.2.0.5.zip -Force + +# Add TensorRT libs to Path +RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib" + +# Install TensorRT Python wheel +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl + +# ----------------------------------------------------------------------------- + +# Copy cuDNN into the working directory +# This assumes cuDNN exists on the host machine in the build context +COPY ["cuDNN", "cuDNN"] + +# Add cuDNN libs and bin to Path. +RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" + +# ----------------------------------------------------------------------------- + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+# -----------------------------------------------------------------------------
+
+# Additional dependencies to build Nitro
+
+# The commands below let MSVC recognize the CUDA compiler
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations'
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations'
+
+
+# Set git safe directory for nitro clone dependencies
+RUN powershell -Command \
+    git config --global --add safe.directory '*'
+
+# Package for nitro compile
+RUN powershell -Command \
+    choco install pkgconfiglite -y
+
+RUN powershell -Command \
+    choco install Ninja -y
+
+# Requirements to build tensorrt-llm on windows
+COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt
+COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt
+RUN powershell -Command \
+    cd tensorrt-llm-nitro; \
+    pip install --no-cache-dir -r .\requirements-dev-windows.txt
+
+COPY ./.git ./tensorrt-llm-nitro/.git
+
+COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty
+
+COPY ./cpp ./tensorrt-llm-nitro/cpp
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
diff --git a/Dockerfile.tensorrt-llm-python.windows b/Dockerfile.tensorrt-llm-python.windows
new file mode 100644
index 00000000000..ee61239d001
--- /dev/null
+++ b/Dockerfile.tensorrt-llm-python.windows
@@ -0,0 +1,233 @@
+# Use the Windows Server Core 2019 image.
+# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022
+
+# Use the Windows Server Core 2019 image.
+FROM mcr.microsoft.com/windows/servercore:ltsc2019
+
+# Restore the default Windows shell for correct batch processing.
+# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. +# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl.exe -SL --output vs_buildtools.exe https://delta.jan.ai/windows-container-dependencies/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. 
+ && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... + +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX + +#RUN powershell -Command \ +# $ErrorActionPreference = 'Stop'; \ +# curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ +# --output "cuda_11_installer.exe"; \ +# Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \ +# Remove-Item cuda_11_installer.exe -Force + +# The above command-line installation method installs NVTX headers at +# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\ +# CMake can't find this location for some reason. +# Instead, we just copy the older NvToolsExt version to where CMake expects. 
+# This assumes NvToolsExt was installed on the host machine using the
+# CUDA 11.8 GUI installer and copied to the build context
+COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"]
+
+# -----------------------------------------------------------------------------
+
+# Create a working directory
+WORKDIR "C:\\\\workspace"
+
+# -----------------------------------------------------------------------------
+
+# Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    curl.exe https://delta.jan.ai/windows-container-dependencies/tensorrt-9.2.0.5.windows10.x86_64.cuda-12.2.llm.beta.zip \
+    --output TensorRT-9.2.0.5.zip; \
+    Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \
+    Remove-Item TensorRT-9.2.0.5.zip -Force
+
+# Add TensorRT libs to Path
+RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib"
+
+# Install TensorRT Python wheel
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl
+
+# -----------------------------------------------------------------------------
+
+# Copy cuDNN into the working directory
+# This assumes cuDNN exists on the host machine in the build context
+COPY ["cuDNN", "cuDNN"]
+
+# Add cuDNN libs and bin to Path.
+RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;"
+
+# -----------------------------------------------------------------------------
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+# -----------------------------------------------------------------------------
+
+# Additional dependencies to build Nitro
+
+# The commands below let MSVC recognize the CUDA compiler
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations'
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations'
+
+
+# Set git safe directory for nitro clone dependencies
+RUN powershell -Command \
+    git config --global --add safe.directory '*'
+
+# Package for nitro compile
+RUN powershell -Command \
+    choco install pkgconfiglite -y
+
+RUN powershell -Command \
+    choco install Ninja -y
+
+# Requirements to build tensorrt-llm on windows
+COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt
+COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt
+RUN powershell -Command \
+    cd tensorrt-llm-nitro; \
+    pip install --no-cache-dir -r .\requirements-dev-windows.txt
+
+COPY . ./tensorrt-llm-nitro/
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt index 419e62b19cb..a0667eba820 100644 --- a/cpp/tensorrt_llm/nitro/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -47,8 +47,14 @@ find_package(PkgConfig REQUIRED) find_package(Drogon CONFIG REQUIRED) # Use pkg-config to find the SentencePiece library -pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece) +if(NOT WIN32) # Linux + # Use pkg-config to find the SentencePiece library + pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece) +else() # Windows + set(SENTENCEPIECE_INCLUDE_DIRS "C:/workspace/tensorrt-llm-nitro/cpp/tensorrt_llm/nitro/build_deps/_install/include") + set(SENTENCEPIECE_LIBRARY_DIRS "C:/workspace/tensorrt-llm-nitro/cpp/tensorrt_llm/nitro/build_deps/_install/lib") +endif() include_directories(${PROJECT_SOURCE_DIR}/include ${SENTENCEPIECE_INCLUDE_DIRS}) From 7819d27f1b51d26c73d8aa224763c1312f909286 Mon Sep 17 00:00:00 2001 From: Hien To Date: Sat, 9 Mar 2024 16:52:43 +0700 Subject: [PATCH 16/33] Add Dockerfile for github action runner to build tensorrt llm --- .github/runners/Dockerfile.windows.runner | 256 ++++++++++++++++++++++ .github/runners/runner.ps1 | 2 + Dockerfile.nitro.windows | 56 +++-- 3 files changed, 295 insertions(+), 19 deletions(-) create mode 100644 .github/runners/Dockerfile.windows.runner create mode 100644 .github/runners/runner.ps1 diff --git a/.github/runners/Dockerfile.windows.runner b/.github/runners/Dockerfile.windows.runner new file mode 100644 index 00000000000..ac462d37a58 --- /dev/null +++ b/.github/runners/Dockerfile.windows.runner @@ -0,0 +1,256 @@ +# Use the Windows Server Core 2019 image. +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 + +# Use the Windows Server Core 2019 image. +FROM mcr.microsoft.com/windows/servercore:ltsc2019 + +# Restore the default Windows shell for correct batch processing. 
+# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. +# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. 
+ && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... + +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX + +#RUN powershell -Command \ +# $ErrorActionPreference = 'Stop'; \ +# curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ +# --output "cuda_11_installer.exe"; \ +# Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \ +# Remove-Item cuda_11_installer.exe -Force + +# The above command-line installation method installs NVTX headers at +# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\ +# CMake can't find this location for some reason. +# Instead, we just copy the older NvToolsExt version to where CMake expects. 
+# This assumes NvToolsExt was installed on the host machine using the +# CUDA 11.8 GUI installer and copied to the build context + +# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \ + --output NvToolsExt.zip; \ + Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \ + Remove-Item NvToolsExt.zip -Force + +# ----------------------------------------------------------------------------- + +# Create a working directory +WORKDIR "C:\\\\workspace" + +# ----------------------------------------------------------------------------- + +# Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \ + --output TensorRT-9.2.0.5.zip; \ + Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \ + Remove-Item TensorRT-9.2.0.5.zip -Force + +# Add TensorRT libs to Path +RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib" + +# Install TensorRT Python wheel +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl + +# ----------------------------------------------------------------------------- + +# Copy cuDNN into the working directory +# This assumes cuDNN exists on the host machine in the build context +# COPY ["cuDNN", "cuDNN"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \ + --output cuDNN.zip; \ + Expand-Archive .\cuDNN.zip -DestinationPath .; \ + Remove-Item cuDNN.zip -Force + +# Add cuDNN libs and bin to Path. +RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" + +# ----------------------------------------------------------------------------- + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+# -----------------------------------------------------------------------------
+
+# Additional dependencies to build Nitro
+
+# The commands below let MSVC recognize the CUDA compiler
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations'
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations'
+
+
+# Set git safe directory for nitro clone dependencies
+RUN powershell -Command \
+    git config --global --add safe.directory '*'
+
+# Package for nitro compile
+RUN powershell -Command \
+    choco install pkgconfiglite --allow-empty-checksums -y
+
+RUN powershell -Command \
+    choco install Ninja -y
+
+RUN choco install 7zip -y; \
+    7z --help
+
+# Requirements to build tensorrt-llm on windows
+ARG RUNNER_VERSION=2.314.1
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Invoke-WebRequest \
+    -Uri https://github.com/actions/runner/releases/download/v$env:RUNNER_VERSION/actions-runner-win-x64-$env:RUNNER_VERSION.zip \
+    -OutFile runner.zip; \
+    Expand-Archive -Path ./runner.zip -DestinationPath ./actions-runner; \
+    Remove-Item -Path .\runner.zip; \
+    setx /M PATH $(${Env:PATH} + \";${Env:ProgramFiles}\Git\bin\")
+
+ADD runner.ps1 ./runner.ps1
+
+CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"]
\ No newline at end of file
diff --git a/.github/runners/runner.ps1 b/.github/runners/runner.ps1
new file mode 100644
index 00000000000..a08f3725bf1
--- /dev/null
+++ b/.github/runners/runner.ps1
@@ -0,0 +1,2 @@
+.\actions-runner\config.cmd --unattended --replace --url https://github.com/${env:RUNNER_REPO} --pat $env:RUNNER_PAT --runnergroup $env:RUNNER_GROUP --labels $env:RUNNER_LABELS --name $env:RUNNER_NAME --work $env:RUNNER_WORKDIR;
+.\actions-runner\run.cmd;
\ No newline at end of file
diff --git a/Dockerfile.nitro.windows b/Dockerfile.nitro.windows
index 0503267a19f..616f4b8283f 100644
--- a/Dockerfile.nitro.windows
+++ b/Dockerfile.nitro.windows
@@ -14,7 +14,7 @@ SHELL ["cmd", "/S", "/C"]
 
 RUN powershell -Command \
     $ErrorActionPreference = 'Stop'; \
-    curl.exe https://delta.jan.ai/windows-container-dependencies/cuda_12.2.2_537.13_windows.exe \
+    curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \
     --output "cuda_installer.exe"; \
     Start-Process
cuda_installer.exe -Wait -ArgumentList '-s'; \ Remove-Item cuda_installer.exe -Force @@ -26,7 +26,7 @@ RUN powershell -Command \ # Download and install Python RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ Remove-Item python-3.10.11.exe -Force @@ -44,7 +44,7 @@ RUN powershell -Command \ # We use 10.1.1 which has a release on the GitHub page RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisetup.exe \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ --output "msmpisetup.exe"; \ Start-Process .\msmpisetup.exe -Wait ; \ Remove-Item msmpisetup.exe -Force @@ -55,7 +55,7 @@ RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" # Download the MSMPI SDK RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisdk.msi \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ --output "msmpisdk.msi"; \ Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ Remove-Item msmpisdk.msi -Force @@ -66,7 +66,7 @@ RUN powershell -Command \ RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/cmake-3.27.7-windows-x86_64.msi \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ --output "cmake.msi"; \ Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ Remove-Item cmake.msi -Force @@ -80,7 +80,7 @@ RUN setx Path "%Path%;C:\Program Files\CMake\bin" RUN \ # Download the Build Tools bootstrapper. - curl.exe -SL --output vs_buildtools.exe https://delta.jan.ai/windows-container-dependencies/vs_buildtools.exe \ + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ \ # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ @@ -103,7 +103,7 @@ RUN \ RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/gvim90.exe \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ --output "install_vim.exe"; \ Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ Remove-Item install_vim.exe -Force @@ -152,7 +152,14 @@ RUN powershell -Command \ # Instead, we just copy the older NvToolsExt version to where CMake expects. 
# This assumes NvToolsExt was installed on the host machine using the # CUDA 11.8 GUI installer and copied to the build context -COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] + +# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \ + --output NvToolsExt.zip; \ + Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \ + Remove-Item NvToolsExt.zip -Force # ----------------------------------------------------------------------------- @@ -164,7 +171,7 @@ WORKDIR "C:\\\\workspace" # Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/tensorrt-9.2.0.5.windows10.x86_64.cuda-12.2.llm.beta.zip \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \ --output TensorRT-9.2.0.5.zip; \ Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \ Remove-Item TensorRT-9.2.0.5.zip -Force @@ -181,7 +188,13 @@ RUN powershell -Command \ # Copy cuDNN into the working directory # This assumes cuDNN exists on the host machine in the build context -COPY ["cuDNN", "cuDNN"] +# COPY ["cuDNN", "cuDNN"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \ + --output cuDNN.zip; \ + Expand-Archive .\cuDNN.zip -DestinationPath .; \ + Remove-Item cuDNN.zip -Force # Add cuDNN libs and bin to Path. RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" @@ -213,23 +226,28 @@ RUN powershell -Command \ # Package for nitro compile RUN powershell -Command \ - choco install pkgconfiglite -y + choco install pkgconfiglite --allow-empty-checksums -y RUN powershell -Command \ choco install Ninja -y +RUN choco install 7zip -y; \ + 7z --help + # Requirements to build tensorrt-llm on windows -COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt -COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt -RUN powershell -Command \ - cd tensorrt-llm-nitro; \ - pip install --no-cache-dir -r .\requirements-dev-windows.txt +# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt +# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt +# RUN powershell -Command \ +# cd tensorrt-llm-nitro; \ +# pip install --no-cache-dir -r .\requirements-dev-windows.txt + +# COPY ./.git ./tensorrt-llm-nitro/.git -COPY ./.git ./tensorrt-llm-nitro/.git +# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty -COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty +# COPY ./cpp ./tensorrt-llm-nitro/cpp -COPY ./cpp ./tensorrt-llm-nitro/cpp +COPY . ./nitro-tensort-llm # Define the entry point for the docker container. # This entry point launches the 64-bit PowerShell developer shell. 
From b0051157a76929b71bf964363358e33a7649a61f Mon Sep 17 00:00:00 2001 From: Hien To Date: Sun, 10 Mar 2024 21:09:31 +0700 Subject: [PATCH 17/33] Correct SENTENCEPIECE path nitro cmakelist --- Dockerfile.nitro.windows | 21 ++++++++++++++++----- cpp/tensorrt_llm/nitro/CMakeLists.txt | 4 ++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/Dockerfile.nitro.windows b/Dockerfile.nitro.windows index 616f4b8283f..5dcbcde66ae 100644 --- a/Dockerfile.nitro.windows +++ b/Dockerfile.nitro.windows @@ -247,9 +247,20 @@ RUN choco install 7zip -y; \ # COPY ./cpp ./tensorrt-llm-nitro/cpp -COPY . ./nitro-tensort-llm +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + git clone https://github.com/janhq/nitro-tensorrt-llm.git; \ + cd nitro-tensorrt-llm; \ + git checkout tensorrt-llm-nitro-rel; \ + git submodule update --init --recursive; \ + pip install --no-cache-dir -r .\requirements-dev-windows.txt; \ + cd cpp/tensorrt_llm/nitro; \ + cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \ + cmake --build ./build_deps/nitro_deps --config Release -# Define the entry point for the docker container. -# This entry point launches the 64-bit PowerShell developer shell. -# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA -# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] +RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools" + +RUN VsDevCmd.bat -arch=amd64 && \ + powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + +# # ----------------------------------------------------------------------------- \ No newline at end of file diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt index a0667eba820..cf13d40f4c7 100644 --- a/cpp/tensorrt_llm/nitro/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -52,8 +52,8 @@ if(NOT WIN32) # Linux # Use pkg-config to find the SentencePiece library pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece) else() # Windows - set(SENTENCEPIECE_INCLUDE_DIRS "C:/workspace/tensorrt-llm-nitro/cpp/tensorrt_llm/nitro/build_deps/_install/include") - set(SENTENCEPIECE_LIBRARY_DIRS "C:/workspace/tensorrt-llm-nitro/cpp/tensorrt_llm/nitro/build_deps/_install/lib") + set(SENTENCEPIECE_INCLUDE_DIRS "${CMAKE_PREFIX_PATH}/include") + set(SENTENCEPIECE_LIBRARY_DIRS "${CMAKE_PREFIX_PATH}/lib") endif() include_directories(${PROJECT_SOURCE_DIR}/include ${SENTENCEPIECE_INCLUDE_DIRS}) From 89ca0579a385d75299b39728c7cef71ee743bf08 Mon Sep 17 00:00:00 2001 From: Hien To Date: Sun, 10 Mar 2024 21:31:48 +0700 Subject: [PATCH 18/33] Separate dockerfile for ada and ampere arch --- ...ws.runner => Dockerfile.window.runner-ada} | 31 +++++++ .../runners/Dockerfile.window.runner-ampere | 86 +++++++++++++++---- 2 files changed, 101 insertions(+), 16 deletions(-) rename .github/runners/{Dockerfile.windows.runner => Dockerfile.window.runner-ada} (89%) rename Dockerfile.tensorrt-llm-python.windows => .github/runners/Dockerfile.window.runner-ampere (72%) diff --git a/.github/runners/Dockerfile.windows.runner b/.github/runners/Dockerfile.window.runner-ada similarity index 89% rename from .github/runners/Dockerfile.windows.runner rename to 
.github/runners/Dockerfile.window.runner-ada index ac462d37a58..f5f9d5c3ffe 100644 --- a/.github/runners/Dockerfile.windows.runner +++ b/.github/runners/Dockerfile.window.runner-ada @@ -234,6 +234,37 @@ RUN powershell -Command \ RUN choco install 7zip -y; \ 7z --help +# Requirements to build tensorrt-llm on windows +# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt +# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt +# RUN powershell -Command \ +# cd tensorrt-llm-nitro; \ +# pip install --no-cache-dir -r .\requirements-dev-windows.txt + +# COPY ./.git ./tensorrt-llm-nitro/.git + +# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty + +# COPY ./cpp ./tensorrt-llm-nitro/cpp + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + git clone https://github.com/janhq/nitro-tensorrt-llm.git; \ + cd nitro-tensorrt-llm; \ + git checkout tensorrt-llm-nitro-rel; \ + git submodule update --init --recursive; \ + pip install --no-cache-dir -r .\requirements-dev-windows.txt; \ + cd cpp/tensorrt_llm/nitro; \ + cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \ + cmake --build ./build_deps/nitro_deps --config Release + +RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools" + +RUN VsDevCmd.bat -arch=amd64 && \ + powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '89-real;90-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + +# # ----------------------------------------------------------------------------- + # Requirements to build tensorrt-llm on windows ARG RUNNER_VERSION=2.314.1 diff --git a/Dockerfile.tensorrt-llm-python.windows b/.github/runners/Dockerfile.window.runner-ampere similarity index 72% rename from Dockerfile.tensorrt-llm-python.windows rename to .github/runners/Dockerfile.window.runner-ampere index ee61239d001..e957b97ad80 100644 --- a/Dockerfile.tensorrt-llm-python.windows +++ b/.github/runners/Dockerfile.window.runner-ampere @@ -14,7 +14,7 @@ SHELL ["cmd", "/S", "/C"] RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/cuda_12.2.2_537.13_windows.exe \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \ --output "cuda_installer.exe"; \ Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ Remove-Item cuda_installer.exe -Force @@ -26,7 +26,7 @@ RUN powershell -Command \ # Download and install Python RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ Remove-Item python-3.10.11.exe -Force @@ -44,7 +44,7 @@ RUN powershell -Command \ # We use 10.1.1 which has a release on the GitHub page RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisetup.exe \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ --output "msmpisetup.exe"; \ Start-Process .\msmpisetup.exe -Wait ; \ Remove-Item msmpisetup.exe -Force @@ -55,7 +55,7 @@ RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" # Download the MSMPI SDK RUN powershell -Command \ 
$ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisdk.msi \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ --output "msmpisdk.msi"; \ Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ Remove-Item msmpisdk.msi -Force @@ -66,7 +66,7 @@ RUN powershell -Command \ RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/cmake-3.27.7-windows-x86_64.msi \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ --output "cmake.msi"; \ Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ Remove-Item cmake.msi -Force @@ -80,7 +80,7 @@ RUN setx Path "%Path%;C:\Program Files\CMake\bin" RUN \ # Download the Build Tools bootstrapper. - curl.exe -SL --output vs_buildtools.exe https://delta.jan.ai/windows-container-dependencies/vs_buildtools.exe \ + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ \ # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ @@ -103,7 +103,7 @@ RUN \ RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/gvim90.exe \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ --output "install_vim.exe"; \ Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ Remove-Item install_vim.exe -Force @@ -152,7 +152,14 @@ RUN powershell -Command \ # Instead, we just copy the older NvToolsExt version to where CMake expects. # This assumes NvToolsExt was installed on the host machine using the # CUDA 11.8 GUI installer and copied to the build context -COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] + +# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \ + --output NvToolsExt.zip; \ + Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \ + Remove-Item NvToolsExt.zip -Force # ----------------------------------------------------------------------------- @@ -164,7 +171,7 @@ WORKDIR "C:\\\\workspace" # Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/tensorrt-9.2.0.5.windows10.x86_64.cuda-12.2.llm.beta.zip \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \ --output TensorRT-9.2.0.5.zip; \ Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \ Remove-Item TensorRT-9.2.0.5.zip -Force @@ -181,7 +188,13 @@ RUN powershell -Command \ # Copy cuDNN into the working directory # This assumes cuDNN exists on the host machine in the build context -COPY ["cuDNN", "cuDNN"] +# COPY ["cuDNN", "cuDNN"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \ + --output cuDNN.zip; \ + Expand-Archive .\cuDNN.zip -DestinationPath .; \ + Remove-Item cuDNN.zip -Force # Add cuDNN libs and bin to Path. 
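+# (Note: setx persists the machine-level Path, so the updated value is assumed
+# to be visible to later RUN layers rather than to the current shell.)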
RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" @@ -213,21 +226,62 @@ RUN powershell -Command \ # Package for nitro compile RUN powershell -Command \ - choco install pkgconfiglite -y + choco install pkgconfiglite --allow-empty-checksums -y RUN powershell -Command \ choco install Ninja -y +RUN choco install 7zip -y; \ + 7z --help + # Requirements to build tensorrt-llm on windows -COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt -COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt +# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt +# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt +# RUN powershell -Command \ +# cd tensorrt-llm-nitro; \ +# pip install --no-cache-dir -r .\requirements-dev-windows.txt + +# COPY ./.git ./tensorrt-llm-nitro/.git + +# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty + +# COPY ./cpp ./tensorrt-llm-nitro/cpp + RUN powershell -Command \ - cd tensorrt-llm-nitro; \ - pip install --no-cache-dir -r .\requirements-dev-windows.txt + $ErrorActionPreference = 'Stop'; \ + git clone https://github.com/janhq/nitro-tensorrt-llm.git; \ + cd nitro-tensorrt-llm; \ + git checkout tensorrt-llm-nitro-rel; \ + git submodule update --init --recursive; \ + pip install --no-cache-dir -r .\requirements-dev-windows.txt; \ + cd cpp/tensorrt_llm/nitro; \ + cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \ + cmake --build ./build_deps/nitro_deps --config Release + +RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools" + +RUN VsDevCmd.bat -arch=amd64 && \ + powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" -COPY . ./tensorrt-llm-nitro/ +# # ----------------------------------------------------------------------------- + +# Requirements to build tensorrt-llm on windows +ARG RUNNER_VERSION=2.314.1 # Define the entry point for the docker container. # This entry point launches the 64-bit PowerShell developer shell. 
# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA # ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Invoke-WebRequest \ + -Uri https://github.com/actions/runner/releases/download/v$env:RUNNER_VERSION/actions-runner-win-x64-$env:RUNNER_VERSION.zip \ + -OutFile runner.zip; \ + Expand-Archive -Path ./runner.zip -DestinationPath ./actions-runner; \ + Remove-Item -Path .\runner.zip; \ + setx /M PATH $(${Env:PATH} + \";${Env:ProgramFiles}\Git\bin\") + +ADD runner.ps1 ./runner.ps1 + +CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file From 85e6bb3229ff6eb1aad6ec60622985737ff475b0 Mon Sep 17 00:00:00 2001 From: Hien To Date: Sun, 10 Mar 2024 22:44:49 +0700 Subject: [PATCH 19/33] Add CI for nitro tensorrt-llm windows ampere --- .github/workflows/windows-build.yml | 79 +++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 .github/workflows/windows-build.yml diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml new file mode 100644 index 00000000000..bd494b84e5f --- /dev/null +++ b/.github/workflows/windows-build.yml @@ -0,0 +1,79 @@ +name: Build for Windows +on: + push: + branches: + - tensorrt-llm-nitro-rel + +jobs: + windows-ampere-build: + runs-on: windows-nitro-tensorrt-llm-ampere + permissions: + contents: write + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: remove existing build folder + shell: powershell + run: | + Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue + Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue + Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue + Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue + + - name: Copy build cache `build` + shell: powershell + continue-on-error: true + run: | + robocopy 'C:\workspace\nitro-tensorrt-llm\build' '.' /E + + - name: Copy build cache `cpp build` + shell: powershell + continue-on-error: true + run: | + robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\build' '.\cpp' /E + + - name: Copy build cache `nitro build_deps` + shell: powershell + continue-on-error: true + run: | + robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E + + - name: Build Python + shell: powershell + run: | + VsDevCmd.bat -arch=amd64 && powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + + - name: Upload Artifact + uses: actions/upload-artifact@v2 + with: + name: python-wheel + path: ./build + + - name: Build nitro + shell: powershell + run: | + VsDevCmd.bat -arch=amd64 && powershell.exe -NoLogo -ExecutionPolicy Bypass 'cd cpp\build; + cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja; + cmake --build . 
--parallel 2 --config Release' + + - name: create nitro artifact with dll file + shell: powershell + run: | + mkdir build_nitro + cp .\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro + cp .\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro + cp .\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro + cp .\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro + cp .\C:\workspace\cuDNN\cudnn_ops_infer64_8.dll .\build_nitro + cp .\C:\workspace\cuDNN\cudnn64_8.dll .\build_nitro + ls .\build_nitro + + - name: Upload Artifact + uses: actions/upload-artifact@v2 + with: + name: nitro-tensorrt-llm-windows-ampere + path: ./build_nitro \ No newline at end of file From d45051bff524476a7ee3d23d6c1f03dd273ed5c1 Mon Sep 17 00:00:00 2001 From: Hien To Date: Sun, 10 Mar 2024 23:29:35 +0700 Subject: [PATCH 20/33] Correct build script --- .github/workflows/windows-build.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index bd494b84e5f..3f545d71526 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -15,6 +15,7 @@ jobs: uses: actions/checkout@v3 with: submodules: recursive + lfs: true - name: remove existing build folder shell: powershell @@ -45,7 +46,7 @@ jobs: - name: Build Python shell: powershell run: | - VsDevCmd.bat -arch=amd64 && powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\' - name: Upload Artifact uses: actions/upload-artifact@v2 @@ -56,8 +57,8 @@ jobs: - name: Build nitro shell: powershell run: | - VsDevCmd.bat -arch=amd64 && powershell.exe -NoLogo -ExecutionPolicy Bypass 'cd cpp\build; - cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja; + cd cpp\build + cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja cmake --build . 
--parallel 2 --config Release' - name: create nitro artifact with dll file From 048735c42f10839f1d70dc9f65ba38a959cba8c6 Mon Sep 17 00:00:00 2001 From: Hien To Date: Sun, 10 Mar 2024 23:53:30 +0700 Subject: [PATCH 21/33] Install nitro_deps instead of using cache --- .github/workflows/windows-build.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 3f545d71526..c5858402588 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -21,9 +21,7 @@ jobs: shell: powershell run: | Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue - name: Copy build cache `build` shell: powershell @@ -37,12 +35,13 @@ jobs: run: | robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\build' '.\cpp' /E - - name: Copy build cache `nitro build_deps` + - name: install nitro deps shell: powershell - continue-on-error: true run: | - robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E - + cd cpp\tensorrt_llm\nitro + cmake -S ./nitro_deps -B ./build_deps/nitro_deps + cmake --build ./build_deps/nitro_deps --config Release + - name: Build Python shell: powershell run: | @@ -59,7 +58,7 @@ jobs: run: | cd cpp\build cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja - cmake --build . --parallel 2 --config Release' + cmake --build . 
--parallel 2 --config Release - name: create nitro artifact with dll file shell: powershell From 13675e07734b8d08cc5203b166362578bfccf97c Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 00:52:14 +0700 Subject: [PATCH 22/33] nitro deps build using cache --- .github/workflows/windows-build.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index c5858402588..7cebd229af2 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -21,7 +21,9 @@ jobs: shell: powershell run: | Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue + Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue + Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue - name: Copy build cache `build` shell: powershell @@ -35,6 +37,13 @@ jobs: run: | robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\build' '.\cpp' /E + - name: Copy build cache `nitro build_deps` + shell: powershell + continue-on-error: true + run: | + robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E + + - name: install nitro deps shell: powershell run: | From 3aa1e50b71eabb214c27a5401f4517db61717268 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 02:12:02 +0700 Subject: [PATCH 23/33] Fix error Longpath on windows --- .github/runners/Dockerfile.window.runner-ada | 2 ++ .github/runners/Dockerfile.window.runner-ampere | 2 ++ .github/workflows/windows-build.yml | 12 +++++------- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.github/runners/Dockerfile.window.runner-ada b/.github/runners/Dockerfile.window.runner-ada index f5f9d5c3ffe..b921ad43b00 100644 --- a/.github/runners/Dockerfile.window.runner-ada +++ b/.github/runners/Dockerfile.window.runner-ada @@ -284,4 +284,6 @@ RUN powershell -Command \ ADD runner.ps1 ./runner.ps1 +RUN New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/runners/Dockerfile.window.runner-ampere b/.github/runners/Dockerfile.window.runner-ampere index e957b97ad80..9d88fe168cf 100644 --- a/.github/runners/Dockerfile.window.runner-ampere +++ b/.github/runners/Dockerfile.window.runner-ampere @@ -284,4 +284,6 @@ RUN powershell -Command \ ADD runner.ps1 ./runner.ps1 +RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 7cebd229af2..ba472fd2b09 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -23,7 +23,6 @@ jobs: Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue - name: Copy build cache `build` shell: powershell @@ -42,19 +41,18 @@ jobs: continue-on-error: true run: 
| robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E - - name: install nitro deps shell: powershell run: | cd cpp\tensorrt_llm\nitro - cmake -S ./nitro_deps -B ./build_deps/nitro_deps - cmake --build ./build_deps/nitro_deps --config Release + powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps + powershell -Command cmake --build ./build_deps/nitro_deps --config Release - name: Build Python shell: powershell run: | - python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\' + powershell -Command python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\' - name: Upload Artifact uses: actions/upload-artifact@v2 @@ -66,8 +64,8 @@ jobs: shell: powershell run: | cd cpp\build - cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja - cmake --build . --parallel 2 --config Release + powershell -Command cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja + powershell -Command cmake --build . --parallel 2 --config Release - name: create nitro artifact with dll file shell: powershell From d746a4965e15d48c71e13f6ad8784fe5d9029b69 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 02:57:12 +0700 Subject: [PATCH 24/33] Fix error build nitro deps --- .github/workflows/windows-build.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index ba472fd2b09..55663cb2f88 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -42,12 +42,13 @@ jobs: run: | robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E - - name: install nitro deps - shell: powershell - run: | - cd cpp\tensorrt_llm\nitro - powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps - powershell -Command cmake --build ./build_deps/nitro_deps --config Release + - uses: nick-fields/retry@v3 + with: + timeout_minutes: 45 + max_attempts: 3 + shell: powershell + command: | + cd cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release - name: Build Python shell: powershell From b63c8e1767bdaca8c89c9bfa559611297819eb55 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 14:05:03 +0700 Subject: [PATCH 25/33] nitro build_deps change to use bash --- .github/runners/Dockerfile.window.runner-ada | 6 ++++-- .github/runners/Dockerfile.window.runner-ampere | 2 ++ .github/workflows/windows-build.yml | 17 +++++++++-------- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/runners/Dockerfile.window.runner-ada b/.github/runners/Dockerfile.window.runner-ada index b921ad43b00..4ed2145599d 100644 --- a/.github/runners/Dockerfile.window.runner-ada +++ b/.github/runners/Dockerfile.window.runner-ada @@ -261,7 +261,7 @@ RUN powershell -Command \ RUN setx Path 
"%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools" RUN VsDevCmd.bat -arch=amd64 && \ - powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '89-real;90-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '89-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" # # ----------------------------------------------------------------------------- @@ -284,6 +284,8 @@ RUN powershell -Command \ ADD runner.ps1 ./runner.ps1 -RUN New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force +RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + +RUN powershell -Command icacls 'C:\workspace\nitro-tensorrt-llm' /grant 'Everyone:F' /T CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/runners/Dockerfile.window.runner-ampere b/.github/runners/Dockerfile.window.runner-ampere index 9d88fe168cf..c41eb6205e9 100644 --- a/.github/runners/Dockerfile.window.runner-ampere +++ b/.github/runners/Dockerfile.window.runner-ampere @@ -286,4 +286,6 @@ ADD runner.ps1 ./runner.ps1 RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force +RUN powershell -Command icacls 'C:\workspace\nitro-tensorrt-llm' /grant 'Everyone:F' /T + CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 55663cb2f88..76b4ba844c7 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -23,32 +23,33 @@ jobs: Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue - + Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue + - name: Copy build cache `build` - shell: powershell + shell: bash continue-on-error: true run: | - robocopy 'C:\workspace\nitro-tensorrt-llm\build' '.' /E + cp -r /c/workspace/nitro-tensorrt-llm/build '.' 
- name: Copy build cache `cpp build` - shell: powershell + shell: bash continue-on-error: true run: | - robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\build' '.\cpp' /E + cp -r /c/workspace/nitro-tensorrt-llm/cpp/build' './cpp' - name: Copy build cache `nitro build_deps` shell: powershell continue-on-error: true run: | - robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E + cp -r /c/workspace/nitro-tensorrt-llm/cpp/tensorrt_llm/nitro/build_deps' './cpp/tensorrt_llm/nitro/' - uses: nick-fields/retry@v3 with: timeout_minutes: 45 max_attempts: 3 - shell: powershell + shell: bash command: | - cd cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release + cd ./cpp/tensorrt_llm/nitro && cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release - name: Build Python shell: powershell From 28aa3dafe33248b23c475955ec7f1fbc96f793f3 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 14:23:52 +0700 Subject: [PATCH 26/33] nitro build_deps change to use powershell --- .github/workflows/windows-build.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 76b4ba844c7..cc2a089d3b9 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -26,35 +26,35 @@ jobs: Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue - name: Copy build cache `build` - shell: bash + shell: powershell continue-on-error: true run: | - cp -r /c/workspace/nitro-tensorrt-llm/build '.' + Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\build' -Destination '.' 
-Recurse - name: Copy build cache `cpp build` - shell: bash + shell: powershell continue-on-error: true run: | - cp -r /c/workspace/nitro-tensorrt-llm/cpp/build' './cpp' + Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\build' -Destination '.\cpp' -Recurse - name: Copy build cache `nitro build_deps` shell: powershell continue-on-error: true run: | - cp -r /c/workspace/nitro-tensorrt-llm/cpp/tensorrt_llm/nitro/build_deps' './cpp/tensorrt_llm/nitro/' + Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' -Destination '.\cpp\tensorrt_llm\nitro' -Recurse - uses: nick-fields/retry@v3 with: timeout_minutes: 45 max_attempts: 3 - shell: bash + shell: powershell command: | - cd ./cpp/tensorrt_llm/nitro && cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release + cd cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release - name: Build Python shell: powershell run: | - powershell -Command python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\' + powershell -Command "python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" - name: Upload Artifact uses: actions/upload-artifact@v2 From 4e035f89bdcf355ab3613d8f6da7101d964f8975 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 16:42:23 +0700 Subject: [PATCH 27/33] Add remove CMakeCache file --- .github/workflows/windows-build.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index cc2a089d3b9..a83731423d3 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -36,12 +36,14 @@ jobs: continue-on-error: true run: | Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\build' -Destination '.\cpp' -Recurse + rm .\cpp\build\CMakeCache.txt - name: Copy build cache `nitro build_deps` shell: powershell continue-on-error: true run: | Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' -Destination '.\cpp\tensorrt_llm\nitro' -Recurse + rm .\cpp\tensorrt_llm\nitro\build_deps\nitro_deps\CMakeCache.txt - uses: nick-fields/retry@v3 with: @@ -66,8 +68,8 @@ jobs: shell: powershell run: | cd cpp\build - powershell -Command cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja - powershell -Command cmake --build . --parallel 2 --config Release + powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='80-real;86-real' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" + powershell -Command "cmake --build . 
--parallel 2 --config Release" - name: create nitro artifact with dll file shell: powershell From 58972be96d5682b7a31c96cd5da959a9e610bcd0 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 16:52:45 +0700 Subject: [PATCH 28/33] Add update CMakeCache.txt path --- .github/workflows/windows-build.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index a83731423d3..48e86886d8a 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -36,14 +36,19 @@ jobs: continue-on-error: true run: | Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\build' -Destination '.\cpp' -Recurse - rm .\cpp\build\CMakeCache.txt - name: Copy build cache `nitro build_deps` shell: powershell continue-on-error: true run: | Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' -Destination '.\cpp\tensorrt_llm\nitro' -Recurse - rm .\cpp\tensorrt_llm\nitro\build_deps\nitro_deps\CMakeCache.txt + + - name: Override path in CMakeCache.txt + shell: powershell + run: | + Get-ChildItem .\cpp -Recurse -Filter CMakeCache.txt | ForEach-Object { + (Get-Content $_.FullName) -replace [regex]::Escape("c:/workspace/nitro-tensorrt-llm"), "c:/w/nitro-tensorrt-llm/nitro-tensorrt-llm" | Set-Content $_.FullName + } - uses: nick-fields/retry@v3 with: From 713b54b626cd0103f9f3be9d77392739e9c9f189 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 18:03:03 +0700 Subject: [PATCH 29/33] Change folder git to build CMAKEList --- .github/workflows/windows-build.yml | 65 +++++++-------------------- cpp/tensorrt_llm/nitro/CMakeLists.txt | 5 ++- 2 files changed, 21 insertions(+), 49 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 48e86886d8a..a173bf7aca7 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -17,38 +17,7 @@ jobs: submodules: recursive lfs: true - - name: remove existing build folder - shell: powershell - run: | - Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue - - - name: Copy build cache `build` - shell: powershell - continue-on-error: true - run: | - Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\build' -Destination '.' 
-Recurse - - - name: Copy build cache `cpp build` - shell: powershell - continue-on-error: true - run: | - Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\build' -Destination '.\cpp' -Recurse - - - name: Copy build cache `nitro build_deps` - shell: powershell - continue-on-error: true - run: | - Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' -Destination '.\cpp\tensorrt_llm\nitro' -Recurse - - - name: Override path in CMakeCache.txt - shell: powershell - run: | - Get-ChildItem .\cpp -Recurse -Filter CMakeCache.txt | ForEach-Object { - (Get-Content $_.FullName) -replace [regex]::Escape("c:/workspace/nitro-tensorrt-llm"), "c:/w/nitro-tensorrt-llm/nitro-tensorrt-llm" | Set-Content $_.FullName - } + - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm C:\workspace - uses: nick-fields/retry@v3 with: @@ -56,23 +25,17 @@ jobs: max_attempts: 3 shell: powershell command: | - cd cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release + cd C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release - name: Build Python shell: powershell run: | - powershell -Command "python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - with: - name: python-wheel - path: ./build + cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" - name: Build nitro shell: powershell run: | - cd cpp\build + cd C:\workspace\nitro-tensorrt-llm\cpp\build powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='80-real;86-real' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" powershell -Command "cmake --build . 
--parallel 2 --config Release" @@ -80,16 +43,22 @@ jobs: shell: powershell run: | mkdir build_nitro - cp .\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro - cp .\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro - cp .\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro - cp .\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro - cp .\C:\workspace\cuDNN\cudnn_ops_infer64_8.dll .\build_nitro - cp .\C:\workspace\cuDNN\cudnn64_8.dll .\build_nitro + cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro + cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro + cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro + cp C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro + cp C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro + cp C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro ls .\build_nitro - name: Upload Artifact uses: actions/upload-artifact@v2 with: name: nitro-tensorrt-llm-windows-ampere - path: ./build_nitro \ No newline at end of file + path: ./build_nitro + + - name: Upload Artifact + uses: actions/upload-artifact@v2 + with: + name: python-wheel + path: C:/workspace/nitro-tensorrt-llm/build diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt index cf13d40f4c7..5b852afab13 100644 --- a/cpp/tensorrt_llm/nitro/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -56,6 +56,9 @@ else() # Windows set(SENTENCEPIECE_LIBRARY_DIRS "${CMAKE_PREFIX_PATH}/lib") endif() +message(STATUS "SentencePiece library dirs: ${SENTENCEPIECE_LIBRARY_DIRS}") +message(STATUS "SentencePiece header dirs: ${SENTENCEPIECE_INCLUDE_DIRS}") + include_directories(${PROJECT_SOURCE_DIR}/include ${SENTENCEPIECE_INCLUDE_DIRS}) link_directories(${SENTENCEPIECE_LIBRARY_DIRS}) @@ -71,7 +74,7 @@ add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts) add_executable(nitro main.cc) target_link_libraries( - nitro PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece PRIVATE Drogon::Drogon ${CMAKE_THREAD_LIBS_INIT} ) + nitro PUBLIC ${SHARED_TARGET} sentencepiece nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece PRIVATE Drogon::Drogon ${CMAKE_THREAD_LIBS_INIT} ) target_compile_features(nitro PRIVATE cxx_std_17) From 2326e6d14ba8857d4550cce18190c7d1525709b4 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 21:05:44 +0700 Subject: [PATCH 30/33] Add CI for build ada and ampere --- .../runners/Dockerfile.window.runner-turing | 291 ++++++++++++++++++ .github/workflows/windows-build.yml | 34 +- 2 files changed, 312 insertions(+), 13 deletions(-) create mode 100644 .github/runners/Dockerfile.window.runner-turing diff --git a/.github/runners/Dockerfile.window.runner-turing b/.github/runners/Dockerfile.window.runner-turing new file mode 100644 index 00000000000..ee35f0428c1 --- /dev/null +++ b/.github/runners/Dockerfile.window.runner-turing @@ -0,0 +1,291 @@ +# Use the Windows Server Core 2019 image. +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 + +# Use the Windows Server Core 2019 image. +FROM mcr.microsoft.com/windows/servercore:ltsc2019 + +# Restore the default Windows shell for correct batch processing. 
+# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. +# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. 
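+    # (Installer exit code 3010 means "success, reboot required", which is why
+    # it is remapped to 0 at the end of the chain below.)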
+ && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... + +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX + +#RUN powershell -Command \ +# $ErrorActionPreference = 'Stop'; \ +# curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ +# --output "cuda_11_installer.exe"; \ +# Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \ +# Remove-Item cuda_11_installer.exe -Force + +# The above command-line installation method installs NVTX headers at +# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\ +# CMake can't find this location for some reason. +# Instead, we just copy the older NvToolsExt version to where CMake expects. 
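+# (The NvToolsExt GUI installer also sets the NVTOOLSEXT_PATH environment
+# variable, which older CMake CUDA scripts use as a hint when locating NVTX.)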
+# This assumes NvToolsExt was installed on the host machine using the +# CUDA 11.8 GUI installer and copied to the build context + +# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \ + --output NvToolsExt.zip; \ + Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \ + Remove-Item NvToolsExt.zip -Force + +# ----------------------------------------------------------------------------- + +# Create a working directory +WORKDIR "C:\\\\workspace" + +# ----------------------------------------------------------------------------- + +# Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \ + --output TensorRT-9.2.0.5.zip; \ + Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \ + Remove-Item TensorRT-9.2.0.5.zip -Force + +# Add TensorRT libs to Path +RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib" + +# Install TensorRT Python wheel +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl + +# ----------------------------------------------------------------------------- + +# Copy cuDNN into the working directory +# This assumes cuDNN exists on the host machine in the build context +# COPY ["cuDNN", "cuDNN"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \ + --output cuDNN.zip; \ + Expand-Archive .\cuDNN.zip -DestinationPath .; \ + Remove-Item cuDNN.zip -Force + +# Add cuDNN libs and bin to Path. +RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" + +# ----------------------------------------------------------------------------- + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
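+# (VsDevCmd.bat initializes the MSVC build environment before handing off to
+# PowerShell.)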
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +# ----------------------------------------------------------------------------- + +# Additional dependencies to build Nitro + +# This bellow command lt MSVC recognize cuda compiler +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations' + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations' + + +# Set git safe directory for nitro clone dependencies +RUN powershell -Command \ + git config --global --add safe.directory '*' + +# Package for nitro compile +RUN powershell -Command \ + choco install pkgconfiglite --allow-empty-checksums -y + +RUN powershell -Command \ + choco install Ninja -y + +RUN choco install 7zip -y; \ + 7z --help + +# Requirements to build tensorrt-llm on windows +# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt +# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt +# RUN powershell -Command \ +# cd tensorrt-llm-nitro; \ +# pip install --no-cache-dir -r .\requirements-dev-windows.txt + +# COPY ./.git ./tensorrt-llm-nitro/.git + +# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty + +# COPY ./cpp ./tensorrt-llm-nitro/cpp + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + git clone https://github.com/janhq/nitro-tensorrt-llm.git; \ + cd nitro-tensorrt-llm; \ + git checkout tensorrt-llm-nitro-rel; \ + git submodule update --init --recursive; \ + pip install --no-cache-dir -r .\requirements-dev-windows.txt; \ + cd cpp/tensorrt_llm/nitro; \ + cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \ + cmake --build ./build_deps/nitro_deps --config Release + +RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools" + +RUN VsDevCmd.bat -arch=amd64 && \ + powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '75-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + +# # ----------------------------------------------------------------------------- + +# Requirements to build tensorrt-llm on windows +ARG RUNNER_VERSION=2.314.1 + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
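+# (The ENTRYPOINT is left commented out below; this runner image starts through
+# runner.ps1 via CMD instead.)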
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Invoke-WebRequest \ + -Uri https://github.com/actions/runner/releases/download/v$env:RUNNER_VERSION/actions-runner-win-x64-$env:RUNNER_VERSION.zip \ + -OutFile runner.zip; \ + Expand-Archive -Path ./runner.zip -DestinationPath ./actions-runner; \ + Remove-Item -Path .\runner.zip; \ + setx /M PATH $(${Env:PATH} + \";${Env:ProgramFiles}\Git\bin\") + +ADD runner.ps1 ./runner.ps1 + +RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + +RUN powershell -Command icacls 'C:\workspace\nitro-tensorrt-llm' /grant 'Everyone:F' /T + +CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index a173bf7aca7..f1a0626f7fa 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -3,10 +3,18 @@ on: push: branches: - tensorrt-llm-nitro-rel + - rel jobs: - windows-ampere-build: - runs-on: windows-nitro-tensorrt-llm-ampere + windows-build: + runs-on: windows-nitro-tensorrt-llm-${{ matrix.cuda_arch_name }} + strategy: + matrix: + include: + - cuda_arch: '80-real;86-real' + cuda_arch_name: 'ampere' + - cuda_arch: '89-real' + cuda_arch_name: 'ada' permissions: contents: write steps: @@ -17,7 +25,7 @@ jobs: submodules: recursive lfs: true - - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm C:\workspace + - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm - uses: nick-fields/retry@v3 with: @@ -30,35 +38,35 @@ jobs: - name: Build Python shell: powershell run: | - cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '${{ matrix.cuda_arch }}' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" - name: Build nitro shell: powershell run: | cd C:\workspace\nitro-tensorrt-llm\cpp\build - powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='80-real;86-real' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" + powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='${{ matrix.cuda_arch }}' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" powershell -Command "cmake --build . 
--parallel 2 --config Release" - name: create nitro artifact with dll file shell: powershell run: | mkdir build_nitro - cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro - cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro - cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro - cp C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro - cp C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro - cp C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro + cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro + cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro ls .\build_nitro - name: Upload Artifact uses: actions/upload-artifact@v2 with: - name: nitro-tensorrt-llm-windows-ampere + name: nitro-tensorrt-llm-windows-${{ matrix.cuda_arch_name }} path: ./build_nitro - name: Upload Artifact uses: actions/upload-artifact@v2 with: - name: python-wheel + name: python-tensorrt-llm-${{ matrix.cuda_arch }}-wheel path: C:/workspace/nitro-tensorrt-llm/build From f9862020df0f940b45e821dfd4e2bfb97c5eb597 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 23:18:30 +0700 Subject: [PATCH 31/33] Add CI release --- .../python-windows-build-release.yml | 87 +++++++++++++++ ...ows-build.yml => windows-build-manual.yml} | 8 +- .github/workflows/windows-build-release.yml | 103 ++++++++++++++++++ 3 files changed, 196 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/python-windows-build-release.yml rename .github/workflows/{windows-build.yml => windows-build-manual.yml} (91%) create mode 100644 .github/workflows/windows-build-release.yml diff --git a/.github/workflows/python-windows-build-release.yml b/.github/workflows/python-windows-build-release.yml new file mode 100644 index 00000000000..ef21da6b909 --- /dev/null +++ b/.github/workflows/python-windows-build-release.yml @@ -0,0 +1,87 @@ +name: Release for Windows +on: + push: + tags: ["python-windows-*"] + +jobs: + create-draft-release: + runs-on: ubuntu-latest + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + version: ${{ steps.get_version.outputs.version }} + permissions: + contents: write + steps: + - name: Extract tag name prefix + id: get_version + run: echo "VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/}" + env: + GITHUB_REF: ${{ github.ref }} + - name: Create Draft Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref_name }} + release_name: "${{ env.VERSION }}" + draft: true + prerelease: false + windows-build: + needs: create-draft-release + runs-on: windows-nitro-tensorrt-llm-${{ matrix.cuda_arch_name }} + strategy: + matrix: + include: + - cuda_arch: '80-real;86-real' + cuda_arch_name: 'ampere' + - cuda_arch: '89-real' + cuda_arch_name: 'ada' 
+ - cuda_arch: '75-real' + cuda_arch_name: 'turing' + permissions: + contents: write + steps: + - uses: actions/setup-dotnet@v3 + with: + dotnet-version: "6.0.x" + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + lfs: true + + - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm + + - uses: nick-fields/retry@v3 + with: + timeout_minutes: 45 + max_attempts: 3 + shell: powershell + command: | + cd C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release + + - name: Build Python + shell: powershell + run: | + cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '${{ matrix.cuda_arch }}' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + + - name: Build nitro + shell: powershell + run: | + cd C:\workspace\nitro-tensorrt-llm\cpp\build + powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='${{ matrix.cuda_arch }}' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" + powershell -Command "cmake --build . --parallel 2 --config Release" + tar -czvf python.tar.gz .\build\*.whl + + - uses: actions/upload-release-asset@v1.0.1 + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + asset_path: ./python.tar.gz + asset_name: ${{ needs.create-draft-release.outputs.version }}-tensorrt-llm-${{ matrix.cuda_arch_name }}.tar.gz + asset_content_type: application/gzip diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build-manual.yml similarity index 91% rename from .github/workflows/windows-build.yml rename to .github/workflows/windows-build-manual.yml index f1a0626f7fa..d5dc1ebb59f 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build-manual.yml @@ -1,9 +1,9 @@ -name: Build for Windows +name: Manuall Build for Windows on: push: branches: - tensorrt-llm-nitro-rel - - rel + workflow_dispatch: jobs: windows-build: @@ -15,6 +15,8 @@ jobs: cuda_arch_name: 'ampere' - cuda_arch: '89-real' cuda_arch_name: 'ada' + - cuda_arch: '75-real' + cuda_arch_name: 'turing' permissions: contents: write steps: @@ -56,6 +58,8 @@ jobs: cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro cp -Force C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro + cp -Force C:\workspace\TensorRT-9.2.0.5\lib\nvinfer.dll .\build_nitro + cp -Force C:\Windows\SysWOW64\msmpi.dll .\build_nitro cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro ls .\build_nitro diff --git a/.github/workflows/windows-build-release.yml b/.github/workflows/windows-build-release.yml new file mode 100644 index 00000000000..d4922a537a1 --- /dev/null +++ b/.github/workflows/windows-build-release.yml @@ -0,0 +1,103 @@ +name: Release for Windows +on: + push: + tags: ["windows-v[0-9]+.[0-9]+.[0-9]+"] + +jobs: + create-draft-release: + runs-on: ubuntu-latest + if: github.event_name == 
'push' && startsWith(github.ref, 'refs/tags/') + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + version: ${{ steps.get_version.outputs.version }} + permissions: + contents: write + steps: + - name: Extract tag name without v prefix + id: get_version + run: echo "VERSION=${GITHUB_REF#refs/tags/windows-v}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/windows-v}" + env: + GITHUB_REF: ${{ github.ref }} + - name: Create Draft Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref_name }} + release_name: "${{ env.VERSION }}" + draft: true + prerelease: false + windows-build: + needs: create-draft-release + runs-on: windows-nitro-tensorrt-llm-${{ matrix.cuda_arch_name }} + strategy: + matrix: + include: + - cuda_arch: '80-real;86-real' + cuda_arch_name: 'ampere' + - cuda_arch: '89-real' + cuda_arch_name: 'ada' + - cuda_arch: '75-real' + cuda_arch_name: 'turing' + permissions: + contents: write + steps: + - uses: actions/setup-dotnet@v3 + with: + dotnet-version: "6.0.x" + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + lfs: true + + - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm + + - uses: nick-fields/retry@v3 + with: + timeout_minutes: 45 + max_attempts: 3 + shell: powershell + command: | + cd C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release + + - name: Build Python + shell: powershell + run: | + cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '${{ matrix.cuda_arch }}' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + + - name: Build nitro + shell: powershell + run: | + cd C:\workspace\nitro-tensorrt-llm\cpp\build + powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='${{ matrix.cuda_arch }}' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" + powershell -Command "cmake --build . 
--parallel 2 --config Release" + + - name: create nitro artifact with dll file + shell: powershell + run: | + mkdir build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro + cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro + cp -Force C:\workspace\TensorRT-9.2.0.5\lib\nvinfer.dll .\build_nitro + cp -Force C:\Windows\SysWOW64\msmpi.dll .\build_nitro + cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro + ls .\build_nitro + dotnet tool install --global AzureSignTool + %USERPROFILE%\.dotnet\tools\azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build_nitro\nitro.exe" + tar -czvf nitro.tar.gz .\build_nitro + + - uses: actions/upload-release-asset@v1.0.1 + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + asset_path: ./nitro.tar.gz + asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-amd64-tensorrt-llm-${{ matrix.cuda_arch_name }}.tar.gz + asset_content_type: application/gzip From f9ba94b4bb0773ed0868e00f33c1118d5af18c65 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 23:19:38 +0700 Subject: [PATCH 32/33] Remove debug CI --- .github/workflows/python-windows-build-release.yml | 2 +- .github/workflows/windows-build-manual.yml | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/python-windows-build-release.yml b/.github/workflows/python-windows-build-release.yml index ef21da6b909..fbfe5e76ba6 100644 --- a/.github/workflows/python-windows-build-release.yml +++ b/.github/workflows/python-windows-build-release.yml @@ -1,4 +1,4 @@ -name: Release for Windows +name: Release for python Windows on: push: tags: ["python-windows-*"] diff --git a/.github/workflows/windows-build-manual.yml b/.github/workflows/windows-build-manual.yml index d5dc1ebb59f..b3e324ae6ed 100644 --- a/.github/workflows/windows-build-manual.yml +++ b/.github/workflows/windows-build-manual.yml @@ -1,8 +1,5 @@ name: Manuall Build for Windows on: - push: - branches: - - tensorrt-llm-nitro-rel workflow_dispatch: jobs: From 186eee30ebc06b17df277df7f8559294fa515ad0 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Mon, 11 Mar 2024 17:02:07 +0700 Subject: [PATCH 33/33] Merge pull request #14 from janhq/10-epic-add-proper-handler-for-stop-words Add naive hiding stop words case --- .../nitro/controllers/tensorrtllm.cc | 108 +++++++++++++----- .../nitro/controllers/tensorrtllm.h | 1 + 2 files changed, 82 insertions(+), 27 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index 999d7b18a82..c3891440dd3 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -26,11 +26,52 @@ void removeId(std::vector& vec, int id) struct inferenceState { int prevPos{0}; + 
std::string prevText;
     bool isFinished;
     std::queue<std::string> textsToStream;
     std::mutex queueMutex; // Mutex to protect access to textsToStream
+
+    size_t stopWordMatchLen = 0;
+    std::vector<std::string> sequence{"<", "|", "im", "_", "end", "|", ">"};
+
+    void reset()
+    {
+        stopWordMatchLen = 0;
+        prevText = "";
+    }
+
+    bool isComplete() const
+    {
+        return stopWordMatchLen >= sequence.size();
+    }
 };
 
+bool handleMatch(const std::string& rawText, std::shared_ptr<inferenceState> inferState)
+{
+    if (inferState->isComplete())
+    {
+        return true;
+    }
+
+    if (rawText == inferState->sequence[inferState->stopWordMatchLen])
+    {
+        inferState->stopWordMatchLen++; // Move to next state
+        inferState->prevText = rawText;
+        return true;
+    }
+    else if (inferState->stopWordMatchLen > 0 && rawText == inferState->sequence[0])
+    {
+        inferState->stopWordMatchLen = 1; // Restart from first match if sequence breaks but matches start
+        inferState->prevText = rawText;
+        return true;
+    }
+    else
+    {
+        inferState->reset();
+        return false; // Reset to start if sequence breaks
+    }
+}
+
 // Only support single token stopping point now
 std::string create_return_json(const std::string& id, const std::string& model, const std::string& content,
     Json::Value finish_reason = Json::Value())
@@ -67,6 +108,13 @@ GenerationInput::TensorPtr tensorrtllm::getTensorSingleStopWordList(int stopToke
     return gptSession->getBufferManager().copyFrom(stopWordsTokens, ITensor::makeShape({1, 2, 2}), MemoryType::kGPU);
 }
 
+GenerationInput::TensorPtr tensorrtllm::getTensorChatMLStopWordList()
+{
+    std::vector<int32_t> stopWordsTokens = {28789, 28766, 321, 28730, 416, 28766, 28767, 32000, 6, 8, -1, -1, -1, -1,
+        -1, -1}; // Extend with -1 for increased length
+    return gptSession->getBufferManager().copyFrom(stopWordsTokens, ITensor::makeShape({1, 2, 8}), MemoryType::kGPU);
+}
+
 GenerationInput tensorrtllm::createGenerationInput(std::vector<int32_t> inputIdsHost)
 {
     int inputLen = inputIdsHost.size();
@@ -78,7 +126,7 @@ GenerationInput tensorrtllm::createGenerationInput(std::vector<int32_t> inputIds
 
     GenerationInput generationInput{0, 0, inputIds, inputLengths, modelConfig->usePackedInput()};
 
-    generationInput.stopWordsList = getTensorSingleStopWordList(32000);
+    generationInput.stopWordsList = getTensorChatMLStopWordList();
 
     return generationInput;
 }
@@ -117,35 +165,35 @@ void inferenceThread(std::shared_ptr<inferenceState> inferState, std::vector<int32_t>
+        int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape
+        // Copy output IDs from GPU to host for printing
+        std::vector<int32_t> outputIdsHost(outputLength);
+        self->gptSession->getBufferManager().copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU);
+        // Find the last non-zero value in the output IDs starting from the end of the input sequence
+        std::vector<int32_t> outputIdsHostDecode(outputIdsHost.begin() + inputLen, outputIdsHost.end());
+        removeId(outputIdsHostDecode, 0);
+        std::string text = self->nitro_tokenizer->decode(outputIdsHostDecode);
+
+        if (inferState->prevPos > 0 && inferState->prevPos < text.size())
+        {
+            // Valid prevPos, proceed with slicing the string from prevPos to the end
+            std::string stringTok(text.begin() + inferState->prevPos, text.end());
+            std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
+            inferState->textsToStream.push(stringTok);
+        }
+        else if (inferState->prevPos >= text.size())
         {
-            // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens
-            int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape
-            // Copy output IDs from GPU to host for printing
-            std::vector<int32_t> outputIdsHost(outputLength);
-            self->gptSession->getBufferManager().copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU);
-            // Find the last non-zero value in the output IDs starting from the end of the input sequence
-            std::vector<int32_t> outputIdsHostDecode(outputIdsHost.begin() + inputLen, outputIdsHost.end());
-            removeId(outputIdsHostDecode, 0);
-            removeId(outputIdsHostDecode, 32000);
-            std::string text = self->nitro_tokenizer->decode(outputIdsHostDecode);
-
-            if (inferState->prevPos > 0 && inferState->prevPos < text.size())
-            {
-                // Valid prevPos, proceed with slicing the string from prevPos to the end
-                std::string stringTok(text.begin() + inferState->prevPos, text.end());
-                std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
-                inferState->textsToStream.push(stringTok);
-            }
-            else if (inferState->prevPos >= text.size())
-            {
-                inferState->prevPos = text.size();
-            }
             inferState->prevPos = text.size();
+        }
+        inferState->prevPos = text.size();
+        if (finished)
+        {
+
+            std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
+            inferState->textsToStream.push("[DONE]");
             return;
         }
-        std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
-        inferState->textsToStream.push("[DONE]");
     };
     // The rest of the logic inside the `chat_completion` remains unchanged...
     // After finishing the setup, call the inference logic
@@ -243,6 +291,12 @@
                 {
                     std::string rawText = inferState->textsToStream.front();
 
+                    inferState->textsToStream.pop();
+                    if (handleMatch(rawText, inferState))
+                    {
+                        continue;
+                    };
+
                     if (rawText == "[DONE]")
                     {
                         LOG_INFO << "End of result";
@@ -257,7 +311,6 @@
                     }
                     const std::string textToStream
                         = "data: " + create_return_json(nitro_utils::generate_random_string(20), "_", rawText) + "\n\n";
-                    inferState->textsToStream.pop();
                     lock.unlock(); // Unlock as soon as possible
 
                     // Ensure we do not exceed the buffer size. Truncate if necessary.
@@ -265,6 +318,7 @@
 
                     // Copy the text to the provided buffer
                     std::memcpy(pBuffer, textToStream.data(), bytesToWrite);
+                    inferState->prevText = rawText;
                     return bytesToWrite; // Return the number of bytes written to the buffer
                 }
                 else
diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
index 0ecae873d27..40454829f6b 100644
--- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
+++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
@@ -100,6 +100,7 @@ class tensorrtllm : public drogon::HttpController
     GenerationInput createGenerationInput(std::vector<int32_t> inputIds);
     GenerationOutput createGenerationOutput();
     std::unique_ptr<Tokenizer> nitro_tokenizer;
+    GenerationInput::TensorPtr getTensorChatMLStopWordList();
 
 private:
     GptSession::Config sessionConfig{1, 1, 1};
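The stop-word handling added in this final patch operates on detokenized pieces rather than token ids: handleMatch walks the piece sequence "<", "|", "im", "_", "end", "|", ">" and withholds each matching piece from the SSE stream, so a completed "<|im_end|>" never reaches the client. Below is a minimal, self-contained sketch of that state machine (illustrative names only, not the controller code itself); it also demonstrates the naive edge case the commit title refers to: once a partial match breaks, the pieces that were held back are dropped rather than re-emitted.

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct StopWordMatcher
{
    std::vector<std::string> sequence{"<", "|", "im", "_", "end", "|", ">"};
    std::size_t matched = 0;

    bool complete() const
    {
        return matched >= sequence.size();
    }

    // Returns true when the piece should be withheld from the client; mirrors
    // handleMatch() above, including its naive restart behaviour.
    bool consume(const std::string& piece)
    {
        if (complete())
            return true;
        if (piece == sequence[matched])
        {
            ++matched; // piece extends the current partial match
            return true;
        }
        if (matched > 0 && piece == sequence[0])
        {
            matched = 1; // stream broke mid-match but re-matched the head
            return true;
        }
        // Pieces that were already held back are not re-emitted here, which is
        // exactly the naive behaviour of the patched controller.
        matched = 0;
        return false;
    }
};

int main()
{
    StopWordMatcher m;
    for (const std::string piece : {"Hello", "<", "|", "im", "_", "end", "|", ">"})
    {
        std::cout << piece << " -> " << (m.consume(piece) ? "hold" : "stream") << '\n';
    }
    std::cout << "stop word complete: " << std::boolalpha << m.complete() << '\n';
    return 0;
}
```

On the engine side, getTensorChatMLStopWordList supplies the same stop word to TensorRT-LLM in its packed stopWordsList layout: a {1, 2, N} tensor whose first row holds the concatenated token ids of all stop sequences and whose second row holds their cumulative end offsets, padded with -1.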