From 47133318497f742d525059618adb0e7e92a345da Mon Sep 17 00:00:00 2001 From: Hien To Date: Tue, 5 Mar 2024 22:16:02 +0700 Subject: [PATCH 01/33] Rebase to rel branch --- 3rdparty/cutlass | 2 +- cpp/CMakeLists.txt | 15 +- cpp/tensorrt_llm/CMakeLists.txt | 4 + cpp/tensorrt_llm/nitro/CMakeLists.txt | 49 +++++ cpp/tensorrt_llm/nitro/install_deps.sh | 3 + cpp/tensorrt_llm/nitro/main.cc | 191 ++++++++++++++++++ .../nitro/nitro_deps/CMakeLists.txt | 108 ++++++++++ 7 files changed, 364 insertions(+), 8 deletions(-) create mode 100644 cpp/tensorrt_llm/nitro/CMakeLists.txt create mode 100644 cpp/tensorrt_llm/nitro/install_deps.sh create mode 100644 cpp/tensorrt_llm/nitro/main.cc create mode 100644 cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt diff --git a/3rdparty/cutlass b/3rdparty/cutlass index 39c6a83f231..8236f30675b 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit 39c6a83f231d6db2bc6b9c251e7add77d68cbfb4 +Subproject commit 8236f30675bbe98f81d11c05764b77bfcb25b8cc diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6ef4b374a4f..dc5e3f0b477 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -29,9 +29,10 @@ project(tensorrt_llm LANGUAGES CXX) # Build options option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON) option(BUILD_PYBIND "Build Python bindings for C++ runtime and batch manager" - ON) -option(BUILD_TESTS "Build Google tests" ON) -option(BUILD_BENCHMARKS "Build benchmarks" ON) + OFF) +option(BUILD_TESTS "Build Google tests" OFF) +option(BUILD_BENCHMARKS "Build benchmarks" OFF) +option(BUILD_NITRO "Build nitro" ON) option(NVTX_DISABLE "Disable all NVTX features" ON) option(WARNING_IS_ERROR "Treat all warnings as errors" OFF) option(FAST_BUILD "Skip compiling some kernels to accelerate compiling" OFF) @@ -129,9 +130,9 @@ endif() # Initialize CMAKE_CUDA_ARCHITECTURES before enabling CUDA if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8") - set(CMAKE_CUDA_ARCHITECTURES 70-real 80-real 86-real 89-real 90-real) + set(CMAKE_CUDA_ARCHITECTURES 89-real) else() - set(CMAKE_CUDA_ARCHITECTURES 70-real 80-real 86-real) + set(CMAKE_CUDA_ARCHITECTURES 89-real) endif() endif() @@ -177,8 +178,8 @@ include_directories( ${3RDPARTY_DIR}/json/include) # TRT dependencies -set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR}) -set_ifndef(TRT_INCLUDE_DIR /usr/include/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu) +set_ifndef(TRT_LIB_DIR /usr/local/tensorrt/lib) +set_ifndef(TRT_INCLUDE_DIR /usr/local/tensorrt/include) set(TRT_LIB nvinfer) find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR}) diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index bcbf107e04a..29583f0f6c9 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -188,3 +188,7 @@ if(BUILD_PYBIND) endif() add_subdirectory(plugins) + +if(BUILD_NITRO) + add_subdirectory(nitro) +endif() \ No newline at end of file diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt new file mode 100644 index 00000000000..ebb6073e485 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & +# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. 
You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# C++17 +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/build_deps/_install) + +message(STATUS "Current Source Directory NITRO: ${CMAKE_CURRENT_SOURCE_DIR}") + +# Enable pkg-config support in CMake +find_package(PkgConfig REQUIRED) + +# Use pkg-config to find the SentencePiece library +pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece) + + +include_directories(${PROJECT_SOURCE_DIR}/include ${SENTENCEPIECE_INCLUDE_DIRS}) + +link_directories(${SENTENCEPIECE_LIBRARY_DIRS}) + +set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..") + +add_custom_target(nitro_proj) + +set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts) +add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts) + +add_executable(nitro main.cc) + +target_link_libraries( + nitro PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts ${SENTENCEPIECE_LIBRARIES}) + +target_compile_features(nitro PRIVATE cxx_std_17) +target_compile_definitions(nitro PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}") + +add_dependencies(nitro_proj nitro) diff --git a/cpp/tensorrt_llm/nitro/install_deps.sh b/cpp/tensorrt_llm/nitro/install_deps.sh new file mode 100644 index 00000000000..30de5afa4e1 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/install_deps.sh @@ -0,0 +1,3 @@ +cmake -S ./nitro_deps -B ./build_deps/nitro_deps +make -C ./build_deps/nitro_deps -j 10 +rm -rf ./build_deps/nitro_deps \ No newline at end of file diff --git a/cpp/tensorrt_llm/nitro/main.cc b/cpp/tensorrt_llm/nitro/main.cc new file mode 100644 index 00000000000..efa387fec3a --- /dev/null +++ b/cpp/tensorrt_llm/nitro/main.cc @@ -0,0 +1,191 @@ +#include "sentencepiece_processor.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/plugins/api/tllmPlugin.h" +#include "tensorrt_llm/runtime/gptJsonConfig.h" +#include "tensorrt_llm/runtime/gptSession.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/memoryCounters.h" +#include "tensorrt_llm/runtime/tllmLogger.h" +#include +#include +#include +#include +#include + +using namespace tensorrt_llm::runtime; + +namespace tc = tensorrt_llm::common; +namespace trt = nvinfer1; + +class Tokenizer +{ +private: + sentencepiece::SentencePieceProcessor processor; + + void replaceSubstring(std::string& base, const std::string& from, const std::string& to) + { + size_t start_pos = 0; + while ((start_pos = base.find(from, start_pos)) != std::string::npos) + { + base.replace(start_pos, from.length(), to); + start_pos += to.length(); + } + } + +public: + Tokenizer(const std::string& modelPath) + { + auto status = processor.Load(modelPath); + if (!status.ok()) + { + std::cerr << status.ToString() << std::endl; + } + } + + std::string decodeWithSpace(const int id) + { + std::string text = processor.IdToPiece(id); + replaceSubstring(text, "▁", " "); + return text; + } + + std::vector encode(const std::string& input) + { + std::vector ids; + processor.Encode(input, &ids); + return ids; + } +}; + +namespace +{ +void runBenchmark() +{ + 
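// Demo setup: load the SentencePiece tokenizer model from the working
+    // directory and encode a fixed prompt; both are hard-coded placeholders.
+   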
Tokenizer nitro_tokenizer("./tokenizer.model"); + std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); + + // Fixed settings + const std::string modelName = "mistral"; + const std::filesystem::path engineDir = "/app/mistral_engine_2/"; + const int batchSize = 1; + const int inputLen = text_input.size(); + const std::vector inOutLen = {inputLen, 500}; // input_length, output_length + + // Logger setup + auto logger = std::make_shared(); + logger->setLevel(nvinfer1::ILogger::Severity::kINFO); + + initTrtLlmPlugins(logger.get()); + + // Load model configuration + std::filesystem::path jsonFileName = engineDir / "config.json"; + auto const json = GptJsonConfig::parse(jsonFileName); + auto const modelConfig = json.getModelConfig(); + auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); + auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); + auto const dtype = modelConfig.getDataType(); + + GptSession::Config sessionConfig{1, 1, 1}; + sessionConfig.maxBatchSize = batchSize; + sessionConfig.maxBeamWidth = 4; // Fixed for simplicity + sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1]; + sessionConfig.cudaGraphMode = false; // Fixed for simplicity + + SamplingConfig samplingConfig{1}; // Fixed for simplicity + samplingConfig.temperature = std::vector{0.0f}; + samplingConfig.randomSeed = std::vector{static_cast(42ull)}; + samplingConfig.topK = std::vector{40}; + samplingConfig.topP = std::vector{0.0f}; + samplingConfig.minLength = std::vector{inOutLen[1]}; + samplingConfig.repetitionPenalty = std::vector{1.3f}; + + // Initialize session + GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; + // Generate random input IDs within the model's vocabulary range + const int vocabSize = modelConfig.getVocabSize(); + std::vector inputIdsHost = text_input; + + std::cout << "Start Nitro testing session: " << std::endl; + // for (auto& id : inputIdsHost) + // { + // id = rand() % vocabSize; // Random token ID within vocabulary range + // std::cout << id << std::endl; + // } + // // Simplified benchmarking process for a single run + // Note: This example does not include input data preparation or output handling for brevity + + // Input preparation + auto& bufferManager = session.getBufferManager(); + GenerationInput::TensorPtr inputIds + = bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); + + std::vector inputLengthsHost(batchSize, inOutLen[0]); + GenerationInput::TensorPtr inputLengths + = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); + + bool inputPacked = modelConfig.usePackedInput(); + + GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked}; + + GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), + bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + + // Define the callback to stream each generated token + generationOutput.onTokenGenerated = [&bufferManager, inOutLen, &nitro_tokenizer]( + GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) + { + if (!finished) + { + // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens + int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape + // Copy output IDs from GPU to host for printing + 
std::vector outputIdsHost(outputLength); + bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); + + // Find the last non-zero value in the output IDs starting from the end of the input sequence + int lastNonZeroIndex = -1; + for (int i = outputLength - 1; i >= inOutLen[0]; --i) + { + if (outputIdsHost[i] != 0) + { + lastNonZeroIndex = i; + break; // Stop at the first non-zero token found from the end + } + } + + // Directly print the last non-zero value if found, without using 'step' + if (lastNonZeroIndex != -1) + { + int outTok = outputIdsHost[lastNonZeroIndex]; + if (outTok == 13) + { + std::cout << "\n"; + } + else + { + std::cout << nitro_tokenizer.decodeWithSpace(outTok); + } + } + } + }; + + session.generate(generationOutput, generationInput, samplingConfig); + bufferManager.getStream().synchronize(); +} + +} // namespace + +int main() +{ + try + { + runBenchmark(); + } + catch (const std::exception& e) + { + std::cerr << "Error: " << e.what() << std::endl; + return 1; + } + return 0; +} \ No newline at end of file diff --git a/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt b/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt new file mode 100644 index 00000000000..c097fcb4b37 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt @@ -0,0 +1,108 @@ +cmake_minimum_required(VERSION 3.22) # Required for FetchContent + +project(MyProject) + +include(ExternalProject) + +# Define variables +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(THIRD_PARTY_INSTALL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../build_deps/_install) +#if(NOT THIRD_PARTY_INSTALL_PATH ) +# message(FATAL_ERROR "TRITON_THIRD_PARTY_INSTALL_PREFIX must be set") +#endif() # TRITON_THIRD_PARTY_INSTALL_PREFIX +# To force the find_package to look for .a inside self installed version +#set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +#set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +#set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) +# +# Add the external project +set(ZLIB_USE_STATIC_LIBS OFF) +find_package(ZLIB) +if(NOT ZLIB_FOUND) + set(ZLIB_USE_STATIC_LIBS ON) + ExternalProject_Add( + zlib + GIT_REPOSITORY https://github.com/madler/zlib.git + GIT_TAG v1.2.11 + CMAKE_ARGS + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} + ) +endif() + +ExternalProject_Add( + brotli + GIT_REPOSITORY https://github.com/google/brotli + GIT_TAG v1.1.0 + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=Release + -DBUILD_SHARED_LIBS=OFF + -DSHARE_INSTALL_PREFIX=${CMAKE_CURRENT_SOURCE_DIR}/share + -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} +) + +ExternalProject_Add( + jsoncpp + GIT_REPOSITORY https://github.com/open-source-parsers/jsoncpp + GIT_TAG 1.9.5 + CMAKE_ARGS + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} +) + +ExternalProject_Add( + c-ares + GIT_REPOSITORY https://github.com/c-ares/c-ares + GIT_TAG cares-1_26_0 + CMAKE_ARGS + -DCARES_SHARED=OFF + -DCARES_STATIC=ON + -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} +) + +ExternalProject_Add( + drogon + GIT_REPOSITORY https://github.com/drogonframework/drogon + GIT_TAG v1.9.2 + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=release + -DOPENSSL_USE_STATIC_LIBS=TRUE + -DZLIB_USE_STATIC_LIBS=${ZLIB_USE_STATIC_LIBS} + -DBUILD_ORM=OFF + -DBUILD_YAML_CONFIG=OFF + -DBUILD_EXAMPLES=OFF + -DBUILD_CTL=OFF + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON + -DBUILD_BROTLI=ON + -DCMAKE_PREFIX_PATH=${THIRD_PARTY_INSTALL_PATH} + # -DCMAKE_FIND_ROOT_PATH=${THIRD_PARTY_INSTALL_PATH} # To set the dir (that will be used to force the look for .a) + 
-DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} +) + +ExternalProject_Add( + sentencepiece + GIT_REPOSITORY https://github.com/google/sentencepiece + GIT_TAG v0.2.0 + CMAKE_ARGS + -DSPM_ENABLE_SHARED=OFF + -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} +) + +# Fix trantor cmakelists to link c-ares on Windows +if(WIN32) + set(TRANTOR_CMAKE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/../build_deps/nitro_deps/drogon-prefix/src/drogon/trantor/CMakeLists.txt) + ExternalProject_Add_Step(drogon trantor_custom_target + COMMAND ${CMAKE_COMMAND} -E echo add_definitions(-DCARES_STATICLIB) >> ${TRANTOR_CMAKE_FILE} + DEPENDEES download + ) +endif() + +include_directories(${THIRD_PARTY_INSTALL_PATH}/include) +link_directories(${THIRD_PARTY_INSTALL_PATH}/lib) +# Optionally link or add dependencies to your targets +add_dependencies(drogon c-ares jsoncpp brotli) + +if(ZLIB_USE_STATIC_LIBS) + add_dependencies(drogon zlib) +endif() +# target_link_libraries( ...) \ No newline at end of file From 44a60b3a005959754be99cde21f9f012ed5a9f27 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Wed, 6 Mar 2024 07:38:49 +0000 Subject: [PATCH 02/33] only build release --- cpp/CMakeLists.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index dc5e3f0b477..37adf9dd9f3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -17,6 +17,7 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_BUILD_TYPE Release) include(CheckLanguage) include(cmake/modules/set_ifndef.cmake) @@ -45,12 +46,7 @@ else() message(STATUS "NVTX is enabled") endif() -if(EXISTS - "${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/batch_manager/CMakeLists.txt") - set(BUILD_BATCH_MANAGER_DEFAULT ON) -else() - set(BUILD_BATCH_MANAGER_DEFAULT OFF) -endif() +set(BUILD_BATCH_MANAGER_DEFAULT OFF) option(BUILD_BATCH_MANAGER "Build batch manager from source" ${BUILD_BATCH_MANAGER_DEFAULT}) From e015bcf849cd1fdea897c0cbdb3898741c674a48 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Wed, 6 Mar 2024 07:39:35 +0000 Subject: [PATCH 03/33] upgrade nitro --- cpp/tensorrt_llm/nitro/CMakeLists.txt | 53 ++++- .../nitro/controllers/tensorrtllm.cc | 78 +++++++ .../nitro/controllers/tensorrtllm.h | 133 ++++++++++++ cpp/tensorrt_llm/nitro/install_deps.sh | 2 +- cpp/tensorrt_llm/nitro/main.cc | 200 +----------------- .../nitro/nitro_deps/CMakeLists.txt | 2 +- cpp/tensorrt_llm/nitro/test.cc | 181 ++++++++++++++++ 7 files changed, 456 insertions(+), 193 deletions(-) create mode 100644 cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc create mode 100644 cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h mode change 100644 => 100755 cpp/tensorrt_llm/nitro/install_deps.sh create mode 100644 cpp/tensorrt_llm/nitro/test.cc diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt index ebb6073e485..6aac914bbb5 100644 --- a/cpp/tensorrt_llm/nitro/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -13,15 +13,38 @@ # License for the specific language governing permissions and limitations under # the License. 
 # C++17
+# Nitro init
+include(CheckIncludeFileCXX)
+
+check_include_file_cxx(any HAS_ANY)
+check_include_file_cxx(string_view HAS_STRING_VIEW)
+check_include_file_cxx(coroutine HAS_COROUTINE)
+if(HAS_ANY
+   AND HAS_STRING_VIEW
+   AND HAS_COROUTINE)
+  set(CMAKE_CXX_STANDARD 20)
+elseif(HAS_ANY AND HAS_STRING_VIEW)
+  set(CMAKE_CXX_STANDARD 17)
+else()
+  set(CMAKE_CXX_STANDARD 14)
+endif()
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/build_deps/_install)

 message(STATUS "Current Source Directory NITRO: ${CMAKE_CURRENT_SOURCE_DIR}")
+message(STATUS "Current Cmake Prefix Path of NITRO: ${CMAKE_PREFIX_PATH}")
+
+set(OPENSSL_USE_STATIC_LIBS TRUE)
+
 # Enable pkg-config support in CMake
 find_package(PkgConfig REQUIRED)
+find_package(Drogon CONFIG REQUIRED)

 # Use pkg-config to find the SentencePiece library
 pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece)
@@ -38,12 +61,40 @@ add_custom_target(nitro_proj)
 set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts)
 add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)

+# main
 add_executable(nitro main.cc)

 target_link_libraries(
-  nitro PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts ${SENTENCEPIECE_LIBRARIES})
+  nitro
+  PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece
+  PRIVATE Drogon::Drogon ${CMAKE_THREAD_LIBS_INIT})

 target_compile_features(nitro PRIVATE cxx_std_17)
 target_compile_definitions(nitro PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")
+
+aux_source_directory(controllers CTL_SRC)
+aux_source_directory(common COMMON_SRC)
+aux_source_directory(context CONTEXT_SRC)
+aux_source_directory(models MODEL_SRC)
+
+target_include_directories(nitro PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+# ${CMAKE_CURRENT_SOURCE_DIR}/models)
+target_sources(nitro PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC})
+
+# test
+add_executable(test test.cc)
+
+target_link_libraries(
+  test PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece)
+
+target_compile_features(test PRIVATE cxx_std_17)
+target_compile_definitions(test PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")
+
 add_dependencies(nitro_proj nitro)
diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc
new file mode 100644
index 00000000000..79c0ad43ef5
--- /dev/null
+++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc
@@ -0,0 +1,78 @@
+#include "tensorrtllm.h"
+#include <iostream>
+#include <vector>
+
+void tensorrtllm::chat_completion(
+    const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)>&& callback) const
+{
+    std::vector<int> text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n ");
+    const int inputLen = text_input.size();
+    const std::vector<int> inOutLen = {inputLen, 500}; // input_length, output_length
+
+    const int batchSize = 1;
+
+    std::vector<int> inputIdsHost = text_input;
+
+    std::cout << "Start Nitro testing session: " << std::endl;
+    // Input preparation
+    auto& bufferManager = gptSession->getBufferManager();
+    GenerationInput::TensorPtr inputIds
+        = bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU);
+
+    std::vector<int> inputLengthsHost(batchSize, inOutLen[0]);
+    GenerationInput::TensorPtr inputLengths
+        = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU);
+
+    bool inputPacked = modelConfig->usePackedInput();
+
+    GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked};
+
+    GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32),
+        bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)};
+    // Define the callback to stream each generated token
+    generationOutput.onTokenGenerated = [&bufferManager, inOutLen, this, &generationOutput](
+                                            GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished)
+    {
+        if (!finished)
+        {
+            // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens
+            int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape
+            // Copy output IDs from GPU to host for printing
+            std::vector<int32_t> outputIdsHost(outputLength);
+            bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU);
+            // Find the last non-zero value in the output IDs starting from the end of the input sequence
+            int lastNonZeroIndex = -1;
+            for (int i = outputLength - 1; i >= inOutLen[0]; --i)
+            {
+                if (outputIdsHost[i] != 0)
+                {
+                    lastNonZeroIndex = i;
+                    break; // Stop at the first non-zero token found from the end
+                }
+            }
+
+            // Directly print the last non-zero value if found, without using 'step'
+            if (lastNonZeroIndex != -1)
+            {
+                int outTok = outputIdsHost[lastNonZeroIndex];
+                if (outTok == 13)
+                {
+                    std::cout<<"\n" <<std::flush;
+                }
+                else
+                {
+                    std::cout<< this->nitro_tokenizer.decodeWithSpace(outTok) <<std::flush;
+                }
+            }
+        }
+    };
+
+    gptSession->generate(generationOutput, generationInput, samplingConfig);
+
+    bufferManager.getStream().synchronize();
+
+    LOG_INFO << "Hello world";
+    return;
+};
+
+// Add definition of your processing function here
diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
new file mode 100644
index 00000000000..7f5b0c15a03
--- /dev/null
+++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
@@ -0,0 +1,133 @@
+#pragma once
+
+#include "sentencepiece_processor.h"
+#include <drogon/HttpController.h>
+
+#include "sentencepiece_processor.h"
+#include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/memoryUtils.h"
+#include "tensorrt_llm/plugins/api/tllmPlugin.h"
+#include "tensorrt_llm/runtime/gptJsonConfig.h"
+#include "tensorrt_llm/runtime/gptModelConfig.h"
+#include "tensorrt_llm/runtime/gptSession.h"
+#include "tensorrt_llm/runtime/iTensor.h"
+#include "tensorrt_llm/runtime/memoryCounters.h"
+#include "tensorrt_llm/runtime/samplingConfig.h"
+#include "tensorrt_llm/runtime/tllmLogger.h"
+#include "thread"
+#include <cstdint>
+#include <filesystem>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace drogon;
+
+using namespace tensorrt_llm::runtime;
+
+class Tokenizer
+{
+private:
+    sentencepiece::SentencePieceProcessor processor;
+
+    void replaceSubstring(std::string& base, const std::string& from, const std::string& to) const
+    {
+        size_t start_pos = 0;
+        while ((start_pos = base.find(from, start_pos)) != std::string::npos)
+        {
+            base.replace(start_pos, from.length(), to);
+            start_pos += to.length();
+        }
+    }
+
+public:
+    Tokenizer(const std::string& modelPath)
+    {
+        auto status = processor.Load(modelPath);
+        if (!status.ok())
+        {
+            std::cerr << status.ToString() << std::endl;
+        }
+        LOG_INFO << "Successfully loaded the tokenizer";
+    }
+
+    std::string decodeWithSpace(const int id) const
+    {
+        std::string text = processor.IdToPiece(id);
+        replaceSubstring(text, "▁", " ");
+        return text;
+    }
+
+    std::vector<int> encode(const std::string& input) const
+    {
+        std::vector<int> ids;
+        processor.Encode(input, &ids);
+        return ids;
+    }
+};
+
+class tensorrtllm : public drogon::HttpController<tensorrtllm>
+{
+public:
+    tensorrtllm()
+    {
+        std::vector<int> text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n ");
+        const int inputLen = text_input.size();
+        const std::vector<int> inOutLen = {inputLen, 500}; // input_length, output_length
+
+        logger = std::make_shared<TllmLogger>();
+        logger->setLevel(nvinfer1::ILogger::Severity::kINFO);
+        // Fixed settings
+        const std::string modelName = "mistral";
+        const std::filesystem::path engineDir = "/app/mistral_engine_2/";
+        const int batchSize = 1;
+        initTrtLlmPlugins(logger.get());
+        // Load model configuration
+        std::filesystem::path jsonFileName = engineDir / "config.json";
+        auto const json = GptJsonConfig::parse(jsonFileName);
+        auto config = json.getModelConfig();
+        modelConfig = std::make_unique<GptModelConfig>(config);
+        auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism());
+        auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName);
+        auto const dtype = modelConfig->getDataType();
+
+        // Set GptSession config
+        sessionConfig.maxBatchSize = batchSize;
+        sessionConfig.maxBeamWidth = 4; // Fixed for simplicity
+        sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1];
+        sessionConfig.cudaGraphMode = false; // Fixed for simplicity
+
+        // Set sampling config
+        samplingConfig.temperature = std::vector<float>{0.0f};
+        samplingConfig.randomSeed = std::vector<uint64_t>{static_cast<uint64_t>(42ull)};
+        samplingConfig.topK = std::vector<SizeType>{40};
+        samplingConfig.topP = std::vector<float>{0.0f};
+        samplingConfig.minLength = std::vector<SizeType>{inOutLen[1]};
+        samplingConfig.repetitionPenalty = std::vector<float>{1.3f};
+
+        gptSession
+            = std::make_unique<GptSession>(sessionConfig, *modelConfig, worldConfig, enginePath.string(), logger);
+    };
+
+    METHOD_LIST_BEGIN
+    // use METHOD_ADD to add your custom processing function here;
+    // METHOD_ADD(tensorrtllm::get, "/{2}/{1}", Get); // path is /tensorrtllm/{arg2}/{arg1}
+    // METHOD_ADD(tensorrtllm::your_method_name, "/{1}/{2}/list", Get); // path is /tensorrtllm/{arg1}/{arg2}/list
+    ADD_METHOD_TO(tensorrtllm::chat_completion, "/testing", Get); // path is
+                                                                  // /absolute/path/{arg1}/{arg2}/list
+
+    METHOD_LIST_END
+    // your declaration of processing function maybe like this:
+    // void get(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)> &&callback, int p1, std::string
+    // p2);
+    void chat_completion(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)>&& callback) const;
+
+private:
+    GptSession::Config sessionConfig{1, 1, 1};
+    SamplingConfig samplingConfig{1};
+    std::unique_ptr<GptModelConfig> modelConfig;
+    Tokenizer nitro_tokenizer{"./tokenizer.model"};
+    std::unique_ptr<GptSession> gptSession;
+    std::shared_ptr<TllmLogger> logger;
+};
diff --git a/cpp/tensorrt_llm/nitro/install_deps.sh b/cpp/tensorrt_llm/nitro/install_deps.sh
old mode 100644
new mode 100755
index 30de5afa4e1..d43257aa08e
--- a/cpp/tensorrt_llm/nitro/install_deps.sh
+++ b/cpp/tensorrt_llm/nitro/install_deps.sh
@@ -1,3 +1,3 @@
 cmake -S ./nitro_deps -B ./build_deps/nitro_deps
 make -C ./build_deps/nitro_deps -j 10
-rm -rf ./build_deps/nitro_deps
\ No newline at end of file
+rm -rf ./build_deps/nitro_deps
diff --git a/cpp/tensorrt_llm/nitro/main.cc b/cpp/tensorrt_llm/nitro/main.cc
index efa387fec3a..97c7ddba686 100644
--- a/cpp/tensorrt_llm/nitro/main.cc
+++ b/cpp/tensorrt_llm/nitro/main.cc
@@ -1,191 +1,11 @@
-#include "sentencepiece_processor.h" -#include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" -#include "tensorrt_llm/plugins/api/tllmPlugin.h" -#include "tensorrt_llm/runtime/gptJsonConfig.h" -#include "tensorrt_llm/runtime/gptSession.h" -#include "tensorrt_llm/runtime/iTensor.h" -#include
"tensorrt_llm/runtime/memoryCounters.h" -#include "tensorrt_llm/runtime/tllmLogger.h" -#include -#include -#include -#include -#include - -using namespace tensorrt_llm::runtime; - -namespace tc = tensorrt_llm::common; -namespace trt = nvinfer1; - -class Tokenizer -{ -private: - sentencepiece::SentencePieceProcessor processor; - - void replaceSubstring(std::string& base, const std::string& from, const std::string& to) - { - size_t start_pos = 0; - while ((start_pos = base.find(from, start_pos)) != std::string::npos) - { - base.replace(start_pos, from.length(), to); - start_pos += to.length(); - } - } - -public: - Tokenizer(const std::string& modelPath) - { - auto status = processor.Load(modelPath); - if (!status.ok()) - { - std::cerr << status.ToString() << std::endl; - } - } - - std::string decodeWithSpace(const int id) - { - std::string text = processor.IdToPiece(id); - replaceSubstring(text, "▁", " "); - return text; - } - - std::vector encode(const std::string& input) - { - std::vector ids; - processor.Encode(input, &ids); - return ids; - } -}; - -namespace -{ -void runBenchmark() -{ - Tokenizer nitro_tokenizer("./tokenizer.model"); - std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); - - // Fixed settings - const std::string modelName = "mistral"; - const std::filesystem::path engineDir = "/app/mistral_engine_2/"; - const int batchSize = 1; - const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 500}; // input_length, output_length - - // Logger setup - auto logger = std::make_shared(); - logger->setLevel(nvinfer1::ILogger::Severity::kINFO); - - initTrtLlmPlugins(logger.get()); - - // Load model configuration - std::filesystem::path jsonFileName = engineDir / "config.json"; - auto const json = GptJsonConfig::parse(jsonFileName); - auto const modelConfig = json.getModelConfig(); - auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); - auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); - auto const dtype = modelConfig.getDataType(); - - GptSession::Config sessionConfig{1, 1, 1}; - sessionConfig.maxBatchSize = batchSize; - sessionConfig.maxBeamWidth = 4; // Fixed for simplicity - sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1]; - sessionConfig.cudaGraphMode = false; // Fixed for simplicity - - SamplingConfig samplingConfig{1}; // Fixed for simplicity - samplingConfig.temperature = std::vector{0.0f}; - samplingConfig.randomSeed = std::vector{static_cast(42ull)}; - samplingConfig.topK = std::vector{40}; - samplingConfig.topP = std::vector{0.0f}; - samplingConfig.minLength = std::vector{inOutLen[1]}; - samplingConfig.repetitionPenalty = std::vector{1.3f}; - - // Initialize session - GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; - // Generate random input IDs within the model's vocabulary range - const int vocabSize = modelConfig.getVocabSize(); - std::vector inputIdsHost = text_input; - - std::cout << "Start Nitro testing session: " << std::endl; - // for (auto& id : inputIdsHost) - // { - // id = rand() % vocabSize; // Random token ID within vocabulary range - // std::cout << id << std::endl; - // } - // // Simplified benchmarking process for a single run - // Note: This example does not include input data preparation or output handling for brevity - - // Input preparation - auto& bufferManager = session.getBufferManager(); - GenerationInput::TensorPtr inputIds - = 
bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); - - std::vector inputLengthsHost(batchSize, inOutLen[0]); - GenerationInput::TensorPtr inputLengths - = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); - - bool inputPacked = modelConfig.usePackedInput(); - - GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked}; - - GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), - bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; - - // Define the callback to stream each generated token - generationOutput.onTokenGenerated = [&bufferManager, inOutLen, &nitro_tokenizer]( - GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) - { - if (!finished) - { - // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens - int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape - // Copy output IDs from GPU to host for printing - std::vector outputIdsHost(outputLength); - bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); - - // Find the last non-zero value in the output IDs starting from the end of the input sequence - int lastNonZeroIndex = -1; - for (int i = outputLength - 1; i >= inOutLen[0]; --i) - { - if (outputIdsHost[i] != 0) - { - lastNonZeroIndex = i; - break; // Stop at the first non-zero token found from the end - } - } - - // Directly print the last non-zero value if found, without using 'step' - if (lastNonZeroIndex != -1) - { - int outTok = outputIdsHost[lastNonZeroIndex]; - if (outTok == 13) - { - std::cout << "\n"; - } - else - { - std::cout << nitro_tokenizer.decodeWithSpace(outTok); - } - } - } - }; - - session.generate(generationOutput, generationInput, samplingConfig); - bufferManager.getStream().synchronize(); -} - -} // namespace - -int main() -{ - try - { - runBenchmark(); - } - catch (const std::exception& e) - { - std::cerr << "Error: " << e.what() << std::endl; - return 1; - } +#include +int main() { + //Set HTTP listener address and port + drogon::app().addListener("0.0.0.0", 5555); + //Load config file + //drogon::app().loadConfigFile("../config.json"); + //drogon::app().loadConfigFile("../config.yaml"); + //Run HTTP framework,the method will block in the internal event loop + drogon::app().run(); return 0; -} \ No newline at end of file +} diff --git a/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt b/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt index c097fcb4b37..cd0d76a719e 100644 --- a/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt @@ -105,4 +105,4 @@ add_dependencies(drogon c-ares jsoncpp brotli) if(ZLIB_USE_STATIC_LIBS) add_dependencies(drogon zlib) endif() -# target_link_libraries( ...) \ No newline at end of file +# target_link_libraries( ...) 
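Note on the rewritten main.cc above: it never names the tensorrtllm controller because Drogon HttpController subclasses self-register their routes (through the METHOD_LIST_BEGIN / ADD_METHOD_TO / METHOD_LIST_END block) at static-initialization time, so linking the controller into the binary is enough for app().run() to serve GET /testing on port 5555. A minimal self-contained sketch of that pattern — the Hello class, the /hello path, and the response body are illustrative, not part of this patch:

#include <drogon/HttpController.h>
#include <drogon/drogon.h>
#include <functional>

// Illustrative only: any HttpController subclass linked into the binary is
// auto-registered, so main() just starts the event loop and routing works.
class Hello : public drogon::HttpController<Hello>
{
public:
    METHOD_LIST_BEGIN
    ADD_METHOD_TO(Hello::greet, "/hello", drogon::Get);
    METHOD_LIST_END

    void greet(const drogon::HttpRequestPtr& req,
        std::function<void(const drogon::HttpResponsePtr&)>&& callback) const
    {
        auto resp = drogon::HttpResponse::newHttpResponse();
        resp->setBody("hello from a self-registered controller");
        callback(resp);
    }
};

int main()
{
    drogon::app().addListener("0.0.0.0", 8080).run();
    return 0;
}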
diff --git a/cpp/tensorrt_llm/nitro/test.cc b/cpp/tensorrt_llm/nitro/test.cc new file mode 100644 index 00000000000..b3e0dd754e6 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/test.cc @@ -0,0 +1,181 @@ +#include "sentencepiece_processor.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/plugins/api/tllmPlugin.h" +#include "tensorrt_llm/runtime/gptJsonConfig.h" +#include "tensorrt_llm/runtime/gptSession.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/memoryCounters.h" +#include "tensorrt_llm/runtime/tllmLogger.h" +#include "thread" +#include +#include +#include +#include +#include +using namespace tensorrt_llm::runtime; + +namespace tc = tensorrt_llm::common; +namespace trt = nvinfer1; + +class Tokenizer +{ +private: + sentencepiece::SentencePieceProcessor processor; + + void replaceSubstring(std::string& base, const std::string& from, const std::string& to) + { + size_t start_pos = 0; + while ((start_pos = base.find(from, start_pos)) != std::string::npos) + { + base.replace(start_pos, from.length(), to); + start_pos += to.length(); + } + } + +public: + Tokenizer(const std::string& modelPath) + { + auto status = processor.Load(modelPath); + if (!status.ok()) + { + std::cerr << status.ToString() << std::endl; + } + } + + std::string decodeWithSpace(const int id) + { + std::string text = processor.IdToPiece(id); + replaceSubstring(text, "▁", " "); + return text; + } + + std::vector encode(const std::string& input) + { + std::vector ids; + processor.Encode(input, &ids); + return ids; + } +}; + +namespace +{ +void runBenchmark() +{ + Tokenizer nitro_tokenizer("./tokenizer.model"); + std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); + + // Fixed settings + const std::string modelName = "mistral"; + const std::filesystem::path engineDir = "/app/mistral_engine_2/"; + const int batchSize = 1; + const int inputLen = text_input.size(); + const std::vector inOutLen = {inputLen, 500}; // input_length, output_length + + // Logger setup + auto logger = std::make_shared(); + logger->setLevel(nvinfer1::ILogger::Severity::kINFO); + + initTrtLlmPlugins(logger.get()); + + // Load model configuration + std::filesystem::path jsonFileName = engineDir / "config.json"; + auto const json = GptJsonConfig::parse(jsonFileName); + auto const modelConfig = json.getModelConfig(); + auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); + auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); + auto const dtype = modelConfig.getDataType(); + + GptSession::Config sessionConfig{1, 1, 1}; + sessionConfig.maxBatchSize = batchSize; + sessionConfig.maxBeamWidth = 4; // Fixed for simplicity + sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1]; + sessionConfig.cudaGraphMode = false; // Fixed for simplicity + + SamplingConfig samplingConfig{1}; // Fixed for simplicity + samplingConfig.temperature = std::vector{0.0f}; + samplingConfig.randomSeed = std::vector{static_cast(42ull)}; + samplingConfig.topK = std::vector{40}; + samplingConfig.topP = std::vector{0.0f}; + samplingConfig.minLength = std::vector{inOutLen[1]}; + samplingConfig.repetitionPenalty = std::vector{1.3f}; + + // Initialize session + GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; + // Generate random input IDs within the model's vocabulary range + std::vector inputIdsHost = text_input; + + std::cout << 
"Start Nitro testing session: " << std::endl; + // Input preparation + auto& bufferManager = session.getBufferManager(); + GenerationInput::TensorPtr inputIds + = bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); + + std::vector inputLengthsHost(batchSize, inOutLen[0]); + GenerationInput::TensorPtr inputLengths + = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); + + bool inputPacked = modelConfig.usePackedInput(); + + GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked}; + + GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), + bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + // Define the callback to stream each generated token + generationOutput.onTokenGenerated = [&bufferManager, inOutLen, &nitro_tokenizer, &generationOutput]( + GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) + { + if (!finished) + { + // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens + int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape + // Copy output IDs from GPU to host for printing + std::vector outputIdsHost(outputLength); + bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); + // Find the last non-zero value in the output IDs starting from the end of the input sequence + int lastNonZeroIndex = -1; + for (int i = outputLength - 1; i >= inOutLen[0]; --i) + { + if (outputIdsHost[i] != 0) + { + lastNonZeroIndex = i; + break; // Stop at the first non-zero token found from the end + } + } + + // Directly print the last non-zero value if found, without using 'step' + if (lastNonZeroIndex != -1) + { + int outTok = outputIdsHost[lastNonZeroIndex]; + if (outTok == 13) + { + std::cout << "\n"; + } + else + { + std::cout << nitro_tokenizer.decodeWithSpace(outTok); + } + } + } + }; + + session.generate(generationOutput, generationInput, samplingConfig); + bufferManager.getStream().synchronize(); +} + +} // namespace + +int main() +{ + try + { + runBenchmark(); + std::this_thread::sleep_for(std::chrono::seconds(10)); + } + catch (const std::exception& e) + { + std::cerr << "Error: " << e.what() << std::endl; + return 1; + } + return 0; +} From 97010fdafd2ea12acc2d9cb055e7536cb848d6b3 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Wed, 6 Mar 2024 07:57:48 +0000 Subject: [PATCH 04/33] better example --- cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc | 8 ++++---- cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index 79c0ad43ef5..db8b24c609a 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -5,9 +5,9 @@ void tensorrtllm::chat_completion( const HttpRequestPtr& req, std::function&& callback) const { - std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); + std::vector text_input = nitro_tokenizer.encode(example_string); const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 500}; // input_length, output_length + const std::vector inOutLen = {inputLen, 1500}; // input_length, output_length const int batchSize = 1; @@ -57,11 +57,11 @@ void tensorrtllm::chat_completion( int 
outTok = outputIdsHost[lastNonZeroIndex]; if (outTok == 13) { - std::cout<<"\n" <nitro_tokenizer.decodeWithSpace(outTok) <nitro_tokenizer.decodeWithSpace(outTok) << std::flush; } } } diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h index 7f5b0c15a03..577a19dccae 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h @@ -72,15 +72,15 @@ class tensorrtllm : public drogon::HttpController public: tensorrtllm() { - std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); + std::vector text_input = nitro_tokenizer.encode(example_string); const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 500}; // input_length, output_length + const std::vector inOutLen = {inputLen, 1500}; // input_length, output_length logger = std::make_shared(); logger->setLevel(nvinfer1::ILogger::Severity::kINFO); // Fixed settings const std::string modelName = "mistral"; - const std::filesystem::path engineDir = "/app/mistral_engine_2/"; + const std::filesystem::path engineDir = "/app/mistral_engine_3/"; const int batchSize = 1; initTrtLlmPlugins(logger.get()); // Load model configuration @@ -130,4 +130,5 @@ class tensorrtllm : public drogon::HttpController Tokenizer nitro_tokenizer{"./tokenizer.model"}; std::unique_ptr gptSession; std::shared_ptr logger; + std::string example_string{"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nPlease tell me a long and sad story<|im_end|>\n<|im_start|>assistant"}; }; From 61f908c25474062e601818dc6c5c238a712d19aa Mon Sep 17 00:00:00 2001 From: automaticcat Date: Thu, 7 Mar 2024 05:17:58 +0000 Subject: [PATCH 05/33] latest demo --- .../nitro/controllers/tensorrtllm.cc | 89 ++++++++++++++----- 1 file changed, 65 insertions(+), 24 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index db8b24c609a..bb4bef870d1 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -1,36 +1,69 @@ #include "tensorrtllm.h" +#include +#include #include +#include #include +#include -void tensorrtllm::chat_completion( - const HttpRequestPtr& req, std::function&& callback) const +void removeZeroes(std::vector& vec) { + vec.erase(std::remove(vec.begin(), vec.end(), 0), vec.end()); +} + +struct inferenceState +{ + int prevPos{0}; + bool isFinished; +}; + +void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function&& callback) +{ + std::shared_ptr inferState = std::make_shared(); + std::vector text_input = nitro_tokenizer.encode(example_string); const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 1500}; // input_length, output_length + const std::vector inOutLen = {inputLen, 2000}; // input_length, output_length const int batchSize = 1; std::vector inputIdsHost = text_input; std::cout << "Start Nitro testing session: " << std::endl; - // Input preparation + auto& bufferManager = gptSession->getBufferManager(); + // Make stopwordlists + + // Your stop word single token "32000" + std::vector stopWordsTokens = {32000, -1, 1, -1}; // Extend with -1 for increased length + + // Tensor creation for stopWordsList + // Assuming the framework allows similar operations for creating custom tensors + // At this point, + // Input preparation GenerationInput::TensorPtr inputIds = bufferManager.copyFrom(inputIdsHost, 
ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); std::vector inputLengthsHost(batchSize, inOutLen[0]); GenerationInput::TensorPtr inputLengths = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); - bool inputPacked = modelConfig->usePackedInput(); - GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked}; + GenerationInput generationInput{0, 32000, inputIds, inputLengths, inputPacked}; + // generationInput.stopWordsList = stopWordsTokensTensor; + + generationInput.stopWordsList = bufferManager.copyFrom(stopWordsTokens, ITensor::makeShape({2,2}), MemoryType::kGPU); + generationInput.stopWordsList->reshape(ITensor::makeShape({1,2,2})); + + LOG_INFO << "here is the shape: " << generationInput.stopWordsList->getShape().d[0]; + + LOG_INFO << "here is the shape: " << generationInput.stopWordsList->getShape().d[1]; GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + // Define the callback to stream each generated token - generationOutput.onTokenGenerated = [&bufferManager, inOutLen, this, &generationOutput]( + generationOutput.onTokenGenerated = [&inferState, &bufferManager, inOutLen, this, &generationOutput]( GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) { if (!finished) @@ -41,29 +74,33 @@ void tensorrtllm::chat_completion( std::vector outputIdsHost(outputLength); bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); // Find the last non-zero value in the output IDs starting from the end of the input sequence - int lastNonZeroIndex = -1; - for (int i = outputLength - 1; i >= inOutLen[0]; --i) + std::vector outputIdsHostDecode(outputIdsHost.begin() + inOutLen[0], outputIdsHost.end()); + removeZeroes(outputIdsHostDecode); + std::string text = nitro_tokenizer.decode(outputIdsHostDecode); + + if (inferState->prevPos > 0 && inferState->prevPos < text.size()) { - if (outputIdsHost[i] != 0) - { - lastNonZeroIndex = i; - break; // Stop at the first non-zero token found from the end - } + // Valid prevPos, proceed with slicing the string from prevPos to the end + std::string stringTok(text.begin() + inferState->prevPos, text.end()); + std::cout << stringTok << std::flush; } - - // Directly print the last non-zero value if found, without using 'step' - if (lastNonZeroIndex != -1) + else if (inferState->prevPos >= text.size()) { - int outTok = outputIdsHost[lastNonZeroIndex]; - if (outTok == 13) - { - std::cout << "\n" << std::flush; - } - else + // prevPos is out of bounds, indicating there might be no new text or an error in logic + // You can handle this case as needed, for example, by logging a warning + inferState->prevPos = text.size(); + } + else + { + // inferState->prevPos is 0 or negative, indicating a potential logic error or initial state + // If there's valid text, you might want to print it all or handle this case specifically + if (!text.empty()) { - std::cout << this->nitro_tokenizer.decodeWithSpace(outTok) << std::flush; + std::cout << text << std::flush; // Optionally print all text if it's the initial state } } + // Update prevPos to the new length of the text for the next iteration + inferState->prevPos = text.size(); } }; @@ -71,6 +108,10 @@ void tensorrtllm::chat_completion( bufferManager.getStream().synchronize(); + auto resp=HttpResponse::newHttpResponse(); + resp->setStatusCode(k200OK); + resp->setBody("Your Page Contents"); + 
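// NOTE: generation above runs to completion before this plain 200 response
+    // is returned; token-by-token streaming arrives in the later "streaming
+    // working checkpoint" patch.
+   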
callback(resp);
     LOG_INFO << "Hello world";
     return;
 };

From 636b6897a005959754be99cde21f9f012ed5a9f27 Mon Sep 17 00:00:00 2001
From: automaticcat
Date: Thu, 7 Mar 2024 05:18:08 +0000
Subject: [PATCH 06/33] latest demo

---
 .../nitro/controllers/tensorrtllm.h           | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
index 577a19dccae..ee7b234d714 100644
--- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
+++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
@@ -1,6 +1,7 @@
 #pragma once

 #include "sentencepiece_processor.h"
+#include
 #include <drogon/HttpController.h>

 #include "sentencepiece_processor.h"
@@ -31,7 +32,7 @@ class Tokenizer
 private:
     sentencepiece::SentencePieceProcessor processor;

-    void replaceSubstring(std::string& base, const std::string& from, const std::string& to) const
+    void replaceSubstring(std::string& base, const std::string& from, const std::string& to)
     {
         size_t start_pos = 0;
         while ((start_pos = base.find(from, start_pos)) != std::string::npos)
@@ -52,14 +53,20 @@ class Tokenizer
         LOG_INFO << "Successfully loaded the tokenizer";
     }

-    std::string decodeWithSpace(const int id) const
+    std::string decodeWithSpace(const int id)
     {
         std::string text = processor.IdToPiece(id);
         replaceSubstring(text, "▁", " ");
         return text;
     }

-    std::vector<int> encode(const std::string& input) const
+    std::string decode(const std::vector<int> ids)
+    {
+        std::string text = processor.DecodeIds(ids);
+        return text;
+    }
+
+    std::vector<int> encode(const std::string& input)
     {
         std::vector<int> ids;
         processor.Encode(input, &ids);
@@ -74,7 +81,7 @@ class tensorrtllm : public drogon::HttpController<tensorrtllm>
 public:
     tensorrtllm()
     {
         std::vector<int> text_input = nitro_tokenizer.encode(example_string);
         const int inputLen = text_input.size();
-        const std::vector<int> inOutLen = {inputLen, 1500}; // input_length, output_length
+        const std::vector<int> inOutLen = {inputLen, 2000}; // input_length, output_length

         logger = std::make_shared<TllmLogger>();
         logger->setLevel(nvinfer1::ILogger::Severity::kINFO);
@@ -94,7 +101,7 @@ class tensorrtllm : public drogon::HttpController<tensorrtllm>

         // Set GptSession config
         sessionConfig.maxBatchSize = batchSize;
-        sessionConfig.maxBeamWidth = 4; // Fixed for simplicity
+        sessionConfig.maxBeamWidth = 1; // Fixed for simplicity
         sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1];
         sessionConfig.cudaGraphMode = false; // Fixed for simplicity
@@ -105,7 +112,6 @@ class tensorrtllm : public drogon::HttpController<tensorrtllm>
         samplingConfig.topP = std::vector<float>{0.0f};
         samplingConfig.minLength = std::vector<SizeType>{inOutLen[1]};
         samplingConfig.repetitionPenalty = std::vector<float>{1.3f};
-
         gptSession
             = std::make_unique<GptSession>(sessionConfig, *modelConfig, worldConfig, enginePath.string(), logger);
     };
@@ -121,14 +127,15 @@ class tensorrtllm : public drogon::HttpController<tensorrtllm>
     // your declaration of processing function maybe like this:
     // void get(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)> &&callback, int p1, std::string
     // p2);
-    void chat_completion(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)>&& callback) const;
+    void chat_completion(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)>&& callback);

 private:
     GptSession::Config sessionConfig{1, 1, 1};
     SamplingConfig samplingConfig{1};
     std::unique_ptr<GptModelConfig> modelConfig;
-    Tokenizer nitro_tokenizer{"./tokenizer.model"};
+    Tokenizer nitro_tokenizer{"./new_chatml_tokenizer.model"};
     std::unique_ptr<GptSession> gptSession;
     std::shared_ptr<TllmLogger> logger;
-    std::string example_string{"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nPlease tell me a long and sad 
story<|im_end|>\n<|im_start|>assistant"}; + std::string example_string{ + "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello there<|im_end|>\n<|im_start|>assistant"}; }; From 03c0892ecc7b935ed3ca1e985d7e8a4d4ceb7dc0 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Thu, 7 Mar 2024 05:46:03 +0000 Subject: [PATCH 07/33] remove redundant include --- cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h index ee7b234d714..11001ac8508 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h @@ -5,17 +5,12 @@ #include #include "sentencepiece_processor.h" -#include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/gptModelConfig.h" #include "tensorrt_llm/runtime/gptSession.h" -#include "tensorrt_llm/runtime/iTensor.h" -#include "tensorrt_llm/runtime/memoryCounters.h" #include "tensorrt_llm/runtime/samplingConfig.h" #include "tensorrt_llm/runtime/tllmLogger.h" -#include "thread" #include #include #include From e537b9ccdc81cd9577d9d79f5545cab5b2bacd8a Mon Sep 17 00:00:00 2001 From: automaticcat Date: Thu, 7 Mar 2024 11:08:19 +0000 Subject: [PATCH 08/33] streaming working checkpoint --- .../nitro/controllers/tensorrtllm.cc | 213 +++++++++++---- .../nitro/controllers/tensorrtllm.h | 31 +-- cpp/tensorrt_llm/nitro/utils/nitro_utils.h | 251 ++++++++++++++++++ 3 files changed, 423 insertions(+), 72 deletions(-) create mode 100644 cpp/tensorrt_llm/nitro/utils/nitro_utils.h diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index bb4bef870d1..6257f5afccd 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -1,69 +1,119 @@ #include "tensorrtllm.h" +#include "tensorrt_llm/runtime/generationInput.h" +#include "tensorrt_llm/runtime/generationOutput.h" +#include "tensorrt_llm/runtime/samplingConfig.h" +#include "utils/nitro_utils.h" +#include +#include #include #include #include +#include #include #include #include -void removeZeroes(std::vector& vec) +void removeId(std::vector& vec, int id) { - vec.erase(std::remove(vec.begin(), vec.end(), 0), vec.end()); + vec.erase(std::remove(vec.begin(), vec.end(), id), vec.end()); } struct inferenceState { int prevPos{0}; bool isFinished; + std::queue textsToStream; + std::mutex queueMutex; // Mutex to protect access to textsToStream }; -void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function&& callback) +// Only support single token stopping point now +std::string create_return_json(const std::string& id, const std::string& model, const std::string& content, + Json::Value finish_reason = Json::Value()) { - std::shared_ptr inferState = std::make_shared(); + Json::Value root; - std::vector text_input = nitro_tokenizer.encode(example_string); - const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 2000}; // input_length, output_length + root["id"] = id; + root["model"] = model; + root["created"] = static_cast(std::time(nullptr)); + root["object"] = "chat.completion.chunk"; - const int batchSize = 1; + Json::Value choicesArray(Json::arrayValue); + Json::Value choice; - std::vector inputIdsHost = 
text_input; + choice["index"] = 0; + Json::Value delta; + delta["content"] = content; + choice["delta"] = delta; + choice["finish_reason"] = finish_reason; - std::cout << "Start Nitro testing session: " << std::endl; + choicesArray.append(choice); + root["choices"] = choicesArray; - auto& bufferManager = gptSession->getBufferManager(); - // Make stopwordlists + Json::StreamWriterBuilder writer; + writer["indentation"] = ""; // This sets the indentation to an empty string, + // producing compact output. + return Json::writeString(writer, root); +} - // Your stop word single token "32000" - std::vector stopWordsTokens = {32000, -1, 1, -1}; // Extend with -1 for increased length +GenerationInput::TensorPtr tensorrtllm::getTensorSingleStopWordList(int stopToken) +{ - // Tensor creation for stopWordsList - // Assuming the framework allows similar operations for creating custom tensors - // At this point, - // Input preparation - GenerationInput::TensorPtr inputIds - = bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); + std::vector stopWordsTokens = {stopToken, -1, 1, -1}; // Extend with -1 for increased length + return gptSession->getBufferManager().copyFrom(stopWordsTokens, ITensor::makeShape({1, 2, 2}), MemoryType::kGPU); +} - std::vector inputLengthsHost(batchSize, inOutLen[0]); +GenerationInput tensorrtllm::createGenerationInput(std::vector inputIdsHost) +{ + int inputLen = inputIdsHost.size(); + std::vector inputLengthsHost(batchSize, inputLen); GenerationInput::TensorPtr inputLengths - = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); - bool inputPacked = modelConfig->usePackedInput(); + = gptSession->getBufferManager().copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); + GenerationInput::TensorPtr inputIds = gptSession->getBufferManager().copyFrom( + inputIdsHost, ITensor::makeShape({batchSize, inputLen}), MemoryType::kGPU); - GenerationInput generationInput{0, 32000, inputIds, inputLengths, inputPacked}; + GenerationInput generationInput{0, 0, inputIds, inputLengths, modelConfig->usePackedInput()}; - // generationInput.stopWordsList = stopWordsTokensTensor; + generationInput.stopWordsList = getTensorSingleStopWordList(32000); + return generationInput; +} - generationInput.stopWordsList = bufferManager.copyFrom(stopWordsTokens, ITensor::makeShape({2,2}), MemoryType::kGPU); - generationInput.stopWordsList->reshape(ITensor::makeShape({1,2,2})); +GenerationOutput tensorrtllm::createGenerationOutput() +{ + GenerationOutput generationOutput{ + gptSession->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), + gptSession->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + return generationOutput; +} - LOG_INFO << "here is the shape: " << generationInput.stopWordsList->getShape().d[0]; - LOG_INFO << "here is the shape: " << generationInput.stopWordsList->getShape().d[1]; - GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), - bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; +void inferenceThread(std::shared_ptr inferState, + std::vector inputIdsHost, + std::function callback, + tensorrtllm* self) +{ + const int inputLen = inputIdsHost.size(); + const int outputLen = 2048 - inputLen; + + // Create sampling config + SamplingConfig samplingConfig{1}; + samplingConfig.temperature = std::vector{0.0f}; + samplingConfig.randomSeed = 
std::vector{static_cast(42ull)}; + samplingConfig.topK = std::vector{40}; + samplingConfig.topP = std::vector{0.0f}; + samplingConfig.minLength = std::vector{outputLen}; + samplingConfig.repetitionPenalty = std::vector{1.3f}; - // Define the callback to stream each generated token - generationOutput.onTokenGenerated = [&inferState, &bufferManager, inOutLen, this, &generationOutput]( + std::cout << "Start Nitro testing session: " << std::endl; + + // Input preparation + + GenerationInput generationInput = self->createGenerationInput(inputIdsHost); + + GenerationOutput generationOutput = self->createGenerationOutput(); + + + // Define the callback to stream each generated token + generationOutput.onTokenGenerated = [&inferState, inputLen, outputLen, self, &generationOutput]( GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) { if (!finished) @@ -72,47 +122,96 @@ void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::functiongetShape().d[2]; // Get the length of output IDs based on the tensor shape // Copy output IDs from GPU to host for printing std::vector outputIdsHost(outputLength); - bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); + self->gptSession->getBufferManager().copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); // Find the last non-zero value in the output IDs starting from the end of the input sequence - std::vector outputIdsHostDecode(outputIdsHost.begin() + inOutLen[0], outputIdsHost.end()); - removeZeroes(outputIdsHostDecode); - std::string text = nitro_tokenizer.decode(outputIdsHostDecode); + std::vector outputIdsHostDecode(outputIdsHost.begin() + inputLen, outputIdsHost.end()); + removeId(outputIdsHostDecode, 0); + removeId(outputIdsHostDecode, 32000); + std::string text = self->nitro_tokenizer.decode(outputIdsHostDecode); if (inferState->prevPos > 0 && inferState->prevPos < text.size()) { // Valid prevPos, proceed with slicing the string from prevPos to the end std::string stringTok(text.begin() + inferState->prevPos, text.end()); - std::cout << stringTok << std::flush; + std::lock_guard guard(inferState->queueMutex); // Protect access with a lock + inferState->textsToStream.push(stringTok); } else if (inferState->prevPos >= text.size()) { - // prevPos is out of bounds, indicating there might be no new text or an error in logic - // You can handle this case as needed, for example, by logging a warning inferState->prevPos = text.size(); } - else - { - // inferState->prevPos is 0 or negative, indicating a potential logic error or initial state - // If there's valid text, you might want to print it all or handle this case specifically - if (!text.empty()) - { - std::cout << text << std::flush; // Optionally print all text if it's the initial state - } - } - // Update prevPos to the new length of the text for the next iteration inferState->prevPos = text.size(); } }; + // The rest of the logic inside the `chat_completion` remains unchanged... 
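    // How the streaming handoff works: onTokenGenerated fires once per generation
    // step. Each call decodes the full output so far, slices off the new suffix
    // past inferState->prevPos, and pushes that fragment onto the mutex-guarded
    // textsToStream queue. The chunked-content provider in chat_completion drains
    // the queue from the HTTP thread, so queueMutex is the only synchronization
    // point between this inference worker thread and the response stream.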
+ // After finishing the setup, call the inference logic + self->gptSession->generate(generationOutput, generationInput, samplingConfig); +} + + +void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function&& callback) +{ + std::shared_ptr inferState = std::make_shared(); + + std::vector inputIdsHost = nitro_tokenizer.encode(example_string); + const int inputLen = inputIdsHost.size(); + const int outputLen = 2048 - inputLen; - gptSession->generate(generationOutput, generationInput, samplingConfig); + // Create sampling config + SamplingConfig samplingConfig{1}; + samplingConfig.temperature = std::vector{0.0f}; + samplingConfig.randomSeed = std::vector{static_cast(42ull)}; + samplingConfig.topK = std::vector{40}; + samplingConfig.topP = std::vector{0.0f}; + samplingConfig.minLength = std::vector{outputLen}; + samplingConfig.repetitionPenalty = std::vector{1.3f}; - bufferManager.getStream().synchronize(); + std::cout << "Start Nitro testing session: " << std::endl; + + // Input preparation + + std::thread infThread(inferenceThread, inferState, inputIdsHost, callback, this); + infThread.detach(); // Detach the thread to allow it to run independently + + + auto chunked_content_provider = [inferState](char* pBuffer, std::size_t nBuffSize) -> std::size_t + { + std::cout << "EMPTY"; + if (!pBuffer) + { + LOG_INFO << "Connection closed or buffer is null. Reset context"; + return 0; // Indicate no more data to send + } + + while (true) // Continuously check if the queue is not empty + { + std::unique_lock lock(inferState->queueMutex); // Lock the queue for exclusive access + if (!inferState->textsToStream.empty()) + { + + std::string rawText = inferState->textsToStream.front(); + const std::string textToStream + = "data: " + create_return_json(nitro_utils::generate_random_string(20), "_", rawText) + "\n\n"; + inferState->textsToStream.pop(); + lock.unlock(); // Unlock as soon as possible + + // Ensure we do not exceed the buffer size. Truncate if necessary. 
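                // Caveat: textToStream has already been popped from the queue, so if
                // it were ever larger than nBuffSize the truncated tail would simply
                // be dropped and the client would see a malformed "data:" frame.
                // Single-token deltas make that unlikely in practice, but a robust
                // version would stash the remainder for the next invocation.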
+ std::size_t bytesToWrite = std::min(nBuffSize, textToStream.size()); + + // Copy the text to the provided buffer + std::memcpy(pBuffer, textToStream.data(), bytesToWrite); + return bytesToWrite; // Return the number of bytes written to the buffer + } + else + { + // If the queue is empty, release the lock and wait before trying again + lock.unlock(); + } + } + }; - auto resp=HttpResponse::newHttpResponse(); - resp->setStatusCode(k200OK); - resp->setBody("Your Page Contents"); - callback(resp); - LOG_INFO << "Hello world"; + auto streamResponse = nitro_utils::nitroStreamResponse(chunked_content_provider); + callback(streamResponse); return; }; diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h index 11001ac8508..9a4c9b7dbc0 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h @@ -6,6 +6,8 @@ #include "sentencepiece_processor.h" #include "tensorrt_llm/plugins/api/tllmPlugin.h" +#include "tensorrt_llm/runtime/generationInput.h" +#include "tensorrt_llm/runtime/generationOutput.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/gptModelConfig.h" #include "tensorrt_llm/runtime/gptSession.h" @@ -83,7 +85,6 @@ class tensorrtllm : public drogon::HttpController // Fixed settings const std::string modelName = "mistral"; const std::filesystem::path engineDir = "/app/mistral_engine_3/"; - const int batchSize = 1; initTrtLlmPlugins(logger.get()); // Load model configuration std::filesystem::path jsonFileName = engineDir / "config.json"; @@ -94,19 +95,13 @@ class tensorrtllm : public drogon::HttpController auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); auto const dtype = modelConfig->getDataType(); - // Set gptsessionconfig + // Currently doing fixed session config sessionConfig.maxBatchSize = batchSize; sessionConfig.maxBeamWidth = 1; // Fixed for simplicity - sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1]; - sessionConfig.cudaGraphMode = false; // Fixed for simplicity - - // Set smapling config - samplingConfig.temperature = std::vector{0.0f}; - samplingConfig.randomSeed = std::vector{static_cast(42ull)}; - samplingConfig.topK = std::vector{40}; - samplingConfig.topP = std::vector{0.0f}; - samplingConfig.minLength = std::vector{inOutLen[1]}; - samplingConfig.repetitionPenalty = std::vector{1.3f}; + sessionConfig.maxSequenceLength = 2048; + sessionConfig.cudaGraphMode = true; // Fixed for simplicity + + // Init gptSession gptSession = std::make_unique(sessionConfig, *modelConfig, worldConfig, enginePath.string(), logger); }; @@ -124,13 +119,19 @@ class tensorrtllm : public drogon::HttpController // p2); void chat_completion(const HttpRequestPtr& req, std::function&& callback); + std::unique_ptr gptSession; + GenerationInput::TensorPtr getTensorSingleStopWordList(int stopToken); + GenerationInput createGenerationInput(std::vector inputIds); + GenerationOutput createGenerationOutput(); + Tokenizer nitro_tokenizer{"./new_chatml_tokenizer.model"}; + private: GptSession::Config sessionConfig{1, 1, 1}; SamplingConfig samplingConfig{1}; std::unique_ptr modelConfig; - Tokenizer nitro_tokenizer{"./new_chatml_tokenizer.model"}; - std::unique_ptr gptSession; std::shared_ptr logger; std::string example_string{ - "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello there<|im_end|>\n<|im_start|>assistant"}; + "<|im_start|>system\nYou are a helpful 
assistant<|im_end|>\n<|im_start|>user\nPlease write a long and sad " + "story<|im_end|>\n<|im_start|>assistant"}; + int batchSize = 1; }; diff --git a/cpp/tensorrt_llm/nitro/utils/nitro_utils.h b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h new file mode 100644 index 00000000000..c5dda96eb66 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h @@ -0,0 +1,251 @@ +#pragma once +#include "cstdio" +#include "random" +#include "string" +#include +#include +#include +#include +#include +#include +#include +#include +// Include platform-specific headers +#ifdef _WIN32 +#include +#include +#else +#include +#endif + +namespace nitro_utils { + +inline std::string models_folder = "./models"; + +inline std::string extractBase64(const std::string &input) { + std::regex pattern("base64,(.*)"); + std::smatch match; + + if (std::regex_search(input, match, pattern)) { + std::string base64_data = match[1]; + base64_data = base64_data.substr(0, base64_data.length() - 1); + return base64_data; + } + + return ""; +} + +// Helper function to encode data to Base64 +inline std::string base64Encode(const std::vector &data) { + static const char encodingTable[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + std::string encodedData; + int i = 0; + int j = 0; + unsigned char array3[3]; + unsigned char array4[4]; + + for (unsigned char c : data) { + array3[i++] = c; + if (i == 3) { + array4[0] = (array3[0] & 0xfc) >> 2; + array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); + array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); + array4[3] = array3[2] & 0x3f; + + for (i = 0; i < 4; i++) + encodedData += encodingTable[array4[i]]; + i = 0; + } + } + + if (i) { + for (j = i; j < 3; j++) + array3[j] = '\0'; + + array4[0] = (array3[0] & 0xfc) >> 2; + array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); + array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); + + for (j = 0; j < i + 1; j++) + encodedData += encodingTable[array4[j]]; + + while (i++ < 3) + encodedData += '='; + } + + return encodedData; +} + +// Function to load an image and convert it to Base64 +inline std::string imageToBase64(const std::string &imagePath) { + std::ifstream imageFile(imagePath, std::ios::binary); + if (!imageFile.is_open()) { + throw std::runtime_error("Could not open the image file."); + } + + std::vector buffer(std::istreambuf_iterator(imageFile), + {}); + return base64Encode(buffer); +} + +// Helper function to generate a unique filename +inline std::string generateUniqueFilename(const std::string &prefix, + const std::string &extension) { + // Get current time as a timestamp + auto now = std::chrono::system_clock::now(); + auto now_ms = std::chrono::time_point_cast(now); + auto epoch = now_ms.time_since_epoch(); + auto value = std::chrono::duration_cast(epoch); + + // Generate a random number + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(1000, 9999); + + std::stringstream ss; + ss << prefix << value.count() << "_" << dis(gen) << extension; + return ss.str(); +} + +inline void +processLocalImage(const std::string &localPath, + std::function callback) { + try { + std::string base64Image = imageToBase64(localPath); + callback(base64Image); // Invoke the callback with the Base64 string + } catch (const std::exception &e) { + std::cerr << "Error during processing: " << e.what() << std::endl; + } +} + +inline std::vector listFilesInDir(const std::string &path) { + std::vector files; + +#ifdef _WIN32 + // 
Windows-specific code + WIN32_FIND_DATA findFileData; + HANDLE hFind = FindFirstFile((path + "\\*").c_str(), &findFileData); + + if (hFind != INVALID_HANDLE_VALUE) { + do { + if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) { + files.push_back(findFileData.cFileName); + } + } while (FindNextFile(hFind, &findFileData) != 0); + FindClose(hFind); + } +#else + // POSIX-specific code (Linux, Unix, MacOS) + DIR *dir; + struct dirent *ent; + + if ((dir = opendir(path.c_str())) != NULL) { + while ((ent = readdir(dir)) != NULL) { + if (ent->d_type == DT_REG) { // Check if it's a regular file + files.push_back(ent->d_name); + } + } + closedir(dir); + } +#endif + + return files; +} + +inline std::string rtrim(const std::string &str) { + size_t end = str.find_last_not_of("\n\t "); + return (end == std::string::npos) ? "" : str.substr(0, end + 1); +} + +inline std::string generate_random_string(std::size_t length) { + const std::string characters = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + std::random_device rd; + std::mt19937 generator(rd()); + + std::uniform_int_distribution<> distribution(0, characters.size() - 1); + + std::string random_string(length, '\0'); + std::generate_n(random_string.begin(), length, + [&]() { return characters[distribution(generator)]; }); + + return random_string; +} + +inline void nitro_logo() { + std::string rainbowColors[] = { + "\033[93m", // Yellow + "\033[94m", // Blue + }; + + std::string resetColor = "\033[0m"; + std::string asciiArt = + " ___ ___ ___ \n" + " /__/ ___ ___ / /\\ / /\\ \n" + " \\ \\:\\ / /\\ / /\\ / /::\\ / /::\\ " + " \n" + " \\ \\:\\ / /:/ / /:/ / /:/\\:\\ / /:/\\:\\ " + " \n" + " _____\\__\\:\\ /__/::\\ / /:/ / /:/ \\:\\ / /:/ " + "\\:\\ \n" + " /__/::::::::\\ \\__\\/\\:\\__ / /::\\ /__/:/ /:/___ /__/:/ " + "\\__\\:\\\n" + " \\ \\:\\~~\\~~\\/ \\ \\:\\/\\ /__/:/\\:\\ \\ \\:\\/:::::/ \\ " + "\\:\\ / /:/\n" + " \\ \\:\\ ~~~ \\__\\::/ \\__\\/ \\:\\ \\ \\::/~~~~ \\ " + "\\:\\ /:/ \n" + " \\ \\:\\ /__/:/ \\ \\:\\ \\ \\:\\ \\ " + "\\:\\/:/ \n" + " \\ \\:\\ \\__\\/ \\__\\/ \\ \\:\\ \\ " + "\\::/ \n" + " \\__\\/ \\__\\/ \\__\\/ " + "\n"; + + int colorIndex = 0; + + for (char c : asciiArt) { + if (c == '\n') { + std::cout << resetColor << c; + colorIndex = 0; + } else { + std::cout << rainbowColors[colorIndex % 2] << c; + colorIndex++; + } + } + + std::cout << resetColor; // Reset color at the endreturn; +} + +inline drogon::HttpResponsePtr nitroHttpResponse() { + auto resp = drogon::HttpResponse::newHttpResponse(); +#ifdef ALLOW_ALL_CORS + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); +#endif + return resp; +} + +inline drogon::HttpResponsePtr nitroHttpJsonResponse(const Json::Value &data) { + auto resp = drogon::HttpResponse::newHttpJsonResponse(data); +#ifdef ALLOW_ALL_CORS + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); +#endif + return resp; +}; + +inline drogon::HttpResponsePtr nitroStreamResponse( + const std::function &callback, + const std::string &attachmentFileName = "") { + auto resp = drogon::HttpResponse::newStreamResponse( + callback, attachmentFileName, drogon::CT_NONE, "text/event-stream"); +#ifdef ALLOW_ALL_CORS + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); +#endif + return resp; +} + +} // namespace nitro_utils From 07ee3545eab3abcacd5ffb3439a950e2705a5194 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Thu, 7 Mar 2024 14:22:50 +0000 Subject: [PATCH 
09/33] latest demo --- .../nitro/controllers/tensorrtllm.cc | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index 6257f5afccd..5bf68ebf017 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -85,11 +85,8 @@ GenerationOutput tensorrtllm::createGenerationOutput() return generationOutput; } - -void inferenceThread(std::shared_ptr inferState, - std::vector inputIdsHost, - std::function callback, - tensorrtllm* self) +void inferenceThread(std::shared_ptr inferState, std::vector inputIdsHost, + std::function callback, tensorrtllm* self) { const int inputLen = inputIdsHost.size(); const int outputLen = 2048 - inputLen; @@ -111,8 +108,7 @@ void inferenceThread(std::shared_ptr inferState, GenerationOutput generationOutput = self->createGenerationOutput(); - - // Define the callback to stream each generated token + // Define the callback to stream each generated token generationOutput.onTokenGenerated = [&inferState, inputLen, outputLen, self, &generationOutput]( GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) { @@ -141,14 +137,16 @@ void inferenceThread(std::shared_ptr inferState, inferState->prevPos = text.size(); } inferState->prevPos = text.size(); + return; } + std::lock_guard guard(inferState->queueMutex); // Protect access with a lock + inferState->textsToStream.push("[DONE]"); }; // The rest of the logic inside the `chat_completion` remains unchanged... // After finishing the setup, call the inference logic self->gptSession->generate(generationOutput, generationInput, samplingConfig); } - void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function&& callback) { std::shared_ptr inferState = std::make_shared(); @@ -173,16 +171,19 @@ void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function std::size_t { - std::cout << "EMPTY"; if (!pBuffer) { LOG_INFO << "Connection closed or buffer is null. 
Reset context"; return 0; // Indicate no more data to send } + if (inferState->isFinished) + { + return 0; + } + while (true) // Continuously check if the queue is not empty { std::unique_lock lock(inferState->queueMutex); // Lock the queue for exclusive access @@ -190,6 +191,18 @@ void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::functiontextsToStream.front(); + if (rawText == "[DONE]") + { + LOG_INFO << "End of result"; + const std::string str + = "data: " + create_return_json(nitro_utils::generate_random_string(20), "_", "", "stop") + + "\n\n" + "data: [DONE]" + "\n\n"; + + std::size_t nRead = std::min(str.size(), nBuffSize); + memcpy(pBuffer, str.data(), nRead); + inferState->isFinished = true; + return nRead; + } const std::string textToStream = "data: " + create_return_json(nitro_utils::generate_random_string(20), "_", rawText) + "\n\n"; inferState->textsToStream.pop(); From 3a9a5af49729c2c6568666f68ae6be2fed10c69b Mon Sep 17 00:00:00 2001 From: automaticcat Date: Fri, 8 Mar 2024 02:23:59 +0000 Subject: [PATCH 10/33] openai compatible chat --- 3rdparty/cutlass | 2 +- .../nitro/controllers/tensorrtllm.cc | 55 ++++++++++++++++++- .../nitro/controllers/tensorrtllm.h | 16 ++++-- .../nitro/models/chat_completion_request.h | 36 ++++++++++++ 4 files changed, 100 insertions(+), 9 deletions(-) create mode 100644 cpp/tensorrt_llm/nitro/models/chat_completion_request.h diff --git a/3rdparty/cutlass b/3rdparty/cutlass index 8236f30675b..a8f2c80db05 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit 8236f30675bbe98f81d11c05764b77bfcb25b8cc +Subproject commit a8f2c80db0564c74f4efccac71993b971dfc448b diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index 5bf68ebf017..8990b1aed7d 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -1,4 +1,6 @@ #include "tensorrtllm.h" +#include "models/chat_completion_request.h" +#include "nlohmann/json.hpp" #include "tensorrt_llm/runtime/generationInput.h" #include "tensorrt_llm/runtime/generationOutput.h" #include "tensorrt_llm/runtime/samplingConfig.h" @@ -13,6 +15,8 @@ #include #include +using json = nlohmann::json; + void removeId(std::vector& vec, int id) { vec.erase(std::remove(vec.begin(), vec.end(), id), vec.end()); @@ -147,11 +151,58 @@ void inferenceThread(std::shared_ptr inferState, std::vectorgptSession->generate(generationOutput, generationInput, samplingConfig); } -void tensorrtllm::chat_completion(const HttpRequestPtr& req, std::function&& callback) +void tensorrtllm::chat_completion( + inferences::ChatCompletionRequest&& completion, std::function&& callback) { + + std::string formatted_input = pre_prompt; + + nlohmann::json data; + + data["stream"] = completion.stream; + data["n_predict"] = completion.max_tokens; + data["top_p"] = completion.top_p; + data["temperature"] = completion.temperature; + data["frequency_penalty"] = completion.frequency_penalty; + data["presence_penalty"] = completion.presence_penalty; + const Json::Value& messages = completion.messages; + + // Format the input from user + for (const auto& message : messages) + { + std::string input_role = message["role"].asString(); + std::string role; + if (input_role == "user") + { + role = user_prompt; + std::string content = message["content"].asString(); + formatted_input += role + content; + } + else if (input_role == "assistant") + { + role = ai_prompt; + std::string content = 
message["content"].asString(); + formatted_input += role + content; + } + else if (input_role == "system") + { + role = system_prompt; + std::string content = message["content"].asString(); + formatted_input = role + content + formatted_input; + } + else + { + role = input_role; + std::string content = message["content"].asString(); + formatted_input += role + content; + } + } + formatted_input += ai_prompt; + // Format the input from user + std::shared_ptr inferState = std::make_shared(); - std::vector inputIdsHost = nitro_tokenizer.encode(example_string); + std::vector inputIdsHost = nitro_tokenizer.encode(formatted_input); const int inputLen = inputIdsHost.size(); const int outputLen = 2048 - inputLen; diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h index 9a4c9b7dbc0..bd770c03b58 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h @@ -1,5 +1,6 @@ #pragma once +#include "drogon/HttpTypes.h" #include "sentencepiece_processor.h" #include #include @@ -20,6 +21,8 @@ #include #include +#include "models/chat_completion_request.h" + using namespace drogon; using namespace tensorrt_llm::runtime; @@ -76,10 +79,6 @@ class tensorrtllm : public drogon::HttpController public: tensorrtllm() { - std::vector text_input = nitro_tokenizer.encode(example_string); - const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 2000}; // input_length, output_length - logger = std::make_shared(); logger->setLevel(nvinfer1::ILogger::Severity::kINFO); // Fixed settings @@ -110,14 +109,15 @@ class tensorrtllm : public drogon::HttpController // use METHOD_ADD to add your custom processing function here; // METHOD_ADD(tensorrtllm::get, "/{2}/{1}", Get); // path is /tensorrtllm/{arg2}/{arg1} // METHOD_ADD(tensorrtllm::your_method_name, "/{1}/{2}/list", Get); // path is /tensorrtllm/{arg1}/{arg2}/list - ADD_METHOD_TO(tensorrtllm::chat_completion, "/testing", Get); // path is + ADD_METHOD_TO(tensorrtllm::chat_completion, "/v1/chat/completions", Post); // path is // /absolute/path/{arg1}/{arg2}/list METHOD_LIST_END // your declaration of processing function maybe like this: // void get(const HttpRequestPtr& req, std::function &&callback, int p1, std::string // p2); - void chat_completion(const HttpRequestPtr& req, std::function&& callback); + void chat_completion( + inferences::ChatCompletionRequest&& completion, std::function&& callback); std::unique_ptr gptSession; GenerationInput::TensorPtr getTensorSingleStopWordList(int stopToken); @@ -133,5 +133,9 @@ class tensorrtllm : public drogon::HttpController std::string example_string{ "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nPlease write a long and sad " "story<|im_end|>\n<|im_start|>assistant"}; + std::string user_prompt{"<|im_end|>\n<|im_start|>user\n"}; + std::string ai_prompt{"<|im_end|>\n<|im_start|>assistant\n"}; + std::string system_prompt{"<|im_start|>system\n"}; + std::string pre_prompt; int batchSize = 1; }; diff --git a/cpp/tensorrt_llm/nitro/models/chat_completion_request.h b/cpp/tensorrt_llm/nitro/models/chat_completion_request.h new file mode 100644 index 00000000000..bd802d67e02 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/models/chat_completion_request.h @@ -0,0 +1,36 @@ +#pragma once +#include + +namespace inferences { +struct ChatCompletionRequest { + bool stream = false; + int max_tokens = 500; + float top_p = 0.95; + float temperature = 0.8; + float frequency_penalty = 
0; + float presence_penalty = 0; + Json::Value stop = Json::Value(Json::arrayValue); + Json::Value messages = Json::Value(Json::arrayValue); +}; +} // namespace inferences + +namespace drogon { +template <> +inline inferences::ChatCompletionRequest fromRequest(const HttpRequest& req) { + auto jsonBody = req.getJsonObject(); + inferences::ChatCompletionRequest completion; + if (jsonBody) { + completion.stream = (*jsonBody).get("stream", false).asBool(); + completion.max_tokens = (*jsonBody).get("max_tokens", 500).asInt(); + completion.top_p = (*jsonBody).get("top_p", 0.95).asFloat(); + completion.temperature = (*jsonBody).get("temperature", 0.8).asFloat(); + completion.frequency_penalty = + (*jsonBody).get("frequency_penalty", 0).asFloat(); + completion.presence_penalty = + (*jsonBody).get("presence_penalty", 0).asFloat(); + completion.messages = (*jsonBody)["messages"]; + completion.stop = (*jsonBody)["stop"]; + } + return completion; +} +} // namespace inferences From 1845d8e4278b9e6a7eef63713156c84a4ac7da58 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Fri, 8 Mar 2024 04:37:04 +0000 Subject: [PATCH 11/33] feat: add splash screen and model load --- .../nitro/controllers/tensorrtllm.cc | 57 ++- .../nitro/controllers/tensorrtllm.h | 40 +- cpp/tensorrt_llm/nitro/main.cc | 76 +++- cpp/tensorrt_llm/nitro/utils/nitro_utils.h | 397 ++++++++++-------- 4 files changed, 356 insertions(+), 214 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index 8990b1aed7d..999d7b18a82 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -16,6 +16,7 @@ #include using json = nlohmann::json; +using namespace inferences; void removeId(std::vector& vec, int id) { @@ -127,7 +128,7 @@ void inferenceThread(std::shared_ptr inferState, std::vector outputIdsHostDecode(outputIdsHost.begin() + inputLen, outputIdsHost.end()); removeId(outputIdsHostDecode, 0); removeId(outputIdsHostDecode, 32000); - std::string text = self->nitro_tokenizer.decode(outputIdsHostDecode); + std::string text = self->nitro_tokenizer->decode(outputIdsHostDecode); if (inferState->prevPos > 0 && inferState->prevPos < text.size()) { @@ -202,7 +203,7 @@ void tensorrtllm::chat_completion( std::shared_ptr inferState = std::make_shared(); - std::vector inputIdsHost = nitro_tokenizer.encode(formatted_input); + std::vector inputIdsHost = nitro_tokenizer->encode(formatted_input); const int inputLen = inputIdsHost.size(); const int outputLen = 2048 - inputLen; @@ -279,4 +280,56 @@ void tensorrtllm::chat_completion( return; }; +void tensorrtllm::loadModel(const HttpRequestPtr& req, std::function&& callback) +{ + const auto& jsonBody = req->getJsonObject(); + + if (!jsonBody) + { + Json::Value jsonResp; + jsonResp["message"] = "Require params!"; + auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); + callback(resp); + return; + } + + const std::filesystem::path engineDir = jsonBody->operator[]("engine_path").asString(); + int ctx_len = jsonBody->get("ctx_len", 2048).asInt(); + + logger = std::make_shared(); + logger->setLevel(nvinfer1::ILogger::Severity::kINFO); + // Fixed settings + const std::string modelName = "mistral"; + initTrtLlmPlugins(logger.get()); + // Load model configuration + std::filesystem::path jsonFileName = engineDir / "config.json"; + std::filesystem::path tokenizerModelName = engineDir / "tokenizer.model"; + + nitro_tokenizer = std::make_unique(tokenizerModelName.string()); + LOG_INFO << 
"Loaded tokenizer"; + + auto const json = GptJsonConfig::parse(jsonFileName); + auto config = json.getModelConfig(); + modelConfig = std::make_unique(config); + auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); + auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); + LOG_INFO << "Engine Path : " << enginePath.string(); + auto const dtype = modelConfig->getDataType(); + + // Currently doing fixed session config + sessionConfig.maxBatchSize = batchSize; + sessionConfig.maxBeamWidth = 1; // Fixed for simplicity + sessionConfig.maxSequenceLength = ctx_len; + sessionConfig.cudaGraphMode = true; // Fixed for simplicity + + // Init gptSession + gptSession = std::make_unique(sessionConfig, *modelConfig, worldConfig, enginePath.string(), logger); + // Model loaded successfully + Json::Value jsonResp; + jsonResp["message"] = "Model loaded successfully"; + auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); + callback(resp); + return; +}; + // Add definition of your processing function here diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h index bd770c03b58..0ecae873d27 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h @@ -74,43 +74,18 @@ class Tokenizer } }; +namespace inferences +{ + class tensorrtllm : public drogon::HttpController { public: - tensorrtllm() - { - logger = std::make_shared(); - logger->setLevel(nvinfer1::ILogger::Severity::kINFO); - // Fixed settings - const std::string modelName = "mistral"; - const std::filesystem::path engineDir = "/app/mistral_engine_3/"; - initTrtLlmPlugins(logger.get()); - // Load model configuration - std::filesystem::path jsonFileName = engineDir / "config.json"; - auto const json = GptJsonConfig::parse(jsonFileName); - auto config = json.getModelConfig(); - modelConfig = std::make_unique(config); - auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); - auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); - auto const dtype = modelConfig->getDataType(); - - // Currently doing fixed session config - sessionConfig.maxBatchSize = batchSize; - sessionConfig.maxBeamWidth = 1; // Fixed for simplicity - sessionConfig.maxSequenceLength = 2048; - sessionConfig.cudaGraphMode = true; // Fixed for simplicity - - // Init gptSession - gptSession - = std::make_unique(sessionConfig, *modelConfig, worldConfig, enginePath.string(), logger); - }; + tensorrtllm(){}; METHOD_LIST_BEGIN // use METHOD_ADD to add your custom processing function here; - // METHOD_ADD(tensorrtllm::get, "/{2}/{1}", Get); // path is /tensorrtllm/{arg2}/{arg1} - // METHOD_ADD(tensorrtllm::your_method_name, "/{1}/{2}/list", Get); // path is /tensorrtllm/{arg1}/{arg2}/list ADD_METHOD_TO(tensorrtllm::chat_completion, "/v1/chat/completions", Post); // path is - // /absolute/path/{arg1}/{arg2}/list + METHOD_ADD(tensorrtllm::loadModel, "loadmodel", Post); METHOD_LIST_END // your declaration of processing function maybe like this: @@ -119,11 +94,12 @@ class tensorrtllm : public drogon::HttpController void chat_completion( inferences::ChatCompletionRequest&& completion, std::function&& callback); + void loadModel(const HttpRequestPtr& req, std::function&& callback); std::unique_ptr gptSession; GenerationInput::TensorPtr getTensorSingleStopWordList(int stopToken); GenerationInput createGenerationInput(std::vector 
inputIds); GenerationOutput createGenerationOutput(); - Tokenizer nitro_tokenizer{"./new_chatml_tokenizer.model"}; + std::unique_ptr nitro_tokenizer; private: GptSession::Config sessionConfig{1, 1, 1}; @@ -139,3 +115,5 @@ class tensorrtllm : public drogon::HttpController std::string pre_prompt; int batchSize = 1; }; + +} // namespace inferences diff --git a/cpp/tensorrt_llm/nitro/main.cc b/cpp/tensorrt_llm/nitro/main.cc index 97c7ddba686..730253f74f3 100644 --- a/cpp/tensorrt_llm/nitro/main.cc +++ b/cpp/tensorrt_llm/nitro/main.cc @@ -1,11 +1,73 @@ +#include "utils/nitro_utils.h" +#include // for PATH_MAX +#include #include -int main() { - //Set HTTP listener address and port - drogon::app().addListener("0.0.0.0", 5555); - //Load config file - //drogon::app().loadConfigFile("../config.json"); - //drogon::app().loadConfigFile("../config.yaml"); - //Run HTTP framework,the method will block in the internal event loop +#include + +#if defined(__APPLE__) && defined(__MACH__) +#include // for dirname() +#include +#elif defined(__linux__) +#include // for dirname() +#include // for readlink() +#elif defined(_WIN32) +#include +#undef max +#else +#error "Unsupported platform!" +#endif + +int main(int argc, char* argv[]) +{ + int thread_num = 1; + std::string host = "127.0.0.1"; + int port = 3928; + std::string uploads_folder_path; + + // Number of nitro threads + if (argc > 1) + { + thread_num = std::atoi(argv[1]); + } + + // Check for host argument + if (argc > 2) + { + host = argv[2]; + } + + // Check for port argument + if (argc > 3) + { + port = std::atoi(argv[3]); // Convert string argument to int + } + + // Uploads folder path + if (argc > 4) + { + uploads_folder_path = argv[4]; + } + + int logical_cores = std::thread::hardware_concurrency(); + int drogon_thread_num = 1; // temporarily set thread num to 1 + nitro_utils::nitro_logo(); +#ifdef NITRO_VERSION + LOG_INFO << "Nitro version: " << NITRO_VERSION; +#else + LOG_INFO << "Nitro version: undefined"; +#endif + LOG_INFO << "Server started, listening at: " << host << ":" << port; + LOG_INFO << "Please load your model"; + drogon::app().addListener(host, port); + drogon::app().setThreadNum(drogon_thread_num); + if (!uploads_folder_path.empty()) + { + LOG_INFO << "Drogon uploads folder is at: " << uploads_folder_path; + drogon::app().setUploadPath(uploads_folder_path); + } + LOG_INFO << "Number of thread is:" << drogon::app().getThreadNum(); + drogon::app().run(); + return 0; } diff --git a/cpp/tensorrt_llm/nitro/utils/nitro_utils.h b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h index c5dda96eb66..628dc5f46b1 100644 --- a/cpp/tensorrt_llm/nitro/utils/nitro_utils.h +++ b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h @@ -12,240 +12,289 @@ #include // Include platform-specific headers #ifdef _WIN32 -#include #include +#include #else #include #endif -namespace nitro_utils { +namespace nitro_utils +{ inline std::string models_folder = "./models"; -inline std::string extractBase64(const std::string &input) { - std::regex pattern("base64,(.*)"); - std::smatch match; +inline std::string extractBase64(const std::string& input) +{ + std::regex pattern("base64,(.*)"); + std::smatch match; - if (std::regex_search(input, match, pattern)) { - std::string base64_data = match[1]; - base64_data = base64_data.substr(0, base64_data.length() - 1); - return base64_data; - } + if (std::regex_search(input, match, pattern)) + { + std::string base64_data = match[1]; + base64_data = base64_data.substr(0, base64_data.length() - 1); + return base64_data; + } - return ""; + 
return ""; } // Helper function to encode data to Base64 -inline std::string base64Encode(const std::vector &data) { - static const char encodingTable[] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - std::string encodedData; - int i = 0; - int j = 0; - unsigned char array3[3]; - unsigned char array4[4]; - - for (unsigned char c : data) { - array3[i++] = c; - if (i == 3) { - array4[0] = (array3[0] & 0xfc) >> 2; - array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); - array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); - array4[3] = array3[2] & 0x3f; - - for (i = 0; i < 4; i++) - encodedData += encodingTable[array4[i]]; - i = 0; +inline std::string base64Encode(const std::vector& data) +{ + static const char encodingTable[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + std::string encodedData; + int i = 0; + int j = 0; + unsigned char array3[3]; + unsigned char array4[4]; + + for (unsigned char c : data) + { + array3[i++] = c; + if (i == 3) + { + array4[0] = (array3[0] & 0xfc) >> 2; + array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); + array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); + array4[3] = array3[2] & 0x3f; + + for (i = 0; i < 4; i++) + encodedData += encodingTable[array4[i]]; + i = 0; + } } - } - if (i) { - for (j = i; j < 3; j++) - array3[j] = '\0'; + if (i) + { + for (j = i; j < 3; j++) + array3[j] = '\0'; - array4[0] = (array3[0] & 0xfc) >> 2; - array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); - array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); + array4[0] = (array3[0] & 0xfc) >> 2; + array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4); + array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6); - for (j = 0; j < i + 1; j++) - encodedData += encodingTable[array4[j]]; + for (j = 0; j < i + 1; j++) + encodedData += encodingTable[array4[j]]; - while (i++ < 3) - encodedData += '='; - } + while (i++ < 3) + encodedData += '='; + } - return encodedData; + return encodedData; } // Function to load an image and convert it to Base64 -inline std::string imageToBase64(const std::string &imagePath) { - std::ifstream imageFile(imagePath, std::ios::binary); - if (!imageFile.is_open()) { - throw std::runtime_error("Could not open the image file."); - } - - std::vector buffer(std::istreambuf_iterator(imageFile), - {}); - return base64Encode(buffer); +inline std::string imageToBase64(const std::string& imagePath) +{ + std::ifstream imageFile(imagePath, std::ios::binary); + if (!imageFile.is_open()) + { + throw std::runtime_error("Could not open the image file."); + } + + std::vector buffer(std::istreambuf_iterator(imageFile), {}); + return base64Encode(buffer); } // Helper function to generate a unique filename -inline std::string generateUniqueFilename(const std::string &prefix, - const std::string &extension) { - // Get current time as a timestamp - auto now = std::chrono::system_clock::now(); - auto now_ms = std::chrono::time_point_cast(now); - auto epoch = now_ms.time_since_epoch(); - auto value = std::chrono::duration_cast(epoch); - - // Generate a random number - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution<> dis(1000, 9999); - - std::stringstream ss; - ss << prefix << value.count() << "_" << dis(gen) << extension; - return ss.str(); +inline std::string generateUniqueFilename(const std::string& prefix, const std::string& extension) +{ + // Get current time as a timestamp + auto now = 
std::chrono::system_clock::now(); + auto now_ms = std::chrono::time_point_cast(now); + auto epoch = now_ms.time_since_epoch(); + auto value = std::chrono::duration_cast(epoch); + + // Generate a random number + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(1000, 9999); + + std::stringstream ss; + ss << prefix << value.count() << "_" << dis(gen) << extension; + return ss.str(); } -inline void -processLocalImage(const std::string &localPath, - std::function callback) { - try { - std::string base64Image = imageToBase64(localPath); - callback(base64Image); // Invoke the callback with the Base64 string - } catch (const std::exception &e) { - std::cerr << "Error during processing: " << e.what() << std::endl; - } +inline void processLocalImage(const std::string& localPath, std::function callback) +{ + try + { + std::string base64Image = imageToBase64(localPath); + callback(base64Image); // Invoke the callback with the Base64 string + } + catch (const std::exception& e) + { + std::cerr << "Error during processing: " << e.what() << std::endl; + } } -inline std::vector listFilesInDir(const std::string &path) { - std::vector files; +inline std::vector listFilesInDir(const std::string& path) +{ + std::vector files; #ifdef _WIN32 - // Windows-specific code - WIN32_FIND_DATA findFileData; - HANDLE hFind = FindFirstFile((path + "\\*").c_str(), &findFileData); - - if (hFind != INVALID_HANDLE_VALUE) { - do { - if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) { - files.push_back(findFileData.cFileName); - } - } while (FindNextFile(hFind, &findFileData) != 0); - FindClose(hFind); - } + // Windows-specific code + WIN32_FIND_DATA findFileData; + HANDLE hFind = FindFirstFile((path + "\\*").c_str(), &findFileData); + + if (hFind != INVALID_HANDLE_VALUE) + { + do + { + if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) + { + files.push_back(findFileData.cFileName); + } + } while (FindNextFile(hFind, &findFileData) != 0); + FindClose(hFind); + } #else - // POSIX-specific code (Linux, Unix, MacOS) - DIR *dir; - struct dirent *ent; - - if ((dir = opendir(path.c_str())) != NULL) { - while ((ent = readdir(dir)) != NULL) { - if (ent->d_type == DT_REG) { // Check if it's a regular file - files.push_back(ent->d_name); - } + // POSIX-specific code (Linux, Unix, MacOS) + DIR* dir; + struct dirent* ent; + + if ((dir = opendir(path.c_str())) != NULL) + { + while ((ent = readdir(dir)) != NULL) + { + if (ent->d_type == DT_REG) + { // Check if it's a regular file + files.push_back(ent->d_name); + } + } + closedir(dir); } - closedir(dir); - } #endif - return files; + return files; } -inline std::string rtrim(const std::string &str) { - size_t end = str.find_last_not_of("\n\t "); - return (end == std::string::npos) ? "" : str.substr(0, end + 1); +inline std::string rtrim(const std::string& str) +{ + size_t end = str.find_last_not_of("\n\t "); + return (end == std::string::npos) ? 
"" : str.substr(0, end + 1); } -inline std::string generate_random_string(std::size_t length) { - const std::string characters = - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; +inline std::string generate_random_string(std::size_t length) +{ + const std::string characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - std::random_device rd; - std::mt19937 generator(rd()); + std::random_device rd; + std::mt19937 generator(rd()); - std::uniform_int_distribution<> distribution(0, characters.size() - 1); + std::uniform_int_distribution<> distribution(0, characters.size() - 1); - std::string random_string(length, '\0'); - std::generate_n(random_string.begin(), length, - [&]() { return characters[distribution(generator)]; }); + std::string random_string(length, '\0'); + std::generate_n(random_string.begin(), length, [&]() { return characters[distribution(generator)]; }); - return random_string; + return random_string; } -inline void nitro_logo() { - std::string rainbowColors[] = { - "\033[93m", // Yellow - "\033[94m", // Blue - }; - - std::string resetColor = "\033[0m"; - std::string asciiArt = - " ___ ___ ___ \n" - " /__/ ___ ___ / /\\ / /\\ \n" - " \\ \\:\\ / /\\ / /\\ / /::\\ / /::\\ " - " \n" - " \\ \\:\\ / /:/ / /:/ / /:/\\:\\ / /:/\\:\\ " - " \n" - " _____\\__\\:\\ /__/::\\ / /:/ / /:/ \\:\\ / /:/ " - "\\:\\ \n" - " /__/::::::::\\ \\__\\/\\:\\__ / /::\\ /__/:/ /:/___ /__/:/ " - "\\__\\:\\\n" - " \\ \\:\\~~\\~~\\/ \\ \\:\\/\\ /__/:/\\:\\ \\ \\:\\/:::::/ \\ " - "\\:\\ / /:/\n" - " \\ \\:\\ ~~~ \\__\\::/ \\__\\/ \\:\\ \\ \\::/~~~~ \\ " - "\\:\\ /:/ \n" - " \\ \\:\\ /__/:/ \\ \\:\\ \\ \\:\\ \\ " - "\\:\\/:/ \n" - " \\ \\:\\ \\__\\/ \\__\\/ \\ \\:\\ \\ " - "\\::/ \n" - " \\__\\/ \\__\\/ \\__\\/ " - "\n"; - - int colorIndex = 0; - - for (char c : asciiArt) { - if (c == '\n') { - std::cout << resetColor << c; - colorIndex = 0; - } else { - std::cout << rainbowColors[colorIndex % 2] << c; - colorIndex++; +inline void nitro_logo() +{ + std::string rainbowColors[] = { + "\033[93m", // Yellow + "\033[94m", // Blue + }; + + std::string resetColor = "\033[0m"; + std::string asciiArt + = " ___ ___ ___ \n" + " /__/ ___ ___ / /\\ / /\\ \n" + " \\ \\:\\ / /\\ / /\\ / /::\\ / /::\\ " + " \n" + " \\ \\:\\ / /:/ / /:/ / /:/\\:\\ / /:/\\:\\ " + " \n" + " _____\\__\\:\\ /__/::\\ / /:/ / /:/ \\:\\ / /:/ " + "\\:\\ \n" + " /__/::::::::\\ \\__\\/\\:\\__ / /::\\ /__/:/ /:/___ /__/:/ " + "\\__\\:\\\n" + " \\ \\:\\~~\\~~\\/ \\ \\:\\/\\ /__/:/\\:\\ \\ \\:\\/:::::/ \\ " + "\\:\\ / /:/\n" + " \\ \\:\\ ~~~ \\__\\::/ \\__\\/ \\:\\ \\ \\::/~~~~ \\ " + "\\:\\ /:/ \n" + " \\ \\:\\ /__/:/ \\ \\:\\ \\ \\:\\ \\ " + "\\:\\/:/ \n" + " \\ \\:\\ \\__\\/ \\__\\/ \\ \\:\\ \\ " + "\\::/ \n" + " \\__\\/ \\__\\/ \\__\\/ " + "\n"; + + std::string asciiArtRTX = R"( +------------------------ + ____ ______ __ __ ________ __ +___/ __ \__ __/_ |/ / __ __ \__ | / / +__/ /_/ /_/ / _\ / _/ / / /_ |/ / +_/ _, _/_/ / _/ | / /_/ /_ /| / +/_/ |_| /_/ /_/|_| \____/ /_/ |_/ + +)"; + + int colorIndex = 0; + + for (char c : asciiArt) + { + if (c == '\n') + { + std::cout << resetColor << c; + colorIndex = 0; + } + else + { + std::cout << rainbowColors[colorIndex % 2] << c; + colorIndex++; + } + } + + std::cout << resetColor; // Reset color at the endreturn; + + for (char c : asciiArtRTX) + { + if (c == '\n') + { + std::cout << resetColor << c; + colorIndex = 0; + } + else + { + std::cout << "\033[1;32m" << c; // bright blue + colorIndex++; + } } - } - std::cout << resetColor; // Reset color at the endreturn; + 
std::cout << resetColor; // Reset color at the endreturn; } -inline drogon::HttpResponsePtr nitroHttpResponse() { - auto resp = drogon::HttpResponse::newHttpResponse(); +inline drogon::HttpResponsePtr nitroHttpResponse() +{ + auto resp = drogon::HttpResponse::newHttpResponse(); #ifdef ALLOW_ALL_CORS - LOG_INFO << "Respond for all cors!"; - resp->addHeader("Access-Control-Allow-Origin", "*"); + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); #endif - return resp; + return resp; } -inline drogon::HttpResponsePtr nitroHttpJsonResponse(const Json::Value &data) { - auto resp = drogon::HttpResponse::newHttpJsonResponse(data); +inline drogon::HttpResponsePtr nitroHttpJsonResponse(const Json::Value& data) +{ + auto resp = drogon::HttpResponse::newHttpJsonResponse(data); #ifdef ALLOW_ALL_CORS - LOG_INFO << "Respond for all cors!"; - resp->addHeader("Access-Control-Allow-Origin", "*"); + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); #endif - return resp; + return resp; }; inline drogon::HttpResponsePtr nitroStreamResponse( - const std::function &callback, - const std::string &attachmentFileName = "") { - auto resp = drogon::HttpResponse::newStreamResponse( - callback, attachmentFileName, drogon::CT_NONE, "text/event-stream"); + const std::function& callback, const std::string& attachmentFileName = "") +{ + auto resp + = drogon::HttpResponse::newStreamResponse(callback, attachmentFileName, drogon::CT_NONE, "text/event-stream"); #ifdef ALLOW_ALL_CORS - LOG_INFO << "Respond for all cors!"; - resp->addHeader("Access-Control-Allow-Origin", "*"); + LOG_INFO << "Respond for all cors!"; + resp->addHeader("Access-Control-Allow-Origin", "*"); #endif - return resp; + return resp; } } // namespace nitro_utils From 514806865bd1dcd6a76b43941f833c69ce58ddbb Mon Sep 17 00:00:00 2001 From: automaticcat Date: Fri, 8 Mar 2024 05:28:21 +0000 Subject: [PATCH 12/33] change logo --- cpp/tensorrt_llm/nitro/utils/nitro_utils.h | 33 +++++++--------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/utils/nitro_utils.h b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h index 628dc5f46b1..5e382bd82fe 100644 --- a/cpp/tensorrt_llm/nitro/utils/nitro_utils.h +++ b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h @@ -193,32 +193,19 @@ inline std::string generate_random_string(std::size_t length) inline void nitro_logo() { std::string rainbowColors[] = { - "\033[93m", // Yellow "\033[94m", // Blue }; std::string resetColor = "\033[0m"; - std::string asciiArt - = " ___ ___ ___ \n" - " /__/ ___ ___ / /\\ / /\\ \n" - " \\ \\:\\ / /\\ / /\\ / /::\\ / /::\\ " - " \n" - " \\ \\:\\ / /:/ / /:/ / /:/\\:\\ / /:/\\:\\ " - " \n" - " _____\\__\\:\\ /__/::\\ / /:/ / /:/ \\:\\ / /:/ " - "\\:\\ \n" - " /__/::::::::\\ \\__\\/\\:\\__ / /::\\ /__/:/ /:/___ /__/:/ " - "\\__\\:\\\n" - " \\ \\:\\~~\\~~\\/ \\ \\:\\/\\ /__/:/\\:\\ \\ \\:\\/:::::/ \\ " - "\\:\\ / /:/\n" - " \\ \\:\\ ~~~ \\__\\::/ \\__\\/ \\:\\ \\ \\::/~~~~ \\ " - "\\:\\ /:/ \n" - " \\ \\:\\ /__/:/ \\ \\:\\ \\ \\:\\ \\ " - "\\:\\/:/ \n" - " \\ \\:\\ \\__\\/ \\__\\/ \\ \\:\\ \\ " - "\\::/ \n" - " \\__\\/ \\__\\/ \\__\\/ " - "\n"; + std::string asciiArt = R"( +███╗ ██╗██╗████████╗██████╗ ██████╗ +████╗ ██║██║╚══██╔══╝██╔══██╗██╔═══██╗ +██╔██╗ ██║██║ ██║ ██████╔╝██║ ██║ +██║╚██╗██║██║ ██║ ██╔══██╗██║ ██║ +██║ ╚████║██║ ██║ ██║ ██║╚██████╔╝ +╚═╝ ╚═══╝╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝ + + )"; std::string asciiArtRTX = R"( ------------------------ @@ -241,7 +228,7 @@ _/ _, _/_/ / _/ | / 
/_/ /_ /| / } else { - std::cout << rainbowColors[colorIndex % 2] << c; + std::cout << "\033[94m" << c; colorIndex++; } } From b88bc63fd1d85e392bf4fbe476c25ad8a57f2bb5 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Fri, 8 Mar 2024 06:57:33 +0000 Subject: [PATCH 13/33] remove redundant test --- cpp/tensorrt_llm/nitro/CMakeLists.txt | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt index 6aac914bbb5..419e62b19cb 100644 --- a/cpp/tensorrt_llm/nitro/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -83,18 +83,5 @@ target_include_directories(nitro PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_sources(nitro PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC}) - - -# test -add_executable(test test.cc) - -target_link_libraries( - test PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece ) - -target_compile_features(test PRIVATE cxx_std_17) -target_compile_definitions(test PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}") -# - - add_dependencies(nitro_proj nitro) From 98bef79a5009c3b6eb975482abc46331eb9b9e54 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Fri, 8 Mar 2024 06:57:51 +0000 Subject: [PATCH 14/33] remove test.cc --- cpp/tensorrt_llm/nitro/test.cc | 181 --------------------------------- 1 file changed, 181 deletions(-) delete mode 100644 cpp/tensorrt_llm/nitro/test.cc diff --git a/cpp/tensorrt_llm/nitro/test.cc b/cpp/tensorrt_llm/nitro/test.cc deleted file mode 100644 index b3e0dd754e6..00000000000 --- a/cpp/tensorrt_llm/nitro/test.cc +++ /dev/null @@ -1,181 +0,0 @@ -#include "sentencepiece_processor.h" -#include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" -#include "tensorrt_llm/plugins/api/tllmPlugin.h" -#include "tensorrt_llm/runtime/gptJsonConfig.h" -#include "tensorrt_llm/runtime/gptSession.h" -#include "tensorrt_llm/runtime/iTensor.h" -#include "tensorrt_llm/runtime/memoryCounters.h" -#include "tensorrt_llm/runtime/tllmLogger.h" -#include "thread" -#include -#include -#include -#include -#include -using namespace tensorrt_llm::runtime; - -namespace tc = tensorrt_llm::common; -namespace trt = nvinfer1; - -class Tokenizer -{ -private: - sentencepiece::SentencePieceProcessor processor; - - void replaceSubstring(std::string& base, const std::string& from, const std::string& to) - { - size_t start_pos = 0; - while ((start_pos = base.find(from, start_pos)) != std::string::npos) - { - base.replace(start_pos, from.length(), to); - start_pos += to.length(); - } - } - -public: - Tokenizer(const std::string& modelPath) - { - auto status = processor.Load(modelPath); - if (!status.ok()) - { - std::cerr << status.ToString() << std::endl; - } - } - - std::string decodeWithSpace(const int id) - { - std::string text = processor.IdToPiece(id); - replaceSubstring(text, "▁", " "); - return text; - } - - std::vector encode(const std::string& input) - { - std::vector ids; - processor.Encode(input, &ids); - return ids; - } -}; - -namespace -{ -void runBenchmark() -{ - Tokenizer nitro_tokenizer("./tokenizer.model"); - std::vector text_input = nitro_tokenizer.encode("How to survive in the Abyss chapter 1:\n\n "); - - // Fixed settings - const std::string modelName = "mistral"; - const std::filesystem::path engineDir = "/app/mistral_engine_2/"; - const int batchSize = 1; - const int inputLen = text_input.size(); - const std::vector inOutLen = {inputLen, 500}; // input_length, output_length - - // Logger setup - auto logger = std::make_shared(); - 
logger->setLevel(nvinfer1::ILogger::Severity::kINFO); - - initTrtLlmPlugins(logger.get()); - - // Load model configuration - std::filesystem::path jsonFileName = engineDir / "config.json"; - auto const json = GptJsonConfig::parse(jsonFileName); - auto const modelConfig = json.getModelConfig(); - auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); - auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName); - auto const dtype = modelConfig.getDataType(); - - GptSession::Config sessionConfig{1, 1, 1}; - sessionConfig.maxBatchSize = batchSize; - sessionConfig.maxBeamWidth = 4; // Fixed for simplicity - sessionConfig.maxSequenceLength = inOutLen[0] + inOutLen[1]; - sessionConfig.cudaGraphMode = false; // Fixed for simplicity - - SamplingConfig samplingConfig{1}; // Fixed for simplicity - samplingConfig.temperature = std::vector{0.0f}; - samplingConfig.randomSeed = std::vector{static_cast(42ull)}; - samplingConfig.topK = std::vector{40}; - samplingConfig.topP = std::vector{0.0f}; - samplingConfig.minLength = std::vector{inOutLen[1]}; - samplingConfig.repetitionPenalty = std::vector{1.3f}; - - // Initialize session - GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; - // Generate random input IDs within the model's vocabulary range - std::vector inputIdsHost = text_input; - - std::cout << "Start Nitro testing session: " << std::endl; - // Input preparation - auto& bufferManager = session.getBufferManager(); - GenerationInput::TensorPtr inputIds - = bufferManager.copyFrom(inputIdsHost, ITensor::makeShape({batchSize, inOutLen[0]}), MemoryType::kGPU); - - std::vector inputLengthsHost(batchSize, inOutLen[0]); - GenerationInput::TensorPtr inputLengths - = bufferManager.copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); - - bool inputPacked = modelConfig.usePackedInput(); - - GenerationInput generationInput{0, 0, inputIds, inputLengths, inputPacked}; - - GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), - bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; - // Define the callback to stream each generated token - generationOutput.onTokenGenerated = [&bufferManager, inOutLen, &nitro_tokenizer, &generationOutput]( - GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) - { - if (!finished) - { - // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens - int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape - // Copy output IDs from GPU to host for printing - std::vector outputIdsHost(outputLength); - bufferManager.copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU); - // Find the last non-zero value in the output IDs starting from the end of the input sequence - int lastNonZeroIndex = -1; - for (int i = outputLength - 1; i >= inOutLen[0]; --i) - { - if (outputIdsHost[i] != 0) - { - lastNonZeroIndex = i; - break; // Stop at the first non-zero token found from the end - } - } - - // Directly print the last non-zero value if found, without using 'step' - if (lastNonZeroIndex != -1) - { - int outTok = outputIdsHost[lastNonZeroIndex]; - if (outTok == 13) - { - std::cout << "\n"; - } - else - { - std::cout << nitro_tokenizer.decodeWithSpace(outTok); - } - } - } - }; - - session.generate(generationOutput, generationInput, samplingConfig); - bufferManager.getStream().synchronize(); 
-} - -} // namespace - -int main() -{ - try - { - runBenchmark(); - std::this_thread::sleep_for(std::chrono::seconds(10)); - } - catch (const std::exception& e) - { - std::cerr << "Error: " << e.what() << std::endl; - return 1; - } - return 0; -} From 0500a5ee2cfbed0178a26bbf112499a259007141 Mon Sep 17 00:00:00 2001 From: hiro Date: Fri, 8 Mar 2024 15:05:08 +0700 Subject: [PATCH 15/33] Add Dockerfile and update cmakelist --- BUILD_ENGINE_MODEL.md | 0 BUILD_NITRO.md | 0 Dockerfile.nitro.windows | 237 +++++++++++++++++++++++++ Dockerfile.tensorrt-llm-python.windows | 233 ++++++++++++++++++++++++ cpp/tensorrt_llm/nitro/CMakeLists.txt | 8 +- 5 files changed, 477 insertions(+), 1 deletion(-) create mode 100644 BUILD_ENGINE_MODEL.md create mode 100644 BUILD_NITRO.md create mode 100644 Dockerfile.nitro.windows create mode 100644 Dockerfile.tensorrt-llm-python.windows diff --git a/BUILD_ENGINE_MODEL.md b/BUILD_ENGINE_MODEL.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/BUILD_NITRO.md b/BUILD_NITRO.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/Dockerfile.nitro.windows b/Dockerfile.nitro.windows new file mode 100644 index 00000000000..0503267a19f --- /dev/null +++ b/Dockerfile.nitro.windows @@ -0,0 +1,237 @@ +# Use the Windows Server Core 2019 image. +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 + +# Use the Windows Server Core 2019 image. +FROM mcr.microsoft.com/windows/servercore:ltsc2019 + +# Restore the default Windows shell for correct batch processing. +# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. 
+# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl.exe -SL --output vs_buildtools.exe https://delta.jan.ai/windows-container-dependencies/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. + && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... 
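+# (The Chocolatey install script is assumed to honor the chocolateyVersion
+# environment variable, which is what the ENV pin below relies on.)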
+ +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX + +#RUN powershell -Command \ +# $ErrorActionPreference = 'Stop'; \ +# curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ +# --output "cuda_11_installer.exe"; \ +# Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \ +# Remove-Item cuda_11_installer.exe -Force + +# The above command-line installation method installs NVTX headers at +# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\ +# CMake can't find this location for some reason. +# Instead, we just copy the older NvToolsExt version to where CMake expects. +# This assumes NvToolsExt was installed on the host machine using the +# CUDA 11.8 GUI installer and copied to the build context +COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] + +# ----------------------------------------------------------------------------- + +# Create a working directory +WORKDIR "C:\\\\workspace" + +# ----------------------------------------------------------------------------- + +# Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/tensorrt-9.2.0.5.windows10.x86_64.cuda-12.2.llm.beta.zip \ + --output TensorRT-9.2.0.5.zip; \ + Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \ + Remove-Item TensorRT-9.2.0.5.zip -Force + +# Add TensorRT libs to Path +RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib" + +# Install TensorRT Python wheel +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl + +# ----------------------------------------------------------------------------- + +# Copy cuDNN into the working directory +# This assumes cuDNN exists on the host machine in the build context +COPY ["cuDNN", "cuDNN"] + +# Add cuDNN libs and bin to Path. +RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" + +# ----------------------------------------------------------------------------- + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+# -----------------------------------------------------------------------------
+
+# Additional dependencies to build Nitro
+
+# The commands below let MSVC recognize the CUDA compiler
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations'
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations'
+
+
+# Set git safe directory for nitro clone dependencies
+RUN powershell -Command \
+    git config --global --add safe.directory '*'
+
+# Package for nitro compile
+RUN powershell -Command \
+    choco install pkgconfiglite -y
+
+RUN powershell -Command \
+    choco install Ninja -y
+
+# Requirements to build tensorrt-llm on windows
+COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt
+COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt
+RUN powershell -Command \
+    cd tensorrt-llm-nitro; \
+    pip install --no-cache-dir -r .\requirements-dev-windows.txt
+
+COPY ./.git ./tensorrt-llm-nitro/.git
+
+COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty
+
+COPY ./cpp ./tensorrt-llm-nitro/cpp
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
diff --git a/Dockerfile.tensorrt-llm-python.windows b/Dockerfile.tensorrt-llm-python.windows
new file mode 100644
index 00000000000..ee61239d001
--- /dev/null
+++ b/Dockerfile.tensorrt-llm-python.windows
@@ -0,0 +1,233 @@
+# Use the Windows Server Core 2019 image.
+# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022
+
+# Use the Windows Server Core 2019 image.
+FROM mcr.microsoft.com/windows/servercore:ltsc2019
+
+# Restore the default Windows shell for correct batch processing.
+# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. +# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl.exe -SL --output vs_buildtools.exe https://delta.jan.ai/windows-container-dependencies/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. 
+ && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/windows-container-dependencies/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... + +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX + +#RUN powershell -Command \ +# $ErrorActionPreference = 'Stop'; \ +# curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ +# --output "cuda_11_installer.exe"; \ +# Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \ +# Remove-Item cuda_11_installer.exe -Force + +# The above command-line installation method installs NVTX headers at +# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\ +# CMake can't find this location for some reason. +# Instead, we just copy the older NvToolsExt version to where CMake expects. 
+# This assumes NvToolsExt was installed on the host machine using the
+# CUDA 11.8 GUI installer and copied to the build context
+COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"]
+
+# -----------------------------------------------------------------------------
+
+# Create a working directory
+WORKDIR "C:\\\\workspace"
+
+# -----------------------------------------------------------------------------
+
+# Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    curl.exe https://delta.jan.ai/windows-container-dependencies/tensorrt-9.2.0.5.windows10.x86_64.cuda-12.2.llm.beta.zip \
+    --output TensorRT-9.2.0.5.zip; \
+    Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \
+    Remove-Item TensorRT-9.2.0.5.zip -Force
+
+# Add TensorRT libs to Path
+RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib"
+
+# Install TensorRT Python wheel
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl
+
+# -----------------------------------------------------------------------------
+
+# Copy cuDNN into the working directory
+# This assumes cuDNN exists on the host machine in the build context
+COPY ["cuDNN", "cuDNN"]
+
+# Add cuDNN libs and bin to Path.
+RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;"
+
+# -----------------------------------------------------------------------------
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+# -----------------------------------------------------------------------------
+
+# Additional dependencies to build Nitro
+
+# The commands below let MSVC recognize the CUDA compiler
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations'
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations'
+
+
+# Set git safe directory for nitro clone dependencies
+RUN powershell -Command \
+    git config --global --add safe.directory '*'
+
+# Package for nitro compile
+RUN powershell -Command \
+    choco install pkgconfiglite -y
+
+RUN powershell -Command \
+    choco install Ninja -y
+
+# Requirements to build tensorrt-llm on windows
+COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt
+COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt
+RUN powershell -Command \
+    cd tensorrt-llm-nitro; \
+    pip install --no-cache-dir -r .\requirements-dev-windows.txt
+
+COPY . ./tensorrt-llm-nitro/
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt index 419e62b19cb..a0667eba820 100644 --- a/cpp/tensorrt_llm/nitro/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -47,8 +47,14 @@ find_package(PkgConfig REQUIRED) find_package(Drogon CONFIG REQUIRED) # Use pkg-config to find the SentencePiece library -pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece) +if(NOT WIN32) # Linux + # Use pkg-config to find the SentencePiece library + pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece) +else() # Windows + set(SENTENCEPIECE_INCLUDE_DIRS "C:/workspace/tensorrt-llm-nitro/cpp/tensorrt_llm/nitro/build_deps/_install/include") + set(SENTENCEPIECE_LIBRARY_DIRS "C:/workspace/tensorrt-llm-nitro/cpp/tensorrt_llm/nitro/build_deps/_install/lib") +endif() include_directories(${PROJECT_SOURCE_DIR}/include ${SENTENCEPIECE_INCLUDE_DIRS}) From 7819d27f1b51d26c73d8aa224763c1312f909286 Mon Sep 17 00:00:00 2001 From: Hien To Date: Sat, 9 Mar 2024 16:52:43 +0700 Subject: [PATCH 16/33] Add Dockerfile for github action runner to build tensorrt llm --- .github/runners/Dockerfile.windows.runner | 256 ++++++++++++++++++++++ .github/runners/runner.ps1 | 2 + Dockerfile.nitro.windows | 56 +++-- 3 files changed, 295 insertions(+), 19 deletions(-) create mode 100644 .github/runners/Dockerfile.windows.runner create mode 100644 .github/runners/runner.ps1 diff --git a/.github/runners/Dockerfile.windows.runner b/.github/runners/Dockerfile.windows.runner new file mode 100644 index 00000000000..ac462d37a58 --- /dev/null +++ b/.github/runners/Dockerfile.windows.runner @@ -0,0 +1,256 @@ +# Use the Windows Server Core 2019 image. +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 + +# Use the Windows Server Core 2019 image. +FROM mcr.microsoft.com/windows/servercore:ltsc2019 + +# Restore the default Windows shell for correct batch processing. 
+# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. +# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. 
+ && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... + +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX + +#RUN powershell -Command \ +# $ErrorActionPreference = 'Stop'; \ +# curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ +# --output "cuda_11_installer.exe"; \ +# Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \ +# Remove-Item cuda_11_installer.exe -Force + +# The above command-line installation method installs NVTX headers at +# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\ +# CMake can't find this location for some reason. +# Instead, we just copy the older NvToolsExt version to where CMake expects. 
+# This assumes NvToolsExt was installed on the host machine using the +# CUDA 11.8 GUI installer and copied to the build context + +# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \ + --output NvToolsExt.zip; \ + Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \ + Remove-Item NvToolsExt.zip -Force + +# ----------------------------------------------------------------------------- + +# Create a working directory +WORKDIR "C:\\\\workspace" + +# ----------------------------------------------------------------------------- + +# Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \ + --output TensorRT-9.2.0.5.zip; \ + Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \ + Remove-Item TensorRT-9.2.0.5.zip -Force + +# Add TensorRT libs to Path +RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib" + +# Install TensorRT Python wheel +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl + +# ----------------------------------------------------------------------------- + +# Copy cuDNN into the working directory +# This assumes cuDNN exists on the host machine in the build context +# COPY ["cuDNN", "cuDNN"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \ + --output cuDNN.zip; \ + Expand-Archive .\cuDNN.zip -DestinationPath .; \ + Remove-Item cuDNN.zip -Force + +# Add cuDNN libs and bin to Path. +RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" + +# ----------------------------------------------------------------------------- + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+# -----------------------------------------------------------------------------
+
+# Additional dependencies to build Nitro
+
+# The commands below let MSVC recognize the CUDA compiler
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations'
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations'
+
+
+# Set git safe directory for nitro clone dependencies
+RUN powershell -Command \
+    git config --global --add safe.directory '*'
+
+# Package for nitro compile
+RUN powershell -Command \
+    choco install pkgconfiglite --allow-empty-checksums -y
+
+RUN powershell -Command \
+    choco install Ninja -y
+
+RUN choco install 7zip -y; \
+    7z --help
+
+# Requirements to build tensorrt-llm on windows
+ARG RUNNER_VERSION=2.314.1
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Invoke-WebRequest \
+    -Uri https://github.com/actions/runner/releases/download/v$env:RUNNER_VERSION/actions-runner-win-x64-$env:RUNNER_VERSION.zip \
+    -OutFile runner.zip; \
+    Expand-Archive -Path ./runner.zip -DestinationPath ./actions-runner; \
+    Remove-Item -Path .\runner.zip; \
+    setx /M PATH $(${Env:PATH} + \";${Env:ProgramFiles}\Git\bin\")
+
+ADD runner.ps1 ./runner.ps1
+
+CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"]
\ No newline at end of file
diff --git a/.github/runners/runner.ps1 b/.github/runners/runner.ps1
new file mode 100644
index 00000000000..a08f3725bf1
--- /dev/null
+++ b/.github/runners/runner.ps1
@@ -0,0 +1,2 @@
+.\actions-runner\config.cmd --unattended --replace --url https://github.com/${env:RUNNER_REPO} --pat $env:RUNNER_PAT --runnergroup $env:RUNNER_GROUP --labels $env:RUNNER_LABELS --name $env:RUNNER_NAME --work $env:RUNNER_WORKDIR;
+.\actions-runner\run.cmd;
\ No newline at end of file
diff --git a/Dockerfile.nitro.windows b/Dockerfile.nitro.windows
index 0503267a19f..616f4b8283f 100644
--- a/Dockerfile.nitro.windows
+++ b/Dockerfile.nitro.windows
@@ -14,7 +14,7 @@ SHELL ["cmd", "/S", "/C"]
 
 RUN powershell -Command \
     $ErrorActionPreference = 'Stop'; \
-    curl.exe https://delta.jan.ai/windows-container-dependencies/cuda_12.2.2_537.13_windows.exe \
+    curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \
     --output "cuda_installer.exe"; \
     Start-Process
cuda_installer.exe -Wait -ArgumentList '-s'; \ Remove-Item cuda_installer.exe -Force @@ -26,7 +26,7 @@ RUN powershell -Command \ # Download and install Python RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ Remove-Item python-3.10.11.exe -Force @@ -44,7 +44,7 @@ RUN powershell -Command \ # We use 10.1.1 which has a release on the GitHub page RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisetup.exe \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ --output "msmpisetup.exe"; \ Start-Process .\msmpisetup.exe -Wait ; \ Remove-Item msmpisetup.exe -Force @@ -55,7 +55,7 @@ RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" # Download the MSMPI SDK RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisdk.msi \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ --output "msmpisdk.msi"; \ Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ Remove-Item msmpisdk.msi -Force @@ -66,7 +66,7 @@ RUN powershell -Command \ RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/cmake-3.27.7-windows-x86_64.msi \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ --output "cmake.msi"; \ Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ Remove-Item cmake.msi -Force @@ -80,7 +80,7 @@ RUN setx Path "%Path%;C:\Program Files\CMake\bin" RUN \ # Download the Build Tools bootstrapper. - curl.exe -SL --output vs_buildtools.exe https://delta.jan.ai/windows-container-dependencies/vs_buildtools.exe \ + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ \ # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ @@ -103,7 +103,7 @@ RUN \ RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/gvim90.exe \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ --output "install_vim.exe"; \ Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ Remove-Item install_vim.exe -Force @@ -152,7 +152,14 @@ RUN powershell -Command \ # Instead, we just copy the older NvToolsExt version to where CMake expects. 
# This assumes NvToolsExt was installed on the host machine using the # CUDA 11.8 GUI installer and copied to the build context -COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] + +# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \ + --output NvToolsExt.zip; \ + Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \ + Remove-Item NvToolsExt.zip -Force # ----------------------------------------------------------------------------- @@ -164,7 +171,7 @@ WORKDIR "C:\\\\workspace" # Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/tensorrt-9.2.0.5.windows10.x86_64.cuda-12.2.llm.beta.zip \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \ --output TensorRT-9.2.0.5.zip; \ Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \ Remove-Item TensorRT-9.2.0.5.zip -Force @@ -181,7 +188,13 @@ RUN powershell -Command \ # Copy cuDNN into the working directory # This assumes cuDNN exists on the host machine in the build context -COPY ["cuDNN", "cuDNN"] +# COPY ["cuDNN", "cuDNN"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \ + --output cuDNN.zip; \ + Expand-Archive .\cuDNN.zip -DestinationPath .; \ + Remove-Item cuDNN.zip -Force # Add cuDNN libs and bin to Path. RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" @@ -213,23 +226,28 @@ RUN powershell -Command \ # Package for nitro compile RUN powershell -Command \ - choco install pkgconfiglite -y + choco install pkgconfiglite --allow-empty-checksums -y RUN powershell -Command \ choco install Ninja -y +RUN choco install 7zip -y; \ + 7z --help + # Requirements to build tensorrt-llm on windows -COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt -COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt -RUN powershell -Command \ - cd tensorrt-llm-nitro; \ - pip install --no-cache-dir -r .\requirements-dev-windows.txt +# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt +# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt +# RUN powershell -Command \ +# cd tensorrt-llm-nitro; \ +# pip install --no-cache-dir -r .\requirements-dev-windows.txt + +# COPY ./.git ./tensorrt-llm-nitro/.git -COPY ./.git ./tensorrt-llm-nitro/.git +# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty -COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty +# COPY ./cpp ./tensorrt-llm-nitro/cpp -COPY ./cpp ./tensorrt-llm-nitro/cpp +COPY . ./nitro-tensort-llm # Define the entry point for the docker container. # This entry point launches the 64-bit PowerShell developer shell. 
From b0051157a76929b71bf964363358e33a7649a61f Mon Sep 17 00:00:00 2001 From: Hien To Date: Sun, 10 Mar 2024 21:09:31 +0700 Subject: [PATCH 17/33] Correct SENTENCEPIECE path nitro cmakelist --- Dockerfile.nitro.windows | 21 ++++++++++++++++----- cpp/tensorrt_llm/nitro/CMakeLists.txt | 4 ++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/Dockerfile.nitro.windows b/Dockerfile.nitro.windows index 616f4b8283f..5dcbcde66ae 100644 --- a/Dockerfile.nitro.windows +++ b/Dockerfile.nitro.windows @@ -247,9 +247,20 @@ RUN choco install 7zip -y; \ # COPY ./cpp ./tensorrt-llm-nitro/cpp -COPY . ./nitro-tensort-llm +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + git clone https://github.com/janhq/nitro-tensorrt-llm.git; \ + cd nitro-tensorrt-llm; \ + git checkout tensorrt-llm-nitro-rel; \ + git submodule update --init --recursive; \ + pip install --no-cache-dir -r .\requirements-dev-windows.txt; \ + cd cpp/tensorrt_llm/nitro; \ + cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \ + cmake --build ./build_deps/nitro_deps --config Release -# Define the entry point for the docker container. -# This entry point launches the 64-bit PowerShell developer shell. -# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA -# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] +RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools" + +RUN VsDevCmd.bat -arch=amd64 && \ + powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + +# # ----------------------------------------------------------------------------- \ No newline at end of file diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt index a0667eba820..cf13d40f4c7 100644 --- a/cpp/tensorrt_llm/nitro/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -52,8 +52,8 @@ if(NOT WIN32) # Linux # Use pkg-config to find the SentencePiece library pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece) else() # Windows - set(SENTENCEPIECE_INCLUDE_DIRS "C:/workspace/tensorrt-llm-nitro/cpp/tensorrt_llm/nitro/build_deps/_install/include") - set(SENTENCEPIECE_LIBRARY_DIRS "C:/workspace/tensorrt-llm-nitro/cpp/tensorrt_llm/nitro/build_deps/_install/lib") + set(SENTENCEPIECE_INCLUDE_DIRS "${CMAKE_PREFIX_PATH}/include") + set(SENTENCEPIECE_LIBRARY_DIRS "${CMAKE_PREFIX_PATH}/lib") endif() include_directories(${PROJECT_SOURCE_DIR}/include ${SENTENCEPIECE_INCLUDE_DIRS}) From 89ca0579a385d75299b39728c7cef71ee743bf08 Mon Sep 17 00:00:00 2001 From: Hien To Date: Sun, 10 Mar 2024 21:31:48 +0700 Subject: [PATCH 18/33] Separate dockerfile for ada and ampere arch --- ...ws.runner => Dockerfile.window.runner-ada} | 31 +++++++ .../runners/Dockerfile.window.runner-ampere | 86 +++++++++++++++---- 2 files changed, 101 insertions(+), 16 deletions(-) rename .github/runners/{Dockerfile.windows.runner => Dockerfile.window.runner-ada} (89%) rename Dockerfile.tensorrt-llm-python.windows => .github/runners/Dockerfile.window.runner-ampere (72%) diff --git a/.github/runners/Dockerfile.windows.runner b/.github/runners/Dockerfile.window.runner-ada similarity index 89% rename from .github/runners/Dockerfile.windows.runner rename to 
.github/runners/Dockerfile.window.runner-ada index ac462d37a58..f5f9d5c3ffe 100644 --- a/.github/runners/Dockerfile.windows.runner +++ b/.github/runners/Dockerfile.window.runner-ada @@ -234,6 +234,37 @@ RUN powershell -Command \ RUN choco install 7zip -y; \ 7z --help +# Requirements to build tensorrt-llm on windows +# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt +# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt +# RUN powershell -Command \ +# cd tensorrt-llm-nitro; \ +# pip install --no-cache-dir -r .\requirements-dev-windows.txt + +# COPY ./.git ./tensorrt-llm-nitro/.git + +# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty + +# COPY ./cpp ./tensorrt-llm-nitro/cpp + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + git clone https://github.com/janhq/nitro-tensorrt-llm.git; \ + cd nitro-tensorrt-llm; \ + git checkout tensorrt-llm-nitro-rel; \ + git submodule update --init --recursive; \ + pip install --no-cache-dir -r .\requirements-dev-windows.txt; \ + cd cpp/tensorrt_llm/nitro; \ + cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \ + cmake --build ./build_deps/nitro_deps --config Release + +RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools" + +RUN VsDevCmd.bat -arch=amd64 && \ + powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '89-real;90-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + +# # ----------------------------------------------------------------------------- + # Requirements to build tensorrt-llm on windows ARG RUNNER_VERSION=2.314.1 diff --git a/Dockerfile.tensorrt-llm-python.windows b/.github/runners/Dockerfile.window.runner-ampere similarity index 72% rename from Dockerfile.tensorrt-llm-python.windows rename to .github/runners/Dockerfile.window.runner-ampere index ee61239d001..e957b97ad80 100644 --- a/Dockerfile.tensorrt-llm-python.windows +++ b/.github/runners/Dockerfile.window.runner-ampere @@ -14,7 +14,7 @@ SHELL ["cmd", "/S", "/C"] RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/cuda_12.2.2_537.13_windows.exe \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \ --output "cuda_installer.exe"; \ Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ Remove-Item cuda_installer.exe -Force @@ -26,7 +26,7 @@ RUN powershell -Command \ # Download and install Python RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ Remove-Item python-3.10.11.exe -Force @@ -44,7 +44,7 @@ RUN powershell -Command \ # We use 10.1.1 which has a release on the GitHub page RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisetup.exe \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ --output "msmpisetup.exe"; \ Start-Process .\msmpisetup.exe -Wait ; \ Remove-Item msmpisetup.exe -Force @@ -55,7 +55,7 @@ RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" # Download the MSMPI SDK RUN powershell -Command \ 
$ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/msmpisdk.msi \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ --output "msmpisdk.msi"; \ Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ Remove-Item msmpisdk.msi -Force @@ -66,7 +66,7 @@ RUN powershell -Command \ RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/cmake-3.27.7-windows-x86_64.msi \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ --output "cmake.msi"; \ Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ Remove-Item cmake.msi -Force @@ -80,7 +80,7 @@ RUN setx Path "%Path%;C:\Program Files\CMake\bin" RUN \ # Download the Build Tools bootstrapper. - curl.exe -SL --output vs_buildtools.exe https://delta.jan.ai/windows-container-dependencies/vs_buildtools.exe \ + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ \ # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ @@ -103,7 +103,7 @@ RUN \ RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/gvim90.exe \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ --output "install_vim.exe"; \ Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ Remove-Item install_vim.exe -Force @@ -152,7 +152,14 @@ RUN powershell -Command \ # Instead, we just copy the older NvToolsExt version to where CMake expects. # This assumes NvToolsExt was installed on the host machine using the # CUDA 11.8 GUI installer and copied to the build context -COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] + +# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \ + --output NvToolsExt.zip; \ + Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \ + Remove-Item NvToolsExt.zip -Force # ----------------------------------------------------------------------------- @@ -164,7 +171,7 @@ WORKDIR "C:\\\\workspace" # Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM RUN powershell -Command \ $ErrorActionPreference = 'Stop'; \ - curl.exe https://delta.jan.ai/windows-container-dependencies/tensorrt-9.2.0.5.windows10.x86_64.cuda-12.2.llm.beta.zip \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \ --output TensorRT-9.2.0.5.zip; \ Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \ Remove-Item TensorRT-9.2.0.5.zip -Force @@ -181,7 +188,13 @@ RUN powershell -Command \ # Copy cuDNN into the working directory # This assumes cuDNN exists on the host machine in the build context -COPY ["cuDNN", "cuDNN"] +# COPY ["cuDNN", "cuDNN"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \ + --output cuDNN.zip; \ + Expand-Archive .\cuDNN.zip -DestinationPath .; \ + Remove-Item cuDNN.zip -Force # Add cuDNN libs and bin to Path. 
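+# (Note: setx persists the machine-level Path, so the updated value is assumed
+# to be visible to later RUN layers rather than to the current shell.)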
RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" @@ -213,21 +226,62 @@ RUN powershell -Command \ # Package for nitro compile RUN powershell -Command \ - choco install pkgconfiglite -y + choco install pkgconfiglite --allow-empty-checksums -y RUN powershell -Command \ choco install Ninja -y +RUN choco install 7zip -y; \ + 7z --help + # Requirements to build tensorrt-llm on windows -COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt -COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt +# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt +# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt +# RUN powershell -Command \ +# cd tensorrt-llm-nitro; \ +# pip install --no-cache-dir -r .\requirements-dev-windows.txt + +# COPY ./.git ./tensorrt-llm-nitro/.git + +# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty + +# COPY ./cpp ./tensorrt-llm-nitro/cpp + RUN powershell -Command \ - cd tensorrt-llm-nitro; \ - pip install --no-cache-dir -r .\requirements-dev-windows.txt + $ErrorActionPreference = 'Stop'; \ + git clone https://github.com/janhq/nitro-tensorrt-llm.git; \ + cd nitro-tensorrt-llm; \ + git checkout tensorrt-llm-nitro-rel; \ + git submodule update --init --recursive; \ + pip install --no-cache-dir -r .\requirements-dev-windows.txt; \ + cd cpp/tensorrt_llm/nitro; \ + cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \ + cmake --build ./build_deps/nitro_deps --config Release + +RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools" + +RUN VsDevCmd.bat -arch=amd64 && \ + powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" -COPY . ./tensorrt-llm-nitro/ +# # ----------------------------------------------------------------------------- + +# Requirements to build tensorrt-llm on windows +ARG RUNNER_VERSION=2.314.1 # Define the entry point for the docker container. # This entry point launches the 64-bit PowerShell developer shell. 
# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA # ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Invoke-WebRequest \ + -Uri https://github.com/actions/runner/releases/download/v$env:RUNNER_VERSION/actions-runner-win-x64-$env:RUNNER_VERSION.zip \ + -OutFile runner.zip; \ + Expand-Archive -Path ./runner.zip -DestinationPath ./actions-runner; \ + Remove-Item -Path .\runner.zip; \ + setx /M PATH $(${Env:PATH} + \";${Env:ProgramFiles}\Git\bin\") + +ADD runner.ps1 ./runner.ps1 + +CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file From 85e6bb3229ff6eb1aad6ec60622985737ff475b0 Mon Sep 17 00:00:00 2001 From: Hien To Date: Sun, 10 Mar 2024 22:44:49 +0700 Subject: [PATCH 19/33] Add CI for nitro tensorrt-llm windows ampere --- .github/workflows/windows-build.yml | 79 +++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 .github/workflows/windows-build.yml diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml new file mode 100644 index 00000000000..bd494b84e5f --- /dev/null +++ b/.github/workflows/windows-build.yml @@ -0,0 +1,79 @@ +name: Build for Windows +on: + push: + branches: + - tensorrt-llm-nitro-rel + +jobs: + windows-ampere-build: + runs-on: windows-nitro-tensorrt-llm-ampere + permissions: + contents: write + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: remove existing build folder + shell: powershell + run: | + Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue + Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue + Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue + Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue + + - name: Copy build cache `build` + shell: powershell + continue-on-error: true + run: | + robocopy 'C:\workspace\nitro-tensorrt-llm\build' '.' /E + + - name: Copy build cache `cpp build` + shell: powershell + continue-on-error: true + run: | + robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\build' '.\cpp' /E + + - name: Copy build cache `nitro build_deps` + shell: powershell + continue-on-error: true + run: | + robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E + + - name: Build Python + shell: powershell + run: | + VsDevCmd.bat -arch=amd64 && powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + + - name: Upload Artifact + uses: actions/upload-artifact@v2 + with: + name: python-wheel + path: ./build + + - name: Build nitro + shell: powershell + run: | + VsDevCmd.bat -arch=amd64 && powershell.exe -NoLogo -ExecutionPolicy Bypass 'cd cpp\build; + cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja; + cmake --build . 
--parallel 2 --config Release' + + - name: create nitro artifact with dll file + shell: powershell + run: | + mkdir build_nitro + cp .\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro + cp .\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro + cp .\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro + cp .\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro + cp .\C:\workspace\cuDNN\cudnn_ops_infer64_8.dll .\build_nitro + cp .\C:\workspace\cuDNN\cudnn64_8.dll .\build_nitro + ls .\build_nitro + + - name: Upload Artifact + uses: actions/upload-artifact@v2 + with: + name: nitro-tensorrt-llm-windows-ampere + path: ./build_nitro \ No newline at end of file From d45051bff524476a7ee3d23d6c1f03dd273ed5c1 Mon Sep 17 00:00:00 2001 From: Hien To Date: Sun, 10 Mar 2024 23:29:35 +0700 Subject: [PATCH 20/33] Correct build script --- .github/workflows/windows-build.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index bd494b84e5f..3f545d71526 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -15,6 +15,7 @@ jobs: uses: actions/checkout@v3 with: submodules: recursive + lfs: true - name: remove existing build folder shell: powershell @@ -45,7 +46,7 @@ jobs: - name: Build Python shell: powershell run: | - VsDevCmd.bat -arch=amd64 && powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\' - name: Upload Artifact uses: actions/upload-artifact@v2 @@ -56,8 +57,8 @@ jobs: - name: Build nitro shell: powershell run: | - VsDevCmd.bat -arch=amd64 && powershell.exe -NoLogo -ExecutionPolicy Bypass 'cd cpp\build; - cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja; + cd cpp\build + cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja cmake --build . 
--parallel 2 --config Release' - name: create nitro artifact with dll file From 048735c42f10839f1d70dc9f65ba38a959cba8c6 Mon Sep 17 00:00:00 2001 From: Hien To Date: Sun, 10 Mar 2024 23:53:30 +0700 Subject: [PATCH 21/33] Install nitro_deps instead of using cache --- .github/workflows/windows-build.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 3f545d71526..c5858402588 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -21,9 +21,7 @@ jobs: shell: powershell run: | Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue - name: Copy build cache `build` shell: powershell @@ -37,12 +35,13 @@ jobs: run: | robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\build' '.\cpp' /E - - name: Copy build cache `nitro build_deps` + - name: install nitro deps shell: powershell - continue-on-error: true run: | - robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E - + cd cpp\tensorrt_llm\nitro + cmake -S ./nitro_deps -B ./build_deps/nitro_deps + cmake --build ./build_deps/nitro_deps --config Release + - name: Build Python shell: powershell run: | @@ -59,7 +58,7 @@ jobs: run: | cd cpp\build cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja - cmake --build . --parallel 2 --config Release' + cmake --build . 
--parallel 2 --config Release - name: create nitro artifact with dll file shell: powershell From 13675e07734b8d08cc5203b166362578bfccf97c Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 00:52:14 +0700 Subject: [PATCH 22/33] nitro deps build using cache --- .github/workflows/windows-build.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index c5858402588..7cebd229af2 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -21,7 +21,9 @@ jobs: shell: powershell run: | Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue + Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue + Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue - name: Copy build cache `build` shell: powershell @@ -35,6 +37,13 @@ jobs: run: | robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\build' '.\cpp' /E + - name: Copy build cache `nitro build_deps` + shell: powershell + continue-on-error: true + run: | + robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E + + - name: install nitro deps shell: powershell run: | From 3aa1e50b71eabb214c27a5401f4517db61717268 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 02:12:02 +0700 Subject: [PATCH 23/33] Fix error Longpath on windows --- .github/runners/Dockerfile.window.runner-ada | 2 ++ .github/runners/Dockerfile.window.runner-ampere | 2 ++ .github/workflows/windows-build.yml | 12 +++++------- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.github/runners/Dockerfile.window.runner-ada b/.github/runners/Dockerfile.window.runner-ada index f5f9d5c3ffe..b921ad43b00 100644 --- a/.github/runners/Dockerfile.window.runner-ada +++ b/.github/runners/Dockerfile.window.runner-ada @@ -284,4 +284,6 @@ RUN powershell -Command \ ADD runner.ps1 ./runner.ps1 +RUN New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/runners/Dockerfile.window.runner-ampere b/.github/runners/Dockerfile.window.runner-ampere index e957b97ad80..9d88fe168cf 100644 --- a/.github/runners/Dockerfile.window.runner-ampere +++ b/.github/runners/Dockerfile.window.runner-ampere @@ -284,4 +284,6 @@ RUN powershell -Command \ ADD runner.ps1 ./runner.ps1 +RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 7cebd229af2..ba472fd2b09 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -23,7 +23,6 @@ jobs: Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue - name: Copy build cache `build` shell: powershell @@ -42,19 +41,18 @@ jobs: continue-on-error: true run: 
| robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E - - name: install nitro deps shell: powershell run: | cd cpp\tensorrt_llm\nitro - cmake -S ./nitro_deps -B ./build_deps/nitro_deps - cmake --build ./build_deps/nitro_deps --config Release + powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps + powershell -Command cmake --build ./build_deps/nitro_deps --config Release - name: Build Python shell: powershell run: | - python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\' + powershell -Command python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\' - name: Upload Artifact uses: actions/upload-artifact@v2 @@ -66,8 +64,8 @@ jobs: shell: powershell run: | cd cpp\build - cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja - cmake --build . --parallel 2 --config Release + powershell -Command cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja + powershell -Command cmake --build . --parallel 2 --config Release - name: create nitro artifact with dll file shell: powershell From d746a4965e15d48c71e13f6ad8784fe5d9029b69 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 02:57:12 +0700 Subject: [PATCH 24/33] Fix error build nitro deps --- .github/workflows/windows-build.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index ba472fd2b09..55663cb2f88 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -42,12 +42,13 @@ jobs: run: | robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E - - name: install nitro deps - shell: powershell - run: | - cd cpp\tensorrt_llm\nitro - powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps - powershell -Command cmake --build ./build_deps/nitro_deps --config Release + - uses: nick-fields/retry@v3 + with: + timeout_minutes: 45 + max_attempts: 3 + shell: powershell + command: | + cd cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release - name: Build Python shell: powershell From b63c8e1767bdaca8c89c9bfa559611297819eb55 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 14:05:03 +0700 Subject: [PATCH 25/33] nitro build_deps change to use bash --- .github/runners/Dockerfile.window.runner-ada | 6 ++++-- .github/runners/Dockerfile.window.runner-ampere | 2 ++ .github/workflows/windows-build.yml | 17 +++++++++-------- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/runners/Dockerfile.window.runner-ada b/.github/runners/Dockerfile.window.runner-ada index b921ad43b00..4ed2145599d 100644 --- a/.github/runners/Dockerfile.window.runner-ada +++ b/.github/runners/Dockerfile.window.runner-ada @@ -261,7 +261,7 @@ RUN powershell -Command \ RUN setx Path 
"%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools" RUN VsDevCmd.bat -arch=amd64 && \ - powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '89-real;90-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '89-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" # # ----------------------------------------------------------------------------- @@ -284,6 +284,8 @@ RUN powershell -Command \ ADD runner.ps1 ./runner.ps1 -RUN New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force +RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + +RUN powershell -Command icacls 'C:\workspace\nitro-tensorrt-llm' /grant 'Everyone:F' /T CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/runners/Dockerfile.window.runner-ampere b/.github/runners/Dockerfile.window.runner-ampere index 9d88fe168cf..c41eb6205e9 100644 --- a/.github/runners/Dockerfile.window.runner-ampere +++ b/.github/runners/Dockerfile.window.runner-ampere @@ -286,4 +286,6 @@ ADD runner.ps1 ./runner.ps1 RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force +RUN powershell -Command icacls 'C:\workspace\nitro-tensorrt-llm' /grant 'Everyone:F' /T + CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 55663cb2f88..76b4ba844c7 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -23,32 +23,33 @@ jobs: Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue - + Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue + - name: Copy build cache `build` - shell: powershell + shell: bash continue-on-error: true run: | - robocopy 'C:\workspace\nitro-tensorrt-llm\build' '.' /E + cp -r /c/workspace/nitro-tensorrt-llm/build '.' 
- name: Copy build cache `cpp build` - shell: powershell + shell: bash continue-on-error: true run: | - robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\build' '.\cpp' /E + cp -r /c/workspace/nitro-tensorrt-llm/cpp/build' './cpp' - name: Copy build cache `nitro build_deps` shell: powershell continue-on-error: true run: | - robocopy 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' '.\cpp\tensorrt_llm\nitro\' /E + cp -r /c/workspace/nitro-tensorrt-llm/cpp/tensorrt_llm/nitro/build_deps' './cpp/tensorrt_llm/nitro/' - uses: nick-fields/retry@v3 with: timeout_minutes: 45 max_attempts: 3 - shell: powershell + shell: bash command: | - cd cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release + cd ./cpp/tensorrt_llm/nitro && cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release - name: Build Python shell: powershell From 28aa3dafe33248b23c475955ec7f1fbc96f793f3 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 14:23:52 +0700 Subject: [PATCH 26/33] nitro build_deps change to use powershell --- .github/workflows/windows-build.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 76b4ba844c7..cc2a089d3b9 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -26,35 +26,35 @@ jobs: Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue - name: Copy build cache `build` - shell: bash + shell: powershell continue-on-error: true run: | - cp -r /c/workspace/nitro-tensorrt-llm/build '.' + Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\build' -Destination '.' 
-Recurse - name: Copy build cache `cpp build` - shell: bash + shell: powershell continue-on-error: true run: | - cp -r /c/workspace/nitro-tensorrt-llm/cpp/build' './cpp' + Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\build' -Destination '.\cpp' -Recurse - name: Copy build cache `nitro build_deps` shell: powershell continue-on-error: true run: | - cp -r /c/workspace/nitro-tensorrt-llm/cpp/tensorrt_llm/nitro/build_deps' './cpp/tensorrt_llm/nitro/' + Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' -Destination '.\cpp\tensorrt_llm\nitro' -Recurse - uses: nick-fields/retry@v3 with: timeout_minutes: 45 max_attempts: 3 - shell: bash + shell: powershell command: | - cd ./cpp/tensorrt_llm/nitro && cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release + cd cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release - name: Build Python shell: powershell run: | - powershell -Command python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\' + powershell -Command "python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" - name: Upload Artifact uses: actions/upload-artifact@v2 From 4e035f89bdcf355ab3613d8f6da7101d964f8975 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 16:42:23 +0700 Subject: [PATCH 27/33] Add remove CMakeCache file --- .github/workflows/windows-build.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index cc2a089d3b9..a83731423d3 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -36,12 +36,14 @@ jobs: continue-on-error: true run: | Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\build' -Destination '.\cpp' -Recurse + rm .\cpp\build\CMakeCache.txt - name: Copy build cache `nitro build_deps` shell: powershell continue-on-error: true run: | Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' -Destination '.\cpp\tensorrt_llm\nitro' -Recurse + rm .\cpp\tensorrt_llm\nitro\build_deps\nitro_deps\CMakeCache.txt - uses: nick-fields/retry@v3 with: @@ -66,8 +68,8 @@ jobs: shell: powershell run: | cd cpp\build - powershell -Command cmake .. -DCMAKE_CUDA_ARCHITECTURES="80-real;86-real" -DTRT_LIB_DIR="C:/workspace/TensorRT-9.2.0.5/lib" -DTRT_INCLUDE_DIR="C:/workspace/TensorRT-9.2.0.5/include" -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe" -DENABLE_MULTI_DEVICE=0 -G Ninja - powershell -Command cmake --build . --parallel 2 --config Release + powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='80-real;86-real' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" + powershell -Command "cmake --build . 
--parallel 2 --config Release" - name: create nitro artifact with dll file shell: powershell From 58972be96d5682b7a31c96cd5da959a9e610bcd0 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 16:52:45 +0700 Subject: [PATCH 28/33] Add update CMakeCache.txt path --- .github/workflows/windows-build.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index a83731423d3..48e86886d8a 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -36,14 +36,19 @@ jobs: continue-on-error: true run: | Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\build' -Destination '.\cpp' -Recurse - rm .\cpp\build\CMakeCache.txt - name: Copy build cache `nitro build_deps` shell: powershell continue-on-error: true run: | Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' -Destination '.\cpp\tensorrt_llm\nitro' -Recurse - rm .\cpp\tensorrt_llm\nitro\build_deps\nitro_deps\CMakeCache.txt + + - name: Override path in CMakeCache.txt + shell: powershell + run: | + Get-ChildItem .\cpp -Recurse -Filter CMakeCache.txt | ForEach-Object { + (Get-Content $_.FullName) -replace [regex]::Escape("c:/workspace/nitro-tensorrt-llm"), "c:/w/nitro-tensorrt-llm/nitro-tensorrt-llm" | Set-Content $_.FullName + } - uses: nick-fields/retry@v3 with: From 713b54b626cd0103f9f3be9d77392739e9c9f189 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 18:03:03 +0700 Subject: [PATCH 29/33] Change folder git to build CMAKEList --- .github/workflows/windows-build.yml | 65 +++++++-------------------- cpp/tensorrt_llm/nitro/CMakeLists.txt | 5 ++- 2 files changed, 21 insertions(+), 49 deletions(-) diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 48e86886d8a..a173bf7aca7 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -17,38 +17,7 @@ jobs: submodules: recursive lfs: true - - name: remove existing build folder - shell: powershell - run: | - Remove-Item -Path '.\build' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\build_nitro' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\cpp\build' -Recurse -ErrorAction SilentlyContinue - Remove-Item -Path '.\cpp\tensorrt_llm\nitro\build_deps' -Recurse -ErrorAction SilentlyContinue - - - name: Copy build cache `build` - shell: powershell - continue-on-error: true - run: | - Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\build' -Destination '.' 
-Recurse - - - name: Copy build cache `cpp build` - shell: powershell - continue-on-error: true - run: | - Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\build' -Destination '.\cpp' -Recurse - - - name: Copy build cache `nitro build_deps` - shell: powershell - continue-on-error: true - run: | - Copy-Item -Path 'C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps' -Destination '.\cpp\tensorrt_llm\nitro' -Recurse - - - name: Override path in CMakeCache.txt - shell: powershell - run: | - Get-ChildItem .\cpp -Recurse -Filter CMakeCache.txt | ForEach-Object { - (Get-Content $_.FullName) -replace [regex]::Escape("c:/workspace/nitro-tensorrt-llm"), "c:/w/nitro-tensorrt-llm/nitro-tensorrt-llm" | Set-Content $_.FullName - } + - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm C:\workspace - uses: nick-fields/retry@v3 with: @@ -56,23 +25,17 @@ jobs: max_attempts: 3 shell: powershell command: | - cd cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release + cd C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release - name: Build Python shell: powershell run: | - powershell -Command "python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - with: - name: python-wheel - path: ./build + cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" - name: Build nitro shell: powershell run: | - cd cpp\build + cd C:\workspace\nitro-tensorrt-llm\cpp\build powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='80-real;86-real' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" powershell -Command "cmake --build . 
--parallel 2 --config Release" @@ -80,16 +43,22 @@ jobs: shell: powershell run: | mkdir build_nitro - cp .\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro - cp .\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro - cp .\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro - cp .\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro - cp .\C:\workspace\cuDNN\cudnn_ops_infer64_8.dll .\build_nitro - cp .\C:\workspace\cuDNN\cudnn64_8.dll .\build_nitro + cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro + cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro + cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro + cp C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro + cp C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro + cp C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro ls .\build_nitro - name: Upload Artifact uses: actions/upload-artifact@v2 with: name: nitro-tensorrt-llm-windows-ampere - path: ./build_nitro \ No newline at end of file + path: ./build_nitro + + - name: Upload Artifact + uses: actions/upload-artifact@v2 + with: + name: python-wheel + path: C:/workspace/nitro-tensorrt-llm/build diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt index cf13d40f4c7..5b852afab13 100644 --- a/cpp/tensorrt_llm/nitro/CMakeLists.txt +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -56,6 +56,9 @@ else() # Windows set(SENTENCEPIECE_LIBRARY_DIRS "${CMAKE_PREFIX_PATH}/lib") endif() +message(STATUS "SentencePiece library dirs: ${SENTENCEPIECE_LIBRARY_DIRS}") +message(STATUS "SentencePiece header dirs: ${SENTENCEPIECE_INCLUDE_DIRS}") + include_directories(${PROJECT_SOURCE_DIR}/include ${SENTENCEPIECE_INCLUDE_DIRS}) link_directories(${SENTENCEPIECE_LIBRARY_DIRS}) @@ -71,7 +74,7 @@ add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts) add_executable(nitro main.cc) target_link_libraries( - nitro PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece PRIVATE Drogon::Drogon ${CMAKE_THREAD_LIBS_INIT} ) + nitro PUBLIC ${SHARED_TARGET} sentencepiece nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece PRIVATE Drogon::Drogon ${CMAKE_THREAD_LIBS_INIT} ) target_compile_features(nitro PRIVATE cxx_std_17) From 2326e6d14ba8857d4550cce18190c7d1525709b4 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 21:05:44 +0700 Subject: [PATCH 30/33] Add CI for build ada and ampere --- .../runners/Dockerfile.window.runner-turing | 291 ++++++++++++++++++ .github/workflows/windows-build.yml | 34 +- 2 files changed, 312 insertions(+), 13 deletions(-) create mode 100644 .github/runners/Dockerfile.window.runner-turing diff --git a/.github/runners/Dockerfile.window.runner-turing b/.github/runners/Dockerfile.window.runner-turing new file mode 100644 index 00000000000..ee35f0428c1 --- /dev/null +++ b/.github/runners/Dockerfile.window.runner-turing @@ -0,0 +1,291 @@ +# Use the Windows Server Core 2019 image. +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 + +# Use the Windows Server Core 2019 image. +FROM mcr.microsoft.com/windows/servercore:ltsc2019 + +# Restore the default Windows shell for correct batch processing. 
+# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. +# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. 
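+    # (Installer exit code 3010 means "success, reboot required", which is why
+    # it is remapped to 0 at the end of the chain below.)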
+ && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... + +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX + +#RUN powershell -Command \ +# $ErrorActionPreference = 'Stop'; \ +# curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ +# --output "cuda_11_installer.exe"; \ +# Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \ +# Remove-Item cuda_11_installer.exe -Force + +# The above command-line installation method installs NVTX headers at +# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\ +# CMake can't find this location for some reason. +# Instead, we just copy the older NvToolsExt version to where CMake expects. 
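+# (The NvToolsExt GUI installer also sets the NVTOOLSEXT_PATH environment
+# variable, which older CMake CUDA scripts use as a hint when locating NVTX.)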
+# This assumes NvToolsExt was installed on the host machine using the +# CUDA 11.8 GUI installer and copied to the build context + +# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \ + --output NvToolsExt.zip; \ + Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \ + Remove-Item NvToolsExt.zip -Force + +# ----------------------------------------------------------------------------- + +# Create a working directory +WORKDIR "C:\\\\workspace" + +# ----------------------------------------------------------------------------- + +# Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \ + --output TensorRT-9.2.0.5.zip; \ + Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \ + Remove-Item TensorRT-9.2.0.5.zip -Force + +# Add TensorRT libs to Path +RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib" + +# Install TensorRT Python wheel +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl + +# ----------------------------------------------------------------------------- + +# Copy cuDNN into the working directory +# This assumes cuDNN exists on the host machine in the build context +# COPY ["cuDNN", "cuDNN"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \ + --output cuDNN.zip; \ + Expand-Archive .\cuDNN.zip -DestinationPath .; \ + Remove-Item cuDNN.zip -Force + +# Add cuDNN libs and bin to Path. +RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" + +# ----------------------------------------------------------------------------- + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
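+# (VsDevCmd.bat initializes the MSVC build environment before handing off to
+# PowerShell.)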
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +# ----------------------------------------------------------------------------- + +# Additional dependencies to build Nitro + +# This bellow command lt MSVC recognize cuda compiler +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations' + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations' + + +# Set git safe directory for nitro clone dependencies +RUN powershell -Command \ + git config --global --add safe.directory '*' + +# Package for nitro compile +RUN powershell -Command \ + choco install pkgconfiglite --allow-empty-checksums -y + +RUN powershell -Command \ + choco install Ninja -y + +RUN choco install 7zip -y; \ + 7z --help + +# Requirements to build tensorrt-llm on windows +# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt +# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt +# RUN powershell -Command \ +# cd tensorrt-llm-nitro; \ +# pip install --no-cache-dir -r .\requirements-dev-windows.txt + +# COPY ./.git ./tensorrt-llm-nitro/.git + +# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty + +# COPY ./cpp ./tensorrt-llm-nitro/cpp + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + git clone https://github.com/janhq/nitro-tensorrt-llm.git; \ + cd nitro-tensorrt-llm; \ + git checkout tensorrt-llm-nitro-rel; \ + git submodule update --init --recursive; \ + pip install --no-cache-dir -r .\requirements-dev-windows.txt; \ + cd cpp/tensorrt_llm/nitro; \ + cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \ + cmake --build ./build_deps/nitro_deps --config Release + +RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools" + +RUN VsDevCmd.bat -arch=amd64 && \ + powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '75-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + +# # ----------------------------------------------------------------------------- + +# Requirements to build tensorrt-llm on windows +ARG RUNNER_VERSION=2.314.1 + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
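+# (The ENTRYPOINT is left commented out below; this runner image starts through
+# runner.ps1 via CMD instead.)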
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Invoke-WebRequest \ + -Uri https://github.com/actions/runner/releases/download/v$env:RUNNER_VERSION/actions-runner-win-x64-$env:RUNNER_VERSION.zip \ + -OutFile runner.zip; \ + Expand-Archive -Path ./runner.zip -DestinationPath ./actions-runner; \ + Remove-Item -Path .\runner.zip; \ + setx /M PATH $(${Env:PATH} + \";${Env:ProgramFiles}\Git\bin\") + +ADD runner.ps1 ./runner.ps1 + +RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + +RUN powershell -Command icacls 'C:\workspace\nitro-tensorrt-llm' /grant 'Everyone:F' /T + +CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index a173bf7aca7..f1a0626f7fa 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -3,10 +3,18 @@ on: push: branches: - tensorrt-llm-nitro-rel + - rel jobs: - windows-ampere-build: - runs-on: windows-nitro-tensorrt-llm-ampere + windows-build: + runs-on: windows-nitro-tensorrt-llm-${{ matrix.cuda_arch_name }} + strategy: + matrix: + include: + - cuda_arch: '80-real;86-real' + cuda_arch_name: 'ampere' + - cuda_arch: '89-real' + cuda_arch_name: 'ada' permissions: contents: write steps: @@ -17,7 +25,7 @@ jobs: submodules: recursive lfs: true - - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm C:\workspace + - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm - uses: nick-fields/retry@v3 with: @@ -30,35 +38,35 @@ jobs: - name: Build Python shell: powershell run: | - cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '${{ matrix.cuda_arch }}' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" - name: Build nitro shell: powershell run: | cd C:\workspace\nitro-tensorrt-llm\cpp\build - powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='80-real;86-real' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" + powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='${{ matrix.cuda_arch }}' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" powershell -Command "cmake --build . 
--parallel 2 --config Release" - name: create nitro artifact with dll file shell: powershell run: | mkdir build_nitro - cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro - cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro - cp C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro - cp C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro - cp C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro - cp C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro + cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro + cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro ls .\build_nitro - name: Upload Artifact uses: actions/upload-artifact@v2 with: - name: nitro-tensorrt-llm-windows-ampere + name: nitro-tensorrt-llm-windows-${{ matrix.cuda_arch_name }} path: ./build_nitro - name: Upload Artifact uses: actions/upload-artifact@v2 with: - name: python-wheel + name: python-tensorrt-llm-${{ matrix.cuda_arch }}-wheel path: C:/workspace/nitro-tensorrt-llm/build From f9862020df0f940b45e821dfd4e2bfb97c5eb597 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 23:18:30 +0700 Subject: [PATCH 31/33] Add CI release --- .../python-windows-build-release.yml | 87 +++++++++++++++ ...ows-build.yml => windows-build-manual.yml} | 8 +- .github/workflows/windows-build-release.yml | 103 ++++++++++++++++++ 3 files changed, 196 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/python-windows-build-release.yml rename .github/workflows/{windows-build.yml => windows-build-manual.yml} (91%) create mode 100644 .github/workflows/windows-build-release.yml diff --git a/.github/workflows/python-windows-build-release.yml b/.github/workflows/python-windows-build-release.yml new file mode 100644 index 00000000000..ef21da6b909 --- /dev/null +++ b/.github/workflows/python-windows-build-release.yml @@ -0,0 +1,87 @@ +name: Release for Windows +on: + push: + tags: ["python-windows-*"] + +jobs: + create-draft-release: + runs-on: ubuntu-latest + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + version: ${{ steps.get_version.outputs.version }} + permissions: + contents: write + steps: + - name: Extract tag name prefix + id: get_version + run: echo "VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/}" + env: + GITHUB_REF: ${{ github.ref }} + - name: Create Draft Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref_name }} + release_name: "${{ env.VERSION }}" + draft: true + prerelease: false + windows-build: + needs: create-draft-release + runs-on: windows-nitro-tensorrt-llm-${{ matrix.cuda_arch_name }} + strategy: + matrix: + include: + - cuda_arch: '80-real;86-real' + cuda_arch_name: 'ampere' + - cuda_arch: '89-real' + cuda_arch_name: 'ada' 
+ - cuda_arch: '75-real' + cuda_arch_name: 'turing' + permissions: + contents: write + steps: + - uses: actions/setup-dotnet@v3 + with: + dotnet-version: "6.0.x" + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + lfs: true + + - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm + + - uses: nick-fields/retry@v3 + with: + timeout_minutes: 45 + max_attempts: 3 + shell: powershell + command: | + cd C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release + + - name: Build Python + shell: powershell + run: | + cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '${{ matrix.cuda_arch }}' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + + - name: Build nitro + shell: powershell + run: | + cd C:\workspace\nitro-tensorrt-llm\cpp\build + powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='${{ matrix.cuda_arch }}' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" + powershell -Command "cmake --build . --parallel 2 --config Release" + tar -czvf python.tar.gz .\build\*.whl + + - uses: actions/upload-release-asset@v1.0.1 + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + asset_path: ./python.tar.gz + asset_name: ${{ needs.create-draft-release.outputs.version }}-tensorrt-llm-${{ matrix.cuda_arch_name }}.tar.gz + asset_content_type: application/gzip diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build-manual.yml similarity index 91% rename from .github/workflows/windows-build.yml rename to .github/workflows/windows-build-manual.yml index f1a0626f7fa..d5dc1ebb59f 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build-manual.yml @@ -1,9 +1,9 @@ -name: Build for Windows +name: Manuall Build for Windows on: push: branches: - tensorrt-llm-nitro-rel - - rel + workflow_dispatch: jobs: windows-build: @@ -15,6 +15,8 @@ jobs: cuda_arch_name: 'ampere' - cuda_arch: '89-real' cuda_arch_name: 'ada' + - cuda_arch: '75-real' + cuda_arch_name: 'turing' permissions: contents: write steps: @@ -56,6 +58,8 @@ jobs: cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro cp -Force C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro + cp -Force C:\workspace\TensorRT-9.2.0.5\lib\nvinfer.dll .\build_nitro + cp -Force C:\Windows\SysWOW64\msmpi.dll .\build_nitro cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro ls .\build_nitro diff --git a/.github/workflows/windows-build-release.yml b/.github/workflows/windows-build-release.yml new file mode 100644 index 00000000000..d4922a537a1 --- /dev/null +++ b/.github/workflows/windows-build-release.yml @@ -0,0 +1,103 @@ +name: Release for Windows +on: + push: + tags: ["windows-v[0-9]+.[0-9]+.[0-9]+"] + +jobs: + create-draft-release: + runs-on: ubuntu-latest + if: github.event_name == 
'push' && startsWith(github.ref, 'refs/tags/') + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + version: ${{ steps.get_version.outputs.version }} + permissions: + contents: write + steps: + - name: Extract tag name without v prefix + id: get_version + run: echo "VERSION=${GITHUB_REF#refs/tags/windows-v}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/windows-v}" + env: + GITHUB_REF: ${{ github.ref }} + - name: Create Draft Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref_name }} + release_name: "${{ env.VERSION }}" + draft: true + prerelease: false + windows-build: + needs: create-draft-release + runs-on: windows-nitro-tensorrt-llm-${{ matrix.cuda_arch_name }} + strategy: + matrix: + include: + - cuda_arch: '80-real;86-real' + cuda_arch_name: 'ampere' + - cuda_arch: '89-real' + cuda_arch_name: 'ada' + - cuda_arch: '75-real' + cuda_arch_name: 'turing' + permissions: + contents: write + steps: + - uses: actions/setup-dotnet@v3 + with: + dotnet-version: "6.0.x" + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + lfs: true + + - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm + + - uses: nick-fields/retry@v3 + with: + timeout_minutes: 45 + max_attempts: 3 + shell: powershell + command: | + cd C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release + + - name: Build Python + shell: powershell + run: | + cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '${{ matrix.cuda_arch }}' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'" + + - name: Build nitro + shell: powershell + run: | + cd C:\workspace\nitro-tensorrt-llm\cpp\build + powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='${{ matrix.cuda_arch }}' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja" + powershell -Command "cmake --build . 
--parallel 2 --config Release" + + - name: create nitro artifact with dll file + shell: powershell + run: | + mkdir build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro + cp -Force C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro + cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro + cp -Force C:\workspace\TensorRT-9.2.0.5\lib\nvinfer.dll .\build_nitro + cp -Force C:\Windows\SysWOW64\msmpi.dll .\build_nitro + cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro + ls .\build_nitro + dotnet tool install --global AzureSignTool + %USERPROFILE%\.dotnet\tools\azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build_nitro\nitro.exe" + tar -czvf nitro.tar.gz .\build_nitro + + - uses: actions/upload-release-asset@v1.0.1 + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + asset_path: ./nitro.tar.gz + asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-amd64-tensorrt-llm-${{ matrix.cuda_arch_name }}.tar.gz + asset_content_type: application/gzip From f9ba94b4bb0773ed0868e00f33c1118d5af18c65 Mon Sep 17 00:00:00 2001 From: Hien To Date: Mon, 11 Mar 2024 23:19:38 +0700 Subject: [PATCH 32/33] Remove debug CI --- .github/workflows/python-windows-build-release.yml | 2 +- .github/workflows/windows-build-manual.yml | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/python-windows-build-release.yml b/.github/workflows/python-windows-build-release.yml index ef21da6b909..fbfe5e76ba6 100644 --- a/.github/workflows/python-windows-build-release.yml +++ b/.github/workflows/python-windows-build-release.yml @@ -1,4 +1,4 @@ -name: Release for Windows +name: Release for python Windows on: push: tags: ["python-windows-*"] diff --git a/.github/workflows/windows-build-manual.yml b/.github/workflows/windows-build-manual.yml index d5dc1ebb59f..b3e324ae6ed 100644 --- a/.github/workflows/windows-build-manual.yml +++ b/.github/workflows/windows-build-manual.yml @@ -1,8 +1,5 @@ name: Manuall Build for Windows on: - push: - branches: - - tensorrt-llm-nitro-rel workflow_dispatch: jobs: From 186eee30ebc06b17df277df7f8559294fa515ad0 Mon Sep 17 00:00:00 2001 From: automaticcat Date: Mon, 11 Mar 2024 17:02:07 +0700 Subject: [PATCH 33/33] Merge pull request #14 from janhq/10-epic-add-proper-handler-for-stop-words Add naive hiding stop words case --- .../nitro/controllers/tensorrtllm.cc | 108 +++++++++++++----- .../nitro/controllers/tensorrtllm.h | 1 + 2 files changed, 82 insertions(+), 27 deletions(-) diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc index 999d7b18a82..c3891440dd3 100644 --- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc +++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc @@ -26,11 +26,52 @@ void removeId(std::vector& vec, int id) struct inferenceState { int prevPos{0}; + 
std::string prevText;
     bool isFinished;
     std::queue<std::string> textsToStream;
     std::mutex queueMutex; // Mutex to protect access to textsToStream
+
+    size_t stopWordMatchLen = 0;
+    std::vector<std::string> sequence{"<", "|", "im", "_", "end", "|", ">"};
+
+    void reset()
+    {
+        stopWordMatchLen = 0;
+        prevText = "";
+    }
+
+    bool isComplete() const
+    {
+        return stopWordMatchLen >= sequence.size();
+    }
 };
 
+bool handleMatch(const std::string& rawText, std::shared_ptr<inferenceState> inferState)
+{
+    if (inferState->isComplete())
+    {
+        return true;
+    }
+
+    if (rawText == inferState->sequence[inferState->stopWordMatchLen])
+    {
+        inferState->stopWordMatchLen++; // Move to next state
+        inferState->prevText = rawText;
+        return true;
+    }
+    else if (inferState->stopWordMatchLen > 0 && rawText == inferState->sequence[0])
+    {
+        inferState->stopWordMatchLen = 1; // Restart from first match if sequence breaks but matches start
+        inferState->prevText = rawText;
+        return true;
+    }
+    else
+    {
+        inferState->reset();
+        return false; // Reset to start if sequence breaks
+    }
+}
+
 // Only support single token stopping point now
 std::string create_return_json(const std::string& id, const std::string& model, const std::string& content,
     Json::Value finish_reason = Json::Value())
@@ -67,6 +108,13 @@ GenerationInput::TensorPtr tensorrtllm::getTensorSingleStopWordList(int stopToke
     return gptSession->getBufferManager().copyFrom(stopWordsTokens, ITensor::makeShape({1, 2, 2}), MemoryType::kGPU);
 }
 
+GenerationInput::TensorPtr tensorrtllm::getTensorChatMLStopWordList()
+{
+    std::vector<int32_t> stopWordsTokens = {28789, 28766, 321, 28730, 416, 28766, 28767, 32000, 6, 8, -1, -1, -1, -1,
+        -1, -1}; // Extend with -1 for increased length
+    return gptSession->getBufferManager().copyFrom(stopWordsTokens, ITensor::makeShape({1, 2, 8}), MemoryType::kGPU);
+}
+
 GenerationInput tensorrtllm::createGenerationInput(std::vector<int32_t> inputIdsHost)
 {
     int inputLen = inputIdsHost.size();
@@ -78,7 +126,7 @@ GenerationInput tensorrtllm::createGenerationInput(std::vector<int32_t> inputIds
 
     GenerationInput generationInput{0, 0, inputIds, inputLengths, modelConfig->usePackedInput()};
 
-    generationInput.stopWordsList = getTensorSingleStopWordList(32000);
+    generationInput.stopWordsList = getTensorChatMLStopWordList();
 
     return generationInput;
 }
@@ -117,35 +165,35 @@ void inferenceThread(std::shared_ptr<inferenceState> inferState, std::vector<int32_t>
+        int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape
+        // Copy output IDs from GPU to host for printing
+        std::vector<int32_t> outputIdsHost(outputLength);
+        self->gptSession->getBufferManager().copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU);
+        // Find the last non-zero value in the output IDs starting from the end of the input sequence
+        std::vector<int32_t> outputIdsHostDecode(outputIdsHost.begin() + inputLen, outputIdsHost.end());
+        removeId(outputIdsHostDecode, 0);
+        std::string text = self->nitro_tokenizer->decode(outputIdsHostDecode);
+
+        if (inferState->prevPos > 0 && inferState->prevPos < text.size())
+        {
+            // Valid prevPos, proceed with slicing the string from prevPos to the end
+            std::string stringTok(text.begin() + inferState->prevPos, text.end());
+            std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
+            inferState->textsToStream.push(stringTok);
+        }
+        else if (inferState->prevPos >= text.size())
         {
-            // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens
-            int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape
-            // Copy output IDs from GPU to host for printing
-            std::vector<int32_t> outputIdsHost(outputLength);
-            self->gptSession->getBufferManager().copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU);
-            // Find the last non-zero value in the output IDs starting from the end of the input sequence
-            std::vector<int32_t> outputIdsHostDecode(outputIdsHost.begin() + inputLen, outputIdsHost.end());
-            removeId(outputIdsHostDecode, 0);
-            removeId(outputIdsHostDecode, 32000);
-            std::string text = self->nitro_tokenizer->decode(outputIdsHostDecode);
-
-            if (inferState->prevPos > 0 && inferState->prevPos < text.size())
-            {
-                // Valid prevPos, proceed with slicing the string from prevPos to the end
-                std::string stringTok(text.begin() + inferState->prevPos, text.end());
-                std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
-                inferState->textsToStream.push(stringTok);
-            }
-            else if (inferState->prevPos >= text.size())
-            {
-                inferState->prevPos = text.size();
-            }
             inferState->prevPos = text.size();
+        }
+        inferState->prevPos = text.size();
+        if (finished)
+        {
+
+            std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
+            inferState->textsToStream.push("[DONE]");
             return;
         }
-        std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
-        inferState->textsToStream.push("[DONE]");
     };
     // The rest of the logic inside the `chat_completion` remains unchanged...
     // After finishing the setup, call the inference logic
@@ -243,6 +291,12 @@
                 {
                     std::string rawText = inferState->textsToStream.front();
 
+                    inferState->textsToStream.pop();
+                    if (handleMatch(rawText, inferState))
+                    {
+                        continue;
+                    };
+
                     if (rawText == "[DONE]")
                     {
                         LOG_INFO << "End of result";
@@ -257,7 +311,6 @@
                     }
                     const std::string textToStream
                         = "data: " + create_return_json(nitro_utils::generate_random_string(20), "_", rawText) + "\n\n";
-                    inferState->textsToStream.pop();
                     lock.unlock(); // Unlock as soon as possible
 
                     // Ensure we do not exceed the buffer size. Truncate if necessary.
@@ -265,6 +318,7 @@
 
                     // Copy the text to the provided buffer
                     std::memcpy(pBuffer, textToStream.data(), bytesToWrite);
+                    inferState->prevText = rawText;
                     return bytesToWrite; // Return the number of bytes written to the buffer
                 }
                 else
diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
index 0ecae873d27..40454829f6b 100644
--- a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
+++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
@@ -100,6 +100,7 @@ class tensorrtllm : public drogon::HttpController
     GenerationInput createGenerationInput(std::vector<int32_t> inputIds);
     GenerationOutput createGenerationOutput();
     std::unique_ptr<Tokenizer> nitro_tokenizer;
+    GenerationInput::TensorPtr getTensorChatMLStopWordList();
 
 private:
     GptSession::Config sessionConfig{1, 1, 1};
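The stop-word handling added in this final patch operates on detokenized pieces rather than token ids: handleMatch walks the piece sequence "<", "|", "im", "_", "end", "|", ">" and withholds each matching piece from the SSE stream, so a completed "<|im_end|>" never reaches the client. Below is a minimal, self-contained sketch of that state machine (illustrative names only, not the controller code itself); it also demonstrates the naive edge case the commit title refers to: once a partial match breaks, the pieces that were held back are dropped rather than re-emitted.

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct StopWordMatcher
{
    std::vector<std::string> sequence{"<", "|", "im", "_", "end", "|", ">"};
    std::size_t matched = 0;

    bool complete() const
    {
        return matched >= sequence.size();
    }

    // Returns true when the piece should be withheld from the client; mirrors
    // handleMatch() above, including its naive restart behaviour.
    bool consume(const std::string& piece)
    {
        if (complete())
            return true;
        if (piece == sequence[matched])
        {
            ++matched; // piece extends the current partial match
            return true;
        }
        if (matched > 0 && piece == sequence[0])
        {
            matched = 1; // stream broke mid-match but re-matched the head
            return true;
        }
        // Pieces that were already held back are not re-emitted here, which is
        // exactly the naive behaviour of the patched controller.
        matched = 0;
        return false;
    }
};

int main()
{
    StopWordMatcher m;
    for (const std::string piece : {"Hello", "<", "|", "im", "_", "end", "|", ">"})
    {
        std::cout << piece << " -> " << (m.consume(piece) ? "hold" : "stream") << '\n';
    }
    std::cout << "stop word complete: " << std::boolalpha << m.complete() << '\n';
    return 0;
}
```

On the engine side, getTensorChatMLStopWordList supplies the same stop word to TensorRT-LLM in its packed stopWordsList layout: a {1, 2, N} tensor whose first row holds the concatenated token ids of all stop sequences and whose second row holds their cumulative end offsets, padded with -1.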