diff --git a/.github/runners/Dockerfile.window.runner-ada b/.github/runners/Dockerfile.window.runner-ada new file mode 100644 index 00000000000..4ed2145599d --- /dev/null +++ b/.github/runners/Dockerfile.window.runner-ada @@ -0,0 +1,291 @@ +# Use the Windows Server Core 2019 image. +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 + +# Use the Windows Server Core 2019 image. +FROM mcr.microsoft.com/windows/servercore:ltsc2019 + +# Restore the default Windows shell for correct batch processing. +# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. +# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. 
+ curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. + && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... + +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX + +#RUN powershell -Command \ +# $ErrorActionPreference = 'Stop'; \ +# curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ +# --output "cuda_11_installer.exe"; \ +# Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \ +# Remove-Item cuda_11_installer.exe -Force + +# The above command-line installation method installs NVTX headers at +# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\ +# CMake can't find this location for some reason. +# Instead, we just copy the older NvToolsExt version to where CMake expects. 
+# This assumes NvToolsExt was installed on the host machine using the
+# CUDA 11.8 GUI installer and copied to the build context
+
+# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"]
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \
+    --output NvToolsExt.zip; \
+    Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \
+    Remove-Item NvToolsExt.zip -Force
+
+# -----------------------------------------------------------------------------
+
+# Create a working directory
+WORKDIR "C:\\\\workspace"
+
+# -----------------------------------------------------------------------------
+
+# Download and unzip TensorRT 9.2.0.5 for TensorRT-LLM
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \
+    --output TensorRT-9.2.0.5.zip; \
+    Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \
+    Remove-Item TensorRT-9.2.0.5.zip -Force
+
+# Add TensorRT libs to Path
+RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib"
+
+# Install TensorRT Python wheel
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl
+
+# -----------------------------------------------------------------------------
+
+# Copy cuDNN into the working directory
+# This assumes cuDNN exists on the host machine in the build context
+# COPY ["cuDNN", "cuDNN"]
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \
+    --output cuDNN.zip; \
+    Expand-Archive .\cuDNN.zip -DestinationPath .; \
+    Remove-Item cuDNN.zip -Force
+
+# Add cuDNN libs and bin to Path.
+RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;"
+
+# -----------------------------------------------------------------------------
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+# -----------------------------------------------------------------------------
+
+# Additional dependencies to build Nitro
+
+# The command below lets MSVC recognize the CUDA compiler
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations'
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations'
+
+
+# Set git safe directory for nitro clone dependencies
+RUN powershell -Command \
+    git config --global --add safe.directory '*'
+
+# Packages for nitro compile
+RUN powershell -Command \
+    choco install pkgconfiglite --allow-empty-checksums -y
+
+RUN powershell -Command \
+    choco install Ninja -y
+
+RUN choco install 7zip -y; \
+    7z --help
+
+# Requirements to build tensorrt-llm on windows
+# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt
+# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt
+# RUN powershell -Command \
+#     cd tensorrt-llm-nitro; \
+#     pip install --no-cache-dir -r .\requirements-dev-windows.txt
+
+# COPY ./.git ./tensorrt-llm-nitro/.git
+
+# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty
+
+# COPY ./cpp ./tensorrt-llm-nitro/cpp
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    git clone https://github.com/janhq/nitro-tensorrt-llm.git; \
+    cd nitro-tensorrt-llm; \
+    git checkout tensorrt-llm-nitro-rel; \
+    git submodule update --init --recursive; \
+    pip install --no-cache-dir -r .\requirements-dev-windows.txt; \
+    cd cpp/tensorrt_llm/nitro; \
+    cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \
+    cmake --build ./build_deps/nitro_deps --config Release
+
+RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools"
+
+RUN VsDevCmd.bat -arch=amd64 && \
+    powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '89-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'"
+
+# # -----------------------------------------------------------------------------
+
+# GitHub Actions runner version
+ARG RUNNER_VERSION=2.314.1
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Invoke-WebRequest \ + -Uri https://github.com/actions/runner/releases/download/v$env:RUNNER_VERSION/actions-runner-win-x64-$env:RUNNER_VERSION.zip \ + -OutFile runner.zip; \ + Expand-Archive -Path ./runner.zip -DestinationPath ./actions-runner; \ + Remove-Item -Path .\runner.zip; \ + setx /M PATH $(${Env:PATH} + \";${Env:ProgramFiles}\Git\bin\") + +ADD runner.ps1 ./runner.ps1 + +RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + +RUN powershell -Command icacls 'C:\workspace\nitro-tensorrt-llm' /grant 'Everyone:F' /T + +CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/runners/Dockerfile.window.runner-ampere b/.github/runners/Dockerfile.window.runner-ampere new file mode 100644 index 00000000000..c41eb6205e9 --- /dev/null +++ b/.github/runners/Dockerfile.window.runner-ampere @@ -0,0 +1,291 @@ +# Use the Windows Server Core 2019 image. +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 + +# Use the Windows Server Core 2019 image. +FROM mcr.microsoft.com/windows/servercore:ltsc2019 + +# Restore the default Windows shell for correct batch processing. +# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. 
+# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. + && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... 
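+# (Hypothetical illustration, not used in this image: several of the MSI/EXE
+# steps above could likely be replaced with one-liners such as
+# `choco install cmake --version=3.27.7 -y`, but the manual installers are
+# what was actually tested here.)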
+
+# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation
+# https://stackoverflow.com/a/76470753
+ENV chocolateyVersion=1.4.0
+
+# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \
+    -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \
+    iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \
+    SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin"
+
+# -----------------------------------------------------------------------------
+
+# Install Git via Chocolatey
+RUN powershell -Command \
+    choco install git -y
+
+# -----------------------------------------------------------------------------
+
+# Install CUDA 11.8 NVTX
+
+#RUN powershell -Command \
+#    $ErrorActionPreference = 'Stop'; \
+#    curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \
+#    --output "cuda_11_installer.exe"; \
+#    Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \
+#    Remove-Item cuda_11_installer.exe -Force
+
+# The above command-line installation method installs NVTX headers at
+# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\
+# CMake can't find this location for some reason.
+# Instead, we just copy the older NvToolsExt version to where CMake expects.
+# This assumes NvToolsExt was installed on the host machine using the
+# CUDA 11.8 GUI installer and copied to the build context
+
+# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"]
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \
+    --output NvToolsExt.zip; \
+    Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \
+    Remove-Item NvToolsExt.zip -Force
+
+# -----------------------------------------------------------------------------
+
+# Create a working directory
+WORKDIR "C:\\\\workspace"
+
+# -----------------------------------------------------------------------------
+
+# Download and unzip TensorRT 9.2.0.5 for TensorRT-LLM
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \
+    --output TensorRT-9.2.0.5.zip; \
+    Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \
+    Remove-Item TensorRT-9.2.0.5.zip -Force
+
+# Add TensorRT libs to Path
+RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib"
+
+# Install TensorRT Python wheel
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl
+
+# -----------------------------------------------------------------------------
+
+# Copy cuDNN into the working directory
+# This assumes cuDNN exists on the host machine in the build context
+# COPY ["cuDNN", "cuDNN"]
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \
+    --output cuDNN.zip; \
+    Expand-Archive .\cuDNN.zip -DestinationPath .; \
+    Remove-Item cuDNN.zip -Force
+
+# Add cuDNN libs and bin to Path.
+RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;"
+
+# -----------------------------------------------------------------------------
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+# -----------------------------------------------------------------------------
+
+# Additional dependencies to build Nitro
+
+# The command below lets MSVC recognize the CUDA compiler
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations'
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations'
+
+
+# Set git safe directory for nitro clone dependencies
+RUN powershell -Command \
+    git config --global --add safe.directory '*'
+
+# Packages for nitro compile
+RUN powershell -Command \
+    choco install pkgconfiglite --allow-empty-checksums -y
+
+RUN powershell -Command \
+    choco install Ninja -y
+
+RUN choco install 7zip -y; \
+    7z --help
+
+# Requirements to build tensorrt-llm on windows
+# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt
+# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt
+# RUN powershell -Command \
+#     cd tensorrt-llm-nitro; \
+#     pip install --no-cache-dir -r .\requirements-dev-windows.txt
+
+# COPY ./.git ./tensorrt-llm-nitro/.git
+
+# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty
+
+# COPY ./cpp ./tensorrt-llm-nitro/cpp
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    git clone https://github.com/janhq/nitro-tensorrt-llm.git; \
+    cd nitro-tensorrt-llm; \
+    git checkout tensorrt-llm-nitro-rel; \
+    git submodule update --init --recursive; \
+    pip install --no-cache-dir -r .\requirements-dev-windows.txt; \
+    cd cpp/tensorrt_llm/nitro; \
+    cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \
+    cmake --build ./build_deps/nitro_deps --config Release
+
+RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools"
+
+RUN VsDevCmd.bat -arch=amd64 && \
+    powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'"
+
+# # -----------------------------------------------------------------------------
+
+# GitHub Actions runner version
+ARG RUNNER_VERSION=2.314.1
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Invoke-WebRequest \ + -Uri https://github.com/actions/runner/releases/download/v$env:RUNNER_VERSION/actions-runner-win-x64-$env:RUNNER_VERSION.zip \ + -OutFile runner.zip; \ + Expand-Archive -Path ./runner.zip -DestinationPath ./actions-runner; \ + Remove-Item -Path .\runner.zip; \ + setx /M PATH $(${Env:PATH} + \";${Env:ProgramFiles}\Git\bin\") + +ADD runner.ps1 ./runner.ps1 + +RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + +RUN powershell -Command icacls 'C:\workspace\nitro-tensorrt-llm' /grant 'Everyone:F' /T + +CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/runners/Dockerfile.window.runner-turing b/.github/runners/Dockerfile.window.runner-turing new file mode 100644 index 00000000000..ee35f0428c1 --- /dev/null +++ b/.github/runners/Dockerfile.window.runner-turing @@ -0,0 +1,291 @@ +# Use the Windows Server Core 2019 image. +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 + +# Use the Windows Server Core 2019 image. +FROM mcr.microsoft.com/windows/servercore:ltsc2019 + +# Restore the default Windows shell for correct batch processing. +# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. 
+# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. + && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... 
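+# (Hypothetical illustration, not used in this image: several of the MSI/EXE
+# steps above could likely be replaced with one-liners such as
+# `choco install cmake --version=3.27.7 -y`, but the manual installers are
+# what was actually tested here.)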
+
+# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation
+# https://stackoverflow.com/a/76470753
+ENV chocolateyVersion=1.4.0
+
+# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \
+    -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \
+    iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \
+    SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin"
+
+# -----------------------------------------------------------------------------
+
+# Install Git via Chocolatey
+RUN powershell -Command \
+    choco install git -y
+
+# -----------------------------------------------------------------------------
+
+# Install CUDA 11.8 NVTX
+
+#RUN powershell -Command \
+#    $ErrorActionPreference = 'Stop'; \
+#    curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \
+#    --output "cuda_11_installer.exe"; \
+#    Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \
+#    Remove-Item cuda_11_installer.exe -Force
+
+# The above command-line installation method installs NVTX headers at
+# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\
+# CMake can't find this location for some reason.
+# Instead, we just copy the older NvToolsExt version to where CMake expects.
+# This assumes NvToolsExt was installed on the host machine using the
+# CUDA 11.8 GUI installer and copied to the build context
+
+# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"]
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \
+    --output NvToolsExt.zip; \
+    Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \
+    Remove-Item NvToolsExt.zip -Force
+
+# -----------------------------------------------------------------------------
+
+# Create a working directory
+WORKDIR "C:\\\\workspace"
+
+# -----------------------------------------------------------------------------
+
+# Download and unzip TensorRT 9.2.0.5 for TensorRT-LLM
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \
+    --output TensorRT-9.2.0.5.zip; \
+    Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \
+    Remove-Item TensorRT-9.2.0.5.zip -Force
+
+# Add TensorRT libs to Path
+RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib"
+
+# Install TensorRT Python wheel
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl
+
+# -----------------------------------------------------------------------------
+
+# Copy cuDNN into the working directory
+# This assumes cuDNN exists on the host machine in the build context
+# COPY ["cuDNN", "cuDNN"]
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \
+    --output cuDNN.zip; \
+    Expand-Archive .\cuDNN.zip -DestinationPath .; \
+    Remove-Item cuDNN.zip -Force
+
+# Add cuDNN libs and bin to Path.
+RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;"
+
+# -----------------------------------------------------------------------------
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA
+ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+# -----------------------------------------------------------------------------
+
+# Additional dependencies to build Nitro
+
+# The command below lets MSVC recognize the CUDA compiler
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations'
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations'
+
+
+# Set git safe directory for nitro clone dependencies
+RUN powershell -Command \
+    git config --global --add safe.directory '*'
+
+# Packages for nitro compile
+RUN powershell -Command \
+    choco install pkgconfiglite --allow-empty-checksums -y
+
+RUN powershell -Command \
+    choco install Ninja -y
+
+RUN choco install 7zip -y; \
+    7z --help
+
+# Requirements to build tensorrt-llm on windows
+# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt
+# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt
+# RUN powershell -Command \
+#     cd tensorrt-llm-nitro; \
+#     pip install --no-cache-dir -r .\requirements-dev-windows.txt
+
+# COPY ./.git ./tensorrt-llm-nitro/.git
+
+# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty
+
+# COPY ./cpp ./tensorrt-llm-nitro/cpp
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    git clone https://github.com/janhq/nitro-tensorrt-llm.git; \
+    cd nitro-tensorrt-llm; \
+    git checkout tensorrt-llm-nitro-rel; \
+    git submodule update --init --recursive; \
+    pip install --no-cache-dir -r .\requirements-dev-windows.txt; \
+    cd cpp/tensorrt_llm/nitro; \
+    cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \
+    cmake --build ./build_deps/nitro_deps --config Release
+
+RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools"
+
+RUN VsDevCmd.bat -arch=amd64 && \
+    powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '75-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'"
+
+# # -----------------------------------------------------------------------------
+
+# GitHub Actions runner version
+ARG RUNNER_VERSION=2.314.1
+
+# Define the entry point for the docker container.
+# This entry point launches the 64-bit PowerShell developer shell.
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Invoke-WebRequest \ + -Uri https://github.com/actions/runner/releases/download/v$env:RUNNER_VERSION/actions-runner-win-x64-$env:RUNNER_VERSION.zip \ + -OutFile runner.zip; \ + Expand-Archive -Path ./runner.zip -DestinationPath ./actions-runner; \ + Remove-Item -Path .\runner.zip; \ + setx /M PATH $(${Env:PATH} + \";${Env:ProgramFiles}\Git\bin\") + +ADD runner.ps1 ./runner.ps1 + +RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + +RUN powershell -Command icacls 'C:\workspace\nitro-tensorrt-llm' /grant 'Everyone:F' /T + +CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/runners/runner.ps1 b/.github/runners/runner.ps1 new file mode 100644 index 00000000000..a08f3725bf1 --- /dev/null +++ b/.github/runners/runner.ps1 @@ -0,0 +1,2 @@ +.\actions-runner\config.cmd --unattended --replace --url https://github.com/${env:RUNNER_REPO} --pat $env:RUNNER_PAT --runnergroup $env:RUNNER_GROUP --labels $env:RUNNER_LABELS --name $env:RUNNER_NAME --work $env:RUNNER_WORKDIR; +.\actions-runner\run.cmd; \ No newline at end of file diff --git a/.github/workflows/python-windows-build-release.yml b/.github/workflows/python-windows-build-release.yml new file mode 100644 index 00000000000..fbfe5e76ba6 --- /dev/null +++ b/.github/workflows/python-windows-build-release.yml @@ -0,0 +1,87 @@ +name: Release for python Windows +on: + push: + tags: ["python-windows-*"] + +jobs: + create-draft-release: + runs-on: ubuntu-latest + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + version: ${{ steps.get_version.outputs.version }} + permissions: + contents: write + steps: + - name: Extract tag name prefix + id: get_version + run: echo "VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/}" + env: + GITHUB_REF: ${{ github.ref }} + - name: Create Draft Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref_name }} + release_name: "${{ env.VERSION }}" + draft: true + prerelease: false + windows-build: + needs: create-draft-release + runs-on: windows-nitro-tensorrt-llm-${{ matrix.cuda_arch_name }} + strategy: + matrix: + include: + - cuda_arch: '80-real;86-real' + cuda_arch_name: 'ampere' + - cuda_arch: '89-real' + cuda_arch_name: 'ada' + - cuda_arch: '75-real' + cuda_arch_name: 'turing' + permissions: + contents: write + steps: + - uses: actions/setup-dotnet@v3 + with: + dotnet-version: "6.0.x" + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + lfs: true + + - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm + + - uses: nick-fields/retry@v3 + with: + timeout_minutes: 45 + max_attempts: 3 + shell: powershell + command: | + cd C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro; powershell 
-Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release
+
+      - name: Build Python
+        shell: powershell
+        run: |
+          cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '${{ matrix.cuda_arch }}' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'"
+
+      - name: Build nitro
+        shell: powershell
+        run: |
+          cd C:\workspace\nitro-tensorrt-llm\cpp\build
+          powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='${{ matrix.cuda_arch }}' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja"
+          powershell -Command "cmake --build . --parallel 2 --config Release"
+          tar -czvf python.tar.gz .\build\*.whl
+
+      - uses: actions/upload-release-asset@v1.0.1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: ./python.tar.gz
+          asset_name: ${{ needs.create-draft-release.outputs.version }}-tensorrt-llm-${{ matrix.cuda_arch_name }}.tar.gz
+          asset_content_type: application/gzip
diff --git a/.github/workflows/windows-build-manual.yml b/.github/workflows/windows-build-manual.yml
new file mode 100644
index 00000000000..b3e324ae6ed
--- /dev/null
+++ b/.github/workflows/windows-build-manual.yml
@@ -0,0 +1,73 @@
+name: Manual Build for Windows
+on:
+  workflow_dispatch:
+
+jobs:
+  windows-build:
+    runs-on: windows-nitro-tensorrt-llm-${{ matrix.cuda_arch_name }}
+    strategy:
+      matrix:
+        include:
+          - cuda_arch: '80-real;86-real'
+            cuda_arch_name: 'ampere'
+          - cuda_arch: '89-real'
+            cuda_arch_name: 'ada'
+          - cuda_arch: '75-real'
+            cuda_arch_name: 'turing'
+    permissions:
+      contents: write
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+          lfs: true
+
+      - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm
+
+      - uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 45
+          max_attempts: 3
+          shell: powershell
+          command: |
+            cd C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release
+
+      - name: Build Python
+        shell: powershell
+        run: |
+          cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '${{ matrix.cuda_arch }}' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'"
+
+      - name: Build nitro
+        shell: powershell
+        run: |
+          cd C:\workspace\nitro-tensorrt-llm\cpp\build
+          powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='${{ matrix.cuda_arch }}' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja"
+          powershell -Command "cmake --build . --parallel 2 --config Release"
+
+      - name: create nitro artifact with dll file
+        shell: powershell
+        run: |
+          mkdir build_nitro
+          cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro
+          cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro
+          cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro
+          cp -Force C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro
+          cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro
+          cp -Force C:\workspace\TensorRT-9.2.0.5\lib\nvinfer.dll .\build_nitro
+          cp -Force C:\Windows\SysWOW64\msmpi.dll .\build_nitro
+          cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro
+          ls .\build_nitro
+
+      - name: Upload Artifact
+        uses: actions/upload-artifact@v2
+        with:
+          name: nitro-tensorrt-llm-windows-${{ matrix.cuda_arch_name }}
+          path: ./build_nitro
+
+      - name: Upload Artifact
+        uses: actions/upload-artifact@v2
+        with:
+          name: python-tensorrt-llm-${{ matrix.cuda_arch }}-wheel
+          path: C:/workspace/nitro-tensorrt-llm/build
diff --git a/.github/workflows/windows-build-release.yml b/.github/workflows/windows-build-release.yml
new file mode 100644
index 00000000000..d4922a537a1
--- /dev/null
+++ b/.github/workflows/windows-build-release.yml
@@ -0,0 +1,103 @@
+name: Release for Windows
+on:
+  push:
+    tags: ["windows-v[0-9]+.[0-9]+.[0-9]+"]
+
+jobs:
+  create-draft-release:
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+    outputs:
+      upload_url: ${{ steps.create_release.outputs.upload_url }}
+      version: ${{ steps.get_version.outputs.version }}
+    permissions:
+      contents: write
+    steps:
+      - name: Extract tag name without v prefix
+        id: get_version
+        run: echo "VERSION=${GITHUB_REF#refs/tags/windows-v}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/windows-v}"
+        env:
+          GITHUB_REF: ${{ github.ref }}
+      - name: Create Draft Release
+        id: create_release
+        uses: actions/create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ github.ref_name }}
+          release_name: "${{ env.VERSION }}"
+          draft: true
+          prerelease: false
+  windows-build:
+    needs: create-draft-release
+    runs-on: windows-nitro-tensorrt-llm-${{ matrix.cuda_arch_name }}
+    strategy:
+      matrix:
+        include:
+          - cuda_arch: '80-real;86-real'
+            cuda_arch_name: 'ampere'
+          - cuda_arch: '89-real'
+            cuda_arch_name: 'ada'
+          - cuda_arch: '75-real'
+            cuda_arch_name: 'turing'
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/setup-dotnet@v3
+        with:
+          dotnet-version: "6.0.x"
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+          lfs: true
+
+      - run: cp -r -Force C:\w\nitro-tensorrt-llm\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm
+
+      - uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 45
+          max_attempts: 3
+          shell: powershell
+          command: |
+            cd C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro; powershell -Command cmake -S ./nitro_deps -B ./build_deps/nitro_deps; powershell -Command cmake --build ./build_deps/nitro_deps --config Release
+
+      - name: Build Python
+        shell: powershell
+        run: |
+          cd C:\workspace\nitro-tensorrt-llm; powershell -Command "python .\scripts\build_wheel.py -a '${{ matrix.cuda_arch }}' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'"
+
+      - name: Build nitro
+        shell: powershell
+        run: |
+          cd C:\workspace\nitro-tensorrt-llm\cpp\build
+          powershell -Command "cmake .. -DCMAKE_CUDA_ARCHITECTURES='${{ matrix.cuda_arch }}' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.2.0.5/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.2.0.5/include' -DBUILD_NITRO=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -G Ninja"
+          powershell -Command "cmake --build . --parallel 2 --config Release"
+
+      - name: create nitro artifact with dll file
+        shell: powershell
+        run: |
+          mkdir build_nitro
+          cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\nitro\nitro.exe .\build_nitro
+          cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\tensorrt_llm.dll .\build_nitro
+          cp -Force C:\workspace\nitro-tensorrt-llm\cpp\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll .\build_nitro
+          cp -Force C:\workspace\nitro-tensorrt-llm\cpp\tensorrt_llm\nitro\build_deps\_install\bin\zlib.dll .\build_nitro
+          cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll .\build_nitro
+          cp -Force C:\workspace\TensorRT-9.2.0.5\lib\nvinfer.dll .\build_nitro
+          cp -Force C:\Windows\SysWOW64\msmpi.dll .\build_nitro
+          cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll .\build_nitro
+          ls .\build_nitro
+          dotnet tool install --global AzureSignTool
+          & "$env:USERPROFILE\.dotnet\tools\azuresigntool.exe" sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build_nitro\nitro.exe"
+          tar -czvf nitro.tar.gz .\build_nitro
+
+      - uses: actions/upload-release-asset@v1.0.1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: ./nitro.tar.gz
+          asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-amd64-tensorrt-llm-${{ matrix.cuda_arch_name }}.tar.gz
+          asset_content_type: application/gzip
diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index 39c6a83f231..a8f2c80db05 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit 39c6a83f231d6db2bc6b9c251e7add77d68cbfb4
+Subproject commit a8f2c80db0564c74f4efccac71993b971dfc448b
diff --git a/BUILD_ENGINE_MODEL.md b/BUILD_ENGINE_MODEL.md
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/BUILD_NITRO.md b/BUILD_NITRO.md
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/Dockerfile.nitro.windows b/Dockerfile.nitro.windows
new file mode 100644
index 00000000000..5dcbcde66ae
--- /dev/null
+++ b/Dockerfile.nitro.windows
@@ -0,0 +1,266 @@
+# Use the Windows Server Core 2019 image.
+# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022
+
+# Use the Windows Server Core 2019 image.
+FROM mcr.microsoft.com/windows/servercore:ltsc2019
+
+# Restore the default Windows shell for correct batch processing.
+# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuda_12.2.2_537.13_windows.exe \ + --output "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/python-3.10.11-amd64.exe --output python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. +# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisetup.exe \ + --output "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/msmpisdk.msi \ + --output "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cmake-3.27.7-windows-x86_64.msi \ + --output "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. 
+ && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/gvim90.exe \ + --output "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... + +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX + +#RUN powershell -Command \ +# $ErrorActionPreference = 'Stop'; \ +# curl.exe https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ +# --output "cuda_11_installer.exe"; \ +# Start-Process cuda_11_installer.exe -Wait -ArgumentList '-s nvtx_11.8'; \ +# Remove-Item cuda_11_installer.exe -Force + +# The above command-line installation method installs NVTX headers at +# C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include\nvtx3\ +# CMake can't find this location for some reason. +# Instead, we just copy the older NvToolsExt version to where CMake expects. 
+# This assumes NvToolsExt was installed on the host machine using the +# CUDA 11.8 GUI installer and copied to the build context + +# COPY ["NvToolsExt", "C:\\\\Program Files\\\\NVIDIA Corporation\\\\NvToolsExt"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/NvToolsExt.zip \ + --output NvToolsExt.zip; \ + Expand-Archive .\NvToolsExt.zip -DestinationPath 'C:\Program Files\NVIDIA Corporation\'; \ + Remove-Item NvToolsExt.zip -Force + +# ----------------------------------------------------------------------------- + +# Create a working directory +WORKDIR "C:\\\\workspace" + +# ----------------------------------------------------------------------------- + +# Download and unzip TensorrRT 9.2.0.5 for TensorRT-LLM +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/TensorRT-9.2.0.5.Windows10.x86_64.cuda-12.2.llm.beta.zip \ + --output TensorRT-9.2.0.5.zip; \ + Expand-Archive .\TensorRT-9.2.0.5.zip -DestinationPath .; \ + Remove-Item TensorRT-9.2.0.5.zip -Force + +# Add TensorRT libs to Path +RUN setx Path "%Path%;C:\workspace\TensorRT-9.2.0.5\lib" + +# Install TensorRT Python wheel +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + pip install TensorRT-9.2.0.5\python\tensorrt-9.2.0.post12.dev5-cp310-none-win_amd64.whl + +# ----------------------------------------------------------------------------- + +# Copy cuDNN into the working directory +# This assumes cuDNN exists on the host machine in the build context +# COPY ["cuDNN", "cuDNN"] +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + curl.exe https://delta.jan.ai/dist/windows-container-dependencies/1/cuDNN.zip \ + --output cuDNN.zip; \ + Expand-Archive .\cuDNN.zip -DestinationPath .; \ + Remove-Item cuDNN.zip -Force + +# Add cuDNN libs and bin to Path. +RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" + +# ----------------------------------------------------------------------------- + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
+# We need to launch with the amd64 arch; otherwise PowerShell defaults to the
+# x86 32-bit build tools, which do not work with CUDA.
+ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"]
+
+# -----------------------------------------------------------------------------
+
+# Additional dependencies to build Nitro
+
+# The command below lets MSVC recognize the CUDA compiler (v170 / VS 2022 toolset)
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations'
+
+# Same CUDA build customizations for the v160 / VS 2019 toolset
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations'
+
+# Mark all directories as Git safe directories so Nitro's cloned dependencies work
+RUN powershell -Command \
+    git config --global --add safe.directory '*'
+
+# Packages required to compile Nitro
+RUN powershell -Command \
+    choco install pkgconfiglite --allow-empty-checksums -y
+
+RUN powershell -Command \
+    choco install Ninja -y
+
+RUN choco install 7zip -y; \
+    7z --help
+
+# Requirements to build TensorRT-LLM on Windows
+# COPY ./requirements-windows.txt ./tensorrt-llm-nitro/requirements-windows.txt
+# COPY ./requirements-dev-windows.txt ./tensorrt-llm-nitro/requirements-dev-windows.txt
+# RUN powershell -Command \
+#     cd tensorrt-llm-nitro; \
+#     pip install --no-cache-dir -r .\requirements-dev-windows.txt
+
+# COPY ./.git ./tensorrt-llm-nitro/.git
+
+# COPY ./3rdparty ./tensorrt-llm-nitro/3rdparty
+
+# COPY ./cpp ./tensorrt-llm-nitro/cpp
+
+RUN powershell -Command \
+    $ErrorActionPreference = 'Stop'; \
+    git clone https://github.com/janhq/nitro-tensorrt-llm.git; \
+    cd nitro-tensorrt-llm; \
+    git checkout tensorrt-llm-nitro-rel; \
+    git submodule update --init --recursive; \
+    pip install --no-cache-dir -r .\requirements-dev-windows.txt; \
+    cd cpp/tensorrt_llm/nitro; \
+    cmake -S ./nitro_deps -B ./build_deps/nitro_deps; \
+    cmake --build ./build_deps/nitro_deps --config Release
+
+RUN setx Path "%Path%;C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools"
+
+RUN VsDevCmd.bat -arch=amd64 && \
+    powershell.exe -NoLogo -ExecutionPolicy Bypass "cd nitro-tensorrt-llm; python .\scripts\build_wheel.py -a '80-real;86-real' --trt_root 'C:\workspace\TensorRT-9.2.0.5\'"
+
+# # -----------------------------------------------------------------------------
\ No newline at end of file
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6ef4b374a4f..37adf9dd9f3 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -17,6 +17,7 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_BUILD_TYPE Release)
 
 include(CheckLanguage)
 include(cmake/modules/set_ifndef.cmake)
@@ -29,9 +30,10 @@ project(tensorrt_llm LANGUAGES CXX)
 # Build options
 option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON)
 option(BUILD_PYBIND "Build Python bindings for C++ runtime and batch manager"
-       ON)
-option(BUILD_TESTS "Build Google tests" ON)
-option(BUILD_BENCHMARKS "Build benchmarks" ON)
+       OFF)
+option(BUILD_TESTS "Build Google tests" OFF)
+option(BUILD_BENCHMARKS "Build benchmarks" OFF) +option(BUILD_NITRO "Build nitro" ON) option(NVTX_DISABLE "Disable all NVTX features" ON) option(WARNING_IS_ERROR "Treat all warnings as errors" OFF) option(FAST_BUILD "Skip compiling some kernels to accelerate compiling" OFF) @@ -44,12 +46,7 @@ else() message(STATUS "NVTX is enabled") endif() -if(EXISTS - "${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/batch_manager/CMakeLists.txt") - set(BUILD_BATCH_MANAGER_DEFAULT ON) -else() - set(BUILD_BATCH_MANAGER_DEFAULT OFF) -endif() +set(BUILD_BATCH_MANAGER_DEFAULT OFF) option(BUILD_BATCH_MANAGER "Build batch manager from source" ${BUILD_BATCH_MANAGER_DEFAULT}) @@ -129,9 +126,9 @@ endif() # Initialize CMAKE_CUDA_ARCHITECTURES before enabling CUDA if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8") - set(CMAKE_CUDA_ARCHITECTURES 70-real 80-real 86-real 89-real 90-real) + set(CMAKE_CUDA_ARCHITECTURES 89-real) else() - set(CMAKE_CUDA_ARCHITECTURES 70-real 80-real 86-real) + set(CMAKE_CUDA_ARCHITECTURES 89-real) endif() endif() @@ -177,8 +174,8 @@ include_directories( ${3RDPARTY_DIR}/json/include) # TRT dependencies -set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR}) -set_ifndef(TRT_INCLUDE_DIR /usr/include/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu) +set_ifndef(TRT_LIB_DIR /usr/local/tensorrt/lib) +set_ifndef(TRT_INCLUDE_DIR /usr/local/tensorrt/include) set(TRT_LIB nvinfer) find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR}) diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index bcbf107e04a..29583f0f6c9 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -188,3 +188,7 @@ if(BUILD_PYBIND) endif() add_subdirectory(plugins) + +if(BUILD_NITRO) + add_subdirectory(nitro) +endif() \ No newline at end of file diff --git a/cpp/tensorrt_llm/nitro/CMakeLists.txt b/cpp/tensorrt_llm/nitro/CMakeLists.txt new file mode 100644 index 00000000000..5b852afab13 --- /dev/null +++ b/cpp/tensorrt_llm/nitro/CMakeLists.txt @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & +# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
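+# Build sketch (invocation assumed, not taken from this patch): with the
+# BUILD_NITRO option added above, Nitro is compiled as part of the main
+# TensorRT-LLM CMake project, e.g.
+#
+#   cmake -S cpp -B cpp/build -DBUILD_NITRO=ON
+#   cmake --build cpp/build --config Release --target nitro
+#
+# install_deps.sh (later in this patch) must have populated build_deps/_install
+# first so that Drogon and SentencePiece can be found via CMAKE_PREFIX_PATH.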
+# C++17
+# Nitro init
+# The executable is pinned to C++17 below (see target_compile_features), so no
+# feature-detection block is needed here.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/build_deps/_install)
+
+message(STATUS "Current Source Directory NITRO: ${CMAKE_CURRENT_SOURCE_DIR}")
+message(STATUS "Current CMake Prefix Path of NITRO: ${CMAKE_PREFIX_PATH}")
+
+set(OPENSSL_USE_STATIC_LIBS TRUE)
+
+# Enable pkg-config support in CMake
+find_package(PkgConfig REQUIRED)
+find_package(Drogon CONFIG REQUIRED)
+
+if(NOT WIN32) # Linux
+  # Use pkg-config to find the SentencePiece library
+  pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece)
+else() # Windows
+  set(SENTENCEPIECE_INCLUDE_DIRS "${CMAKE_PREFIX_PATH}/include")
+  set(SENTENCEPIECE_LIBRARY_DIRS "${CMAKE_PREFIX_PATH}/lib")
+endif()
+
+message(STATUS "SentencePiece library dirs: ${SENTENCEPIECE_LIBRARY_DIRS}")
+message(STATUS "SentencePiece header dirs: ${SENTENCEPIECE_INCLUDE_DIRS}")
+
+include_directories(${PROJECT_SOURCE_DIR}/include ${SENTENCEPIECE_INCLUDE_DIRS})
+
+link_directories(${SENTENCEPIECE_LIBRARY_DIRS})
+
+set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")
+
+add_custom_target(nitro_proj)
+
+set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts)
+add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)
+
+# main
+add_executable(nitro main.cc)
+
+target_link_libraries(
+  nitro
+  PUBLIC ${SHARED_TARGET} sentencepiece nvinfer_plugin_tensorrt_llm
+         cxxopts::cxxopts
+  PRIVATE Drogon::Drogon ${CMAKE_THREAD_LIBS_INIT})
+
+target_compile_features(nitro PRIVATE cxx_std_17)
+target_compile_definitions(nitro PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")
+
+aux_source_directory(controllers CTL_SRC)
+aux_source_directory(common COMMON_SRC)
+aux_source_directory(context CONTEXT_SRC)
+aux_source_directory(models MODEL_SRC)
+
+target_include_directories(nitro PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+target_sources(nitro PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC})
+
+add_dependencies(nitro_proj nitro)
diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc
new file mode 100644
index 00000000000..c3891440dd3
--- /dev/null
+++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc
@@ -0,0 +1,389 @@
+#include "tensorrtllm.h"
+#include "models/chat_completion_request.h"
+#include "nlohmann/json.hpp"
+#include "tensorrt_llm/runtime/generationInput.h"
+#include "tensorrt_llm/runtime/generationOutput.h"
+#include "tensorrt_llm/runtime/samplingConfig.h"
+#include "utils/nitro_utils.h"
+#include <algorithm>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <ctime>
+#include <filesystem>
+#include <functional>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <thread>
+#include <vector>
+
+using json = nlohmann::json;
+using namespace inferences;
+
+void removeId(std::vector<int32_t>& vec, int id)
+{
+    vec.erase(std::remove(vec.begin(), vec.end(), id), vec.end());
+}
+
+struct inferenceState
+{
+    int prevPos{0};
+    std::string prevText;
+    bool isFinished = false;
+    std::queue<std::string> textsToStream;
+    std::mutex queueMutex; // Mutex to protect access to textsToStream
+
+    size_t stopWordMatchLen = 0;
+    std::vector<std::string> sequence{"<", "|", "im", "_", "end", "|", ">"};
+
+    void reset()
+    {
+        stopWordMatchLen = 0;
+        prevText = "";
+    }
+
+    bool isComplete() const
+    {
+        return stopWordMatchLen >= sequence.size();
+    }
+};
+
+bool handleMatch(const std::string& rawText, std::shared_ptr<inferenceState> inferState)
+{
+    if (inferState->isComplete())
+    {
+        return true;
+    }
+
+    if (rawText == inferState->sequence[inferState->stopWordMatchLen])
+    {
+        inferState->stopWordMatchLen++; // Move to next state
+        inferState->prevText = rawText;
+        return true;
+    }
+    else if (inferState->stopWordMatchLen > 0 && rawText == inferState->sequence[0])
+    {
+        inferState->stopWordMatchLen = 1; // Restart from first match if sequence breaks but matches start
+        inferState->prevText = rawText;
+        return true;
+    }
+    else
+    {
+        inferState->reset();
+        return false; // Reset to start if sequence breaks
+    }
+}
+
+// Only a single-token stopping point is supported for now
+std::string create_return_json(const std::string& id, const std::string& model, const std::string& content,
+    Json::Value finish_reason = Json::Value())
+{
+    Json::Value root;
+
+    root["id"] = id;
+    root["model"] = model;
+    root["created"] = static_cast<int>(std::time(nullptr));
+    root["object"] = "chat.completion.chunk";
+
+    Json::Value choicesArray(Json::arrayValue);
+    Json::Value choice;
+
+    choice["index"] = 0;
+    Json::Value delta;
+    delta["content"] = content;
+    choice["delta"] = delta;
+    choice["finish_reason"] = finish_reason;
+
+    choicesArray.append(choice);
+    root["choices"] = choicesArray;
+
+    Json::StreamWriterBuilder writer;
+    writer["indentation"] = ""; // This sets the indentation to an empty string,
+                                // producing compact output.
+    return Json::writeString(writer, root);
+}
+
+GenerationInput::TensorPtr tensorrtllm::getTensorSingleStopWordList(int stopToken)
+{
+    std::vector<int32_t> stopWordsTokens = {stopToken, -1, 1, -1}; // Extend with -1 for increased length
+    return gptSession->getBufferManager().copyFrom(stopWordsTokens, ITensor::makeShape({1, 2, 2}), MemoryType::kGPU);
+}
+
+GenerationInput::TensorPtr tensorrtllm::getTensorChatMLStopWordList()
+{
+    std::vector<int32_t> stopWordsTokens = {28789, 28766, 321, 28730, 416, 28766, 28767, 32000, 6, 8, -1, -1, -1, -1,
+        -1, -1}; // Extend with -1 for increased length
+    return gptSession->getBufferManager().copyFrom(stopWordsTokens, ITensor::makeShape({1, 2, 8}), MemoryType::kGPU);
+}
+
+GenerationInput tensorrtllm::createGenerationInput(std::vector<int32_t> inputIdsHost)
+{
+    int inputLen = inputIdsHost.size();
+    std::vector<int32_t> inputLengthsHost(batchSize, inputLen);
+    GenerationInput::TensorPtr inputLengths
+        = gptSession->getBufferManager().copyFrom(inputLengthsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU);
+    GenerationInput::TensorPtr inputIds = gptSession->getBufferManager().copyFrom(
+        inputIdsHost, ITensor::makeShape({batchSize, inputLen}), MemoryType::kGPU);
+
+    GenerationInput generationInput{0, 0, inputIds, inputLengths, modelConfig->usePackedInput()};
+
+    generationInput.stopWordsList = getTensorChatMLStopWordList();
+    return generationInput;
+}
+
+GenerationOutput tensorrtllm::createGenerationOutput()
+{
+    GenerationOutput generationOutput{
+        gptSession->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32),
+        gptSession->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)};
+    return generationOutput;
+}
+
+void inferenceThread(std::shared_ptr<inferenceState> inferState, std::vector<int32_t> inputIdsHost,
+    std::function<void(const HttpResponsePtr&)> callback, tensorrtllm* self)
+{
+    const int inputLen = inputIdsHost.size();
+    const int outputLen = 2048 - inputLen;
+
+    // Create sampling config
+    SamplingConfig samplingConfig{1};
+    samplingConfig.temperature = std::vector<float>{0.0f};
+    samplingConfig.randomSeed = std::vector<uint64_t>{static_cast<uint64_t>(42ull)};
+    samplingConfig.topK = std::vector<SizeType>{40};
+    samplingConfig.topP = std::vector<float>{0.0f};
+    samplingConfig.minLength = std::vector<SizeType>{outputLen};
+    samplingConfig.repetitionPenalty = std::vector<float>{1.3f};
+
+    std::cout << "Start Nitro testing session: " << std::endl;
+
+    // Input preparation
+    GenerationInput generationInput = self->createGenerationInput(inputIdsHost);
+
+    GenerationOutput generationOutput = self->createGenerationOutput();
+
+    // Define the callback to stream each generated token
+    generationOutput.onTokenGenerated = [&inferState, inputLen, outputLen, self, &generationOutput](
+                                            GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished)
+    {
+        // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens
+        int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape
+        // Copy output IDs from GPU to host for decoding
+        std::vector<int32_t> outputIdsHost(outputLength);
+        self->gptSession->getBufferManager().copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU);
+        // Drop the prompt tokens (and any zero padding) before decoding
+        std::vector<int32_t> outputIdsHostDecode(outputIdsHost.begin() + inputLen, outputIdsHost.end());
+        removeId(outputIdsHostDecode, 0);
+        std::string text = self->nitro_tokenizer->decode(outputIdsHostDecode);
+
+        if (inferState->prevPos > 0 && inferState->prevPos < static_cast<int>(text.size()))
+        {
+            // Valid prevPos, proceed with slicing the string from prevPos to the end
+            std::string stringTok(text.begin() + inferState->prevPos, text.end());
+            std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
+            inferState->textsToStream.push(stringTok);
+        }
+        inferState->prevPos = text.size();
+        if (finished)
+        {
+            std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
+            inferState->textsToStream.push("[DONE]");
+            return;
+        }
+    };
+    // The rest of the logic inside `chat_completion` remains unchanged...
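+    // Design note: this function is the producer half of a simple
+    // producer/consumer pair. generate() below blocks this worker thread while
+    // onTokenGenerated pushes decoded fragments into inferState->textsToStream
+    // under queueMutex; the chunked_content_provider in chat_completion drains
+    // the queue on Drogon's side and emits each fragment as an SSE "data:"
+    // frame. The stop-words tensors above follow TensorRT-LLM's packed
+    // [batch, 2, maxStopWordsLen] layout: row 0 holds the concatenated token
+    // ids, row 1 the end offsets, padded with -1.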
+    // After finishing the setup, call the inference logic
+    self->gptSession->generate(generationOutput, generationInput, samplingConfig);
+}
+
+void tensorrtllm::chat_completion(
+    inferences::ChatCompletionRequest&& completion, std::function<void(const HttpResponsePtr&)>&& callback)
+{
+    std::string formatted_input = pre_prompt;
+
+    nlohmann::json data;
+
+    data["stream"] = completion.stream;
+    data["n_predict"] = completion.max_tokens;
+    data["top_p"] = completion.top_p;
+    data["temperature"] = completion.temperature;
+    data["frequency_penalty"] = completion.frequency_penalty;
+    data["presence_penalty"] = completion.presence_penalty;
+    const Json::Value& messages = completion.messages;
+
+    // Format the input from user
+    for (const auto& message : messages)
+    {
+        std::string input_role = message["role"].asString();
+        std::string role;
+        if (input_role == "user")
+        {
+            role = user_prompt;
+            std::string content = message["content"].asString();
+            formatted_input += role + content;
+        }
+        else if (input_role == "assistant")
+        {
+            role = ai_prompt;
+            std::string content = message["content"].asString();
+            formatted_input += role + content;
+        }
+        else if (input_role == "system")
+        {
+            role = system_prompt;
+            std::string content = message["content"].asString();
+            formatted_input = role + content + formatted_input;
+        }
+        else
+        {
+            role = input_role;
+            std::string content = message["content"].asString();
+            formatted_input += role + content;
+        }
+    }
+    formatted_input += ai_prompt;
+
+    std::shared_ptr<inferenceState> inferState = std::make_shared<inferenceState>();
+
+    std::vector<int32_t> inputIdsHost = nitro_tokenizer->encode(formatted_input);
+
+    // Kick off generation on a worker thread; sampling is configured there
+    std::thread infThread(inferenceThread, inferState, inputIdsHost, callback, this);
+    infThread.detach(); // Detach the thread to allow it to run independently
+
+    auto chunked_content_provider = [inferState](char* pBuffer, std::size_t nBuffSize) -> std::size_t
+    {
+        if (!pBuffer)
+        {
+            LOG_INFO << "Connection closed or buffer is null. Reset context";
+            return 0; // Indicate no more data to send
+        }
+
+        if (inferState->isFinished)
+        {
+            return 0;
+        }
+
+        while (true) // Continuously check if the queue is not empty
+        {
+            std::unique_lock<std::mutex> lock(inferState->queueMutex); // Lock the queue for exclusive access
+            if (!inferState->textsToStream.empty())
+            {
+                std::string rawText = inferState->textsToStream.front();
+                inferState->textsToStream.pop();
+                if (handleMatch(rawText, inferState))
+                {
+                    continue;
+                }
+
+                if (rawText == "[DONE]")
+                {
+                    LOG_INFO << "End of result";
+                    const std::string str
+                        = "data: " + create_return_json(nitro_utils::generate_random_string(20), "_", "", "stop")
+                        + "\n\n" + "data: [DONE]" + "\n\n";
+
+                    std::size_t nRead = std::min(str.size(), nBuffSize);
+                    memcpy(pBuffer, str.data(), nRead);
+                    inferState->isFinished = true;
+                    return nRead;
+                }
+                const std::string textToStream
+                    = "data: " + create_return_json(nitro_utils::generate_random_string(20), "_", rawText) + "\n\n";
+                lock.unlock(); // Unlock as soon as possible
+
+                // Ensure we do not exceed the buffer size. Truncate if necessary.
+                std::size_t bytesToWrite = std::min(nBuffSize, textToStream.size());
+
+                // Copy the text to the provided buffer
+                std::memcpy(pBuffer, textToStream.data(), bytesToWrite);
+                inferState->prevText = rawText;
+                return bytesToWrite; // Return the number of bytes written to the buffer
+            }
+            else
+            {
+                // If the queue is empty, release the lock and back off briefly
+                // before trying again
+                lock.unlock();
+                std::this_thread::sleep_for(std::chrono::milliseconds(1));
+            }
+        }
+    };
+
+    auto streamResponse = nitro_utils::nitroStreamResponse(chunked_content_provider);
+    callback(streamResponse);
+    return;
+}
+
+void tensorrtllm::loadModel(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)>&& callback)
+{
+    const auto& jsonBody = req->getJsonObject();
+
+    if (!jsonBody)
+    {
+        Json::Value jsonResp;
+        jsonResp["message"] = "Require params!";
+        auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+        callback(resp);
+        return;
+    }
+
+    const std::filesystem::path engineDir = (*jsonBody)["engine_path"].asString();
+    int ctx_len = jsonBody->get("ctx_len", 2048).asInt();
+
+    logger = std::make_shared<TllmLogger>();
+    logger->setLevel(nvinfer1::ILogger::Severity::kINFO);
+    // Fixed settings
+    const std::string modelName = "mistral";
+    initTrtLlmPlugins(logger.get());
+    // Load model configuration
+    std::filesystem::path jsonFileName = engineDir / "config.json";
+    std::filesystem::path tokenizerModelName = engineDir / "tokenizer.model";
+
+    nitro_tokenizer = std::make_unique<Tokenizer>(tokenizerModelName.string());
+    LOG_INFO << "Loaded tokenizer";
+
+    auto const json = GptJsonConfig::parse(jsonFileName);
+    auto config = json.getModelConfig();
+    modelConfig = std::make_unique<GptModelConfig>(config);
+    auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism());
+    auto const enginePath = engineDir / json.engineFilename(worldConfig, modelName);
+    LOG_INFO << "Engine Path : " << enginePath.string();
+
+    // Currently using a fixed session config
+    sessionConfig.maxBatchSize = batchSize;
+    sessionConfig.maxBeamWidth = 1;     // Fixed for simplicity
+    sessionConfig.maxSequenceLength = ctx_len;
+    sessionConfig.cudaGraphMode = true; // Fixed for simplicity
+
+    // Init gptSession
+    gptSession = std::make_unique<GptSession>(sessionConfig, *modelConfig, worldConfig, enginePath.string(), logger);
+    // Model loaded successfully
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model loaded successfully";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+    return;
+}
diff --git a/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
new file mode 100644
index 00000000000..40454829f6b
--- /dev/null
+++ b/cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include "drogon/HttpTypes.h"
+#include "sentencepiece_processor.h"
+#include <drogon/HttpController.h>
+#include <iostream>
+
+#include "tensorrt_llm/plugins/api/tllmPlugin.h"
+#include "tensorrt_llm/runtime/generationInput.h"
+#include "tensorrt_llm/runtime/generationOutput.h"
+#include "tensorrt_llm/runtime/gptJsonConfig.h"
+#include "tensorrt_llm/runtime/gptModelConfig.h"
+#include "tensorrt_llm/runtime/gptSession.h"
+#include "tensorrt_llm/runtime/samplingConfig.h"
+#include "tensorrt_llm/runtime/tllmLogger.h"
+#include <cstdint>
+#include <filesystem>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "models/chat_completion_request.h"
+
+using namespace drogon;
+
+using namespace tensorrt_llm::runtime;
+
+class Tokenizer
+{
+private:
+    sentencepiece::SentencePieceProcessor processor;
+
+    void replaceSubstring(std::string& base, const std::string& from, const std::string& to)
+    {
+        size_t start_pos = 0;
+        while ((start_pos = base.find(from, start_pos)) != std::string::npos)
+        {
+            base.replace(start_pos, from.length(), to);
+            start_pos += to.length();
+        }
+    }
+
+public:
+    Tokenizer(const std::string& modelPath)
+    {
+        auto status = processor.Load(modelPath);
+        if (!status.ok())
+        {
+            std::cerr << status.ToString() << std::endl;
+        }
+        LOG_INFO << "Successfully loaded the tokenizer";
+    }
+
+    std::string decodeWithSpace(const int id)
+    {
+        std::string text = processor.IdToPiece(id);
+        replaceSubstring(text, "▁", " ");
+        return text;
+    }
+
+    std::string decode(const std::vector<int32_t>& ids)
+    {
+        std::string text = processor.DecodeIds(ids);
+        return text;
+    }
+
+    std::vector<int32_t> encode(const std::string& input)
+    {
+        std::vector<int32_t> ids;
+        processor.Encode(input, &ids);
+        return ids;
+    }
+};
+
+namespace inferences
+{
+
+class tensorrtllm : public drogon::HttpController<tensorrtllm>
+{
+public:
+    tensorrtllm(){};
+
+    METHOD_LIST_BEGIN
+    ADD_METHOD_TO(tensorrtllm::chat_completion, "/v1/chat/completions", Post); // path is /v1/chat/completions
+    METHOD_ADD(tensorrtllm::loadModel, "loadmodel", Post);
+    METHOD_LIST_END
+
+    void chat_completion(
+        inferences::ChatCompletionRequest&& completion, std::function<void(const HttpResponsePtr&)>&& callback);
+
+    void loadModel(const HttpRequestPtr& req, std::function<void(const HttpResponsePtr&)>&& callback);
+
+    std::unique_ptr<GptSession> gptSession;
+    GenerationInput::TensorPtr getTensorSingleStopWordList(int stopToken);
+    GenerationInput createGenerationInput(std::vector<int32_t> inputIds);
+    GenerationOutput createGenerationOutput();
+    std::unique_ptr<Tokenizer> nitro_tokenizer;
+    GenerationInput::TensorPtr getTensorChatMLStopWordList();
+
+private:
+    GptSession::Config sessionConfig{1, 1, 1};
+    SamplingConfig samplingConfig{1};
+    std::unique_ptr<GptModelConfig> modelConfig;
+    std::shared_ptr<TllmLogger> logger;
+    std::string example_string{
+        "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nPlease write a long and sad "
+        "story<|im_end|>\n<|im_start|>assistant"};
+    std::string user_prompt{"<|im_end|>\n<|im_start|>user\n"};
+    std::string ai_prompt{"<|im_end|>\n<|im_start|>assistant\n"};
+    std::string system_prompt{"<|im_start|>system\n"};
+    std::string pre_prompt;
+    int batchSize = 1;
+};
+
+} // namespace inferences
diff --git a/cpp/tensorrt_llm/nitro/install_deps.sh b/cpp/tensorrt_llm/nitro/install_deps.sh
new file mode 100755
index 00000000000..d43257aa08e
--- /dev/null
+++ b/cpp/tensorrt_llm/nitro/install_deps.sh
@@ -0,0 +1,3 @@
+cmake -S ./nitro_deps -B ./build_deps/nitro_deps
+make -C ./build_deps/nitro_deps -j 10
+rm -rf ./build_deps/nitro_deps
diff --git a/cpp/tensorrt_llm/nitro/main.cc b/cpp/tensorrt_llm/nitro/main.cc
new file mode 100644
index 00000000000..730253f74f3
--- /dev/null
+++ b/cpp/tensorrt_llm/nitro/main.cc
@@ -0,0 +1,73 @@
+#include "utils/nitro_utils.h"
+#include <climits> // for PATH_MAX
+#include <cstdlib>
+#include <drogon/drogon.h>
+#include <thread>
+
+#if defined(__APPLE__) && defined(__MACH__)
+#include <libgen.h> // for dirname()
+#include <mach-o/dyld.h>
+#elif defined(__linux__)
+#include <libgen.h> // for dirname()
+#include <unistd.h> // for readlink()
+#elif defined(_WIN32)
+#include <windows.h>
+#undef max
+#else
+#error "Unsupported platform!"
+#endif
+
+int main(int argc, char* argv[])
+{
+    int thread_num = 1;
+    std::string host = "127.0.0.1";
+    int port = 3928;
+    std::string uploads_folder_path;
+
+    // Number of nitro threads
+    if (argc > 1)
+    {
+        thread_num = std::atoi(argv[1]);
+    }
+
+    // Check for host argument
+    if (argc > 2)
+    {
+        host = argv[2];
+    }
+
+    // Check for port argument
+    if (argc > 3)
+    {
+        port = std::atoi(argv[3]); // Convert string argument to int
+    }
+
+    // Uploads folder path
+    if (argc > 4)
+    {
+        uploads_folder_path = argv[4];
+    }
+
+    int logical_cores = std::thread::hardware_concurrency();
+    int drogon_thread_num = 1; // temporarily set thread num to 1
+    nitro_utils::nitro_logo();
+#ifdef NITRO_VERSION
+    LOG_INFO << "Nitro version: " << NITRO_VERSION;
+#else
+    LOG_INFO << "Nitro version: undefined";
+#endif
+    LOG_INFO << "Server started, listening at: " << host << ":" << port;
+    LOG_INFO << "Please load your model";
+    drogon::app().addListener(host, port);
+    drogon::app().setThreadNum(drogon_thread_num);
+    if (!uploads_folder_path.empty())
+    {
+        LOG_INFO << "Drogon uploads folder is at: " << uploads_folder_path;
+        drogon::app().setUploadPath(uploads_folder_path);
+    }
+    LOG_INFO << "Number of threads is: " << drogon::app().getThreadNum();
+
+    drogon::app().run();
+
+    return 0;
+}
diff --git a/cpp/tensorrt_llm/nitro/models/chat_completion_request.h b/cpp/tensorrt_llm/nitro/models/chat_completion_request.h
new file mode 100644
index 00000000000..bd802d67e02
--- /dev/null
+++ b/cpp/tensorrt_llm/nitro/models/chat_completion_request.h
@@ -0,0 +1,36 @@
+#pragma once
+#include <drogon/HttpRequest.h>
+
+namespace inferences {
+struct ChatCompletionRequest {
+  bool stream = false;
+  int max_tokens = 500;
+  float top_p = 0.95f;
+  float temperature = 0.8f;
+  float frequency_penalty = 0;
+  float presence_penalty = 0;
+  Json::Value stop = Json::Value(Json::arrayValue);
+  Json::Value messages = Json::Value(Json::arrayValue);
+};
+} // namespace inferences
+
+namespace drogon {
+template <>
+inline inferences::ChatCompletionRequest fromRequest(const HttpRequest& req) {
+  auto jsonBody = req.getJsonObject();
+  inferences::ChatCompletionRequest completion;
+  if (jsonBody) {
+    completion.stream = (*jsonBody).get("stream", false).asBool();
+    completion.max_tokens = (*jsonBody).get("max_tokens", 500).asInt();
+    completion.top_p = (*jsonBody).get("top_p", 0.95).asFloat();
+    completion.temperature = (*jsonBody).get("temperature", 0.8).asFloat();
+    completion.frequency_penalty =
+        (*jsonBody).get("frequency_penalty", 0).asFloat();
+    completion.presence_penalty =
+        (*jsonBody).get("presence_penalty", 0).asFloat();
+    completion.messages = (*jsonBody)["messages"];
+    completion.stop = (*jsonBody)["stop"];
+  }
+  return completion;
+}
+} // namespace drogon
diff --git a/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt b/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt
new file mode 100644
index 00000000000..cd0d76a719e
--- /dev/null
+++ b/cpp/tensorrt_llm/nitro/nitro_deps/CMakeLists.txt
@@ -0,0 +1,108 @@
+cmake_minimum_required(VERSION 3.22)
+
+# Stages Nitro's third-party dependencies via ExternalProject
+project(nitro_deps)
+
+include(ExternalProject)
+
+# Define variables
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(THIRD_PARTY_INSTALL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../build_deps/_install)
+
+# To force find_package to pick up the static .a files from the self-installed
+# versions, the CMAKE_FIND_ROOT_PATH_MODE_* variables could be set to ONLY;
+# this is left disabled for now.
+# set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+# set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+# set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
+
+# Add the external projects
+set(ZLIB_USE_STATIC_LIBS OFF)
+find_package(ZLIB)
+if(NOT ZLIB_FOUND)
+  set(ZLIB_USE_STATIC_LIBS ON)
+  ExternalProject_Add(
+    zlib
+    GIT_REPOSITORY https://github.com/madler/zlib.git
+    GIT_TAG v1.2.11
+    CMAKE_ARGS
+      -DBUILD_SHARED_LIBS=OFF
+      -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH}
+  )
+endif()
+
+ExternalProject_Add(
+  brotli
+  GIT_REPOSITORY https://github.com/google/brotli
+  GIT_TAG v1.1.0
+  CMAKE_ARGS
+    -DCMAKE_BUILD_TYPE=Release
+    -DBUILD_SHARED_LIBS=OFF
+    -DSHARE_INSTALL_PREFIX=${CMAKE_CURRENT_SOURCE_DIR}/share
+    -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH}
+)
+
+ExternalProject_Add(
+  jsoncpp
+  GIT_REPOSITORY https://github.com/open-source-parsers/jsoncpp
+  GIT_TAG 1.9.5
+  CMAKE_ARGS
+    -DBUILD_SHARED_LIBS=OFF
+    -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH}
+)
+
+ExternalProject_Add(
+  c-ares
+  GIT_REPOSITORY https://github.com/c-ares/c-ares
+  GIT_TAG cares-1_26_0
+  CMAKE_ARGS
+    -DCARES_SHARED=OFF
+    -DCARES_STATIC=ON
+    -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH}
+)
+
+ExternalProject_Add(
+  drogon
+  GIT_REPOSITORY https://github.com/drogonframework/drogon
+  GIT_TAG v1.9.2
+  CMAKE_ARGS
+    -DCMAKE_BUILD_TYPE=Release
+    -DOPENSSL_USE_STATIC_LIBS=TRUE
+    -DZLIB_USE_STATIC_LIBS=${ZLIB_USE_STATIC_LIBS}
+    -DBUILD_ORM=OFF
+    -DBUILD_YAML_CONFIG=OFF
+    -DBUILD_EXAMPLES=OFF
+    -DBUILD_CTL=OFF
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+    -DBUILD_BROTLI=ON
+    -DCMAKE_PREFIX_PATH=${THIRD_PARTY_INSTALL_PATH}
+    # -DCMAKE_FIND_ROOT_PATH=${THIRD_PARTY_INSTALL_PATH} # force the static .a lookup
+    -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH}
+)
+
+ExternalProject_Add(
+  sentencepiece
+  GIT_REPOSITORY https://github.com/google/sentencepiece
+  GIT_TAG v0.2.0
+  CMAKE_ARGS
+    -DSPM_ENABLE_SHARED=OFF
+    -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH}
+)
+
+# Fix the trantor CMakeLists to link c-ares statically on Windows
+if(WIN32)
+  set(TRANTOR_CMAKE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/../build_deps/nitro_deps/drogon-prefix/src/drogon/trantor/CMakeLists.txt)
+  ExternalProject_Add_Step(drogon trantor_custom_target
+    COMMAND ${CMAKE_COMMAND} -E echo add_definitions(-DCARES_STATICLIB) >> ${TRANTOR_CMAKE_FILE}
+    DEPENDEES download
+  )
+endif()
+
+include_directories(${THIRD_PARTY_INSTALL_PATH}/include)
+link_directories(${THIRD_PARTY_INSTALL_PATH}/lib)
+
+# Make sure the bundled dependencies are built before Drogon consumes them
+add_dependencies(drogon c-ares jsoncpp brotli)
+
+if(ZLIB_USE_STATIC_LIBS)
+  add_dependencies(drogon zlib)
+endif()
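+
+# Standalone usage sketch, mirroring install_deps.sh (which uses make where
+# cmake --build is shown here); this super-build is configured on its own,
+# not as part of the main project, from the nitro/ directory:
+#
+#   cmake -S ./nitro_deps -B ./build_deps/nitro_deps
+#   cmake --build ./build_deps/nitro_deps
+#
+# Everything installs into build_deps/_install, which the Nitro
+# CMakeLists.txt puts on CMAKE_PREFIX_PATH.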
diff --git a/cpp/tensorrt_llm/nitro/utils/nitro_utils.h b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h
new file mode 100644
index 00000000000..5e382bd82fe
--- /dev/null
+++ b/cpp/tensorrt_llm/nitro/utils/nitro_utils.h
@@ -0,0 +1,287 @@
+#pragma once
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <drogon/HttpResponse.h>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <random>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <trantor/utils/Logger.h>
+#include <vector>
+// Include platform-specific headers
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <dirent.h>
+#endif
+
+namespace nitro_utils
+{
+
+inline std::string models_folder = "./models";
+
+inline std::string extractBase64(const std::string& input)
+{
+    std::regex pattern("base64,(.*)");
+    std::smatch match;
+
+    if (std::regex_search(input, match, pattern))
+    {
+        std::string base64_data = match[1];
+        base64_data = base64_data.substr(0, base64_data.length() - 1);
+        return base64_data;
+    }
+
+    return "";
+}
+
+// Helper function to encode data to Base64
+inline std::string base64Encode(const std::vector<unsigned char>& data)
+{
+    static const char encodingTable[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    std::string encodedData;
+    int i = 0;
+    int j = 0;
+    unsigned char array3[3];
+    unsigned char array4[4];
+
+    for (unsigned char c : data)
+    {
+        array3[i++] = c;
+        if (i == 3)
+        {
+            array4[0] = (array3[0] & 0xfc) >> 2;
+            array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4);
+            array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6);
+            array4[3] = array3[2] & 0x3f;
+
+            for (i = 0; i < 4; i++)
+                encodedData += encodingTable[array4[i]];
+            i = 0;
+        }
+    }
+
+    if (i)
+    {
+        for (j = i; j < 3; j++)
+            array3[j] = '\0';
+
+        array4[0] = (array3[0] & 0xfc) >> 2;
+        array4[1] = ((array3[0] & 0x03) << 4) + ((array3[1] & 0xf0) >> 4);
+        array4[2] = ((array3[1] & 0x0f) << 2) + ((array3[2] & 0xc0) >> 6);
+
+        for (j = 0; j < i + 1; j++)
+            encodedData += encodingTable[array4[j]];
+
+        while (i++ < 3)
+            encodedData += '=';
+    }
+
+    return encodedData;
+}
+
+// Function to load an image and convert it to Base64
+inline std::string imageToBase64(const std::string& imagePath)
+{
+    std::ifstream imageFile(imagePath, std::ios::binary);
+    if (!imageFile.is_open())
+    {
+        throw std::runtime_error("Could not open the image file.");
+    }
+
+    std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(imageFile), {});
+    return base64Encode(buffer);
+}
+
+// Helper function to generate a unique filename
+inline std::string generateUniqueFilename(const std::string& prefix, const std::string& extension)
+{
+    // Get current time as a timestamp
+    auto now = std::chrono::system_clock::now();
+    auto now_ms = std::chrono::time_point_cast<std::chrono::milliseconds>(now);
+    auto epoch = now_ms.time_since_epoch();
+    auto value = std::chrono::duration_cast<std::chrono::milliseconds>(epoch);
+
+    // Generate a random number
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> dis(1000, 9999);
+
+    std::stringstream ss;
+    ss << prefix << value.count() << "_" << dis(gen) << extension;
+    return ss.str();
+}
+
+inline void processLocalImage(const std::string& localPath, std::function<void(const std::string&)> callback)
+{
+    try
+    {
+        std::string base64Image = imageToBase64(localPath);
+        callback(base64Image); // Invoke the callback with the Base64 string
+    }
+    catch (const std::exception& e)
+    {
+        std::cerr << "Error during processing: " << e.what() << std::endl;
+    }
+}
+
+inline std::vector<std::string> listFilesInDir(const std::string& path)
+{
+    std::vector<std::string> files;
+
+#ifdef _WIN32
+    // Windows-specific code
+    WIN32_FIND_DATA findFileData;
+    HANDLE hFind = FindFirstFile((path + "\\*").c_str(), &findFileData);
+
+    if (hFind != INVALID_HANDLE_VALUE)
+    {
+        do
+        {
+            if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY))
+            {
+                files.push_back(findFileData.cFileName);
+            }
+        } while (FindNextFile(hFind, &findFileData) != 0);
+        FindClose(hFind);
+    }
+#else
+    // POSIX-specific code (Linux, Unix, macOS)
+    DIR* dir;
+    struct dirent* ent;
+
+    if ((dir = opendir(path.c_str())) != NULL)
+    {
+        while ((ent = readdir(dir)) != NULL)
+        {
+            if (ent->d_type == DT_REG)
+            { // Check if it's a regular file
+                files.push_back(ent->d_name);
+            }
+        }
+        closedir(dir);
+    }
+#endif
+
+    return files;
+}
+
+inline std::string rtrim(const std::string& str)
+{
+    size_t end = str.find_last_not_of("\n\t ");
+    return (end == std::string::npos) ? "" : str.substr(0, end + 1);
+}
+
+inline std::string generate_random_string(std::size_t length)
+{
+    const std::string characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+    std::random_device rd;
+    std::mt19937 generator(rd());
+
+    std::uniform_int_distribution<> distribution(0, static_cast<int>(characters.size()) - 1);
+
+    std::string random_string(length, '\0');
+    std::generate_n(random_string.begin(), length, [&]() { return characters[distribution(generator)]; });
+
+    return random_string;
+}
+
+inline void nitro_logo()
+{
+    std::string resetColor = "\033[0m";
+    std::string asciiArt = R"(
+███╗   ██╗██╗████████╗██████╗  ██████╗
+████╗  ██║██║╚══██╔══╝██╔══██╗██╔═══██╗
+██╔██╗ ██║██║   ██║   ██████╔╝██║   ██║
+██║╚██╗██║██║   ██║   ██╔══██╗██║   ██║
+██║ ╚████║██║   ██║   ██║  ██║╚██████╔╝
+╚═╝  ╚═══╝╚═╝   ╚═╝   ╚═╝  ╚═╝ ╚═════╝
+
+    )";
+
+    std::string asciiArtRTX = R"(
+------------------------
+ ____ ______ __ __ ________ __
+___/ __ \__ __/_ |/ / __ __ \__ | / /
+__/ /_/ /_/ / _\ / _/ / / /_ |/ /
+_/ _, _/_/ / _/ | / /_/ /_ /| /
+/_/ |_| /_/ /_/|_| \____/ /_/ |_/
+
+)";
+
+    for (char c : asciiArt)
+    {
+        if (c == '\n')
+        {
+            std::cout << resetColor << c;
+        }
+        else
+        {
+            std::cout << "\033[94m" << c; // blue
+        }
+    }
+
+    std::cout << resetColor; // Reset color at the end
+
+    for (char c : asciiArtRTX)
+    {
+        if (c == '\n')
+        {
+            std::cout << resetColor << c;
+        }
+        else
+        {
+            std::cout << "\033[1;32m" << c; // bright green
+        }
+    }
+
+    std::cout << resetColor; // Reset color at the end
+}
+
+inline drogon::HttpResponsePtr nitroHttpResponse()
+{
+    auto resp = drogon::HttpResponse::newHttpResponse();
+#ifdef ALLOW_ALL_CORS
+    LOG_INFO << "Respond for all CORS!";
+    resp->addHeader("Access-Control-Allow-Origin", "*");
+#endif
+    return resp;
+}
+
+inline drogon::HttpResponsePtr nitroHttpJsonResponse(const Json::Value& data)
+{
+    auto resp = drogon::HttpResponse::newHttpJsonResponse(data);
+#ifdef ALLOW_ALL_CORS
+    LOG_INFO << "Respond for all CORS!";
+    resp->addHeader("Access-Control-Allow-Origin", "*");
+#endif
+    return resp;
+}
+
+inline drogon::HttpResponsePtr nitroStreamResponse(
+    const std::function<std::size_t(char*, std::size_t)>& callback, const std::string& attachmentFileName = "")
+{
+    auto resp
+        = drogon::HttpResponse::newStreamResponse(callback, attachmentFileName, drogon::CT_NONE, "text/event-stream");
+#ifdef ALLOW_ALL_CORS
+    LOG_INFO << "Respond for all CORS!";
+    resp->addHeader("Access-Control-Allow-Origin", "*");
+#endif
+    return resp;
+}
+
+} // namespace nitro_utils