diff --git a/.github/scripts/e2e-test-python-linux-and-mac.sh b/.github/scripts/e2e-test-python-linux-and-mac.sh
new file mode 100755
index 000000000..586e1024c
--- /dev/null
+++ b/.github/scripts/e2e-test-python-linux-and-mac.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+## Example run command
+# ./e2e-test-python-linux-and-mac.sh '../../examples/build/server' './e2e-test.py'
+
+# Check for required arguments
+if [[ $# -ne 2 ]]; then
+    echo "Usage: $0 <path_to_binary> <path_to_python_file>"
+    exit 1
+fi
+
+BINARY_PATH=$1
+PYTHON_FILE_EXECUTION_PATH=$2
+
+rm /tmp/python-file-execution-res.log /tmp/server.log
+
+# Random port to ensure it's not used
+min=10000
+max=11000
+range=$((max - min + 1))
+PORT=$((RANDOM % range + min))
+
+# Install numpy for Python
+export PYTHONHOME=$(pwd)/engines/cortex.python/python/
+export LD_LIBRARY_PATH="$PYTHONHOME:$LD_LIBRARY_PATH"
+export DYLD_FALLBACK_LIBRARY_PATH="$PYTHONHOME:$DYLD_FALLBACK_LIBRARY_PATH"
+echo "Set Python HOME to $PYTHONHOME"
+echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
+./engines/cortex.python/python/bin/python3 -m ensurepip
+./engines/cortex.python/python/bin/python3 -m pip install --upgrade pip
+./engines/cortex.python/python/bin/python3 -m pip install numpy --target=$PYTHONHOME/lib/python/site-packages/
+
+# Start the binary file
+"$BINARY_PATH" 1 127.0.0.1 $PORT >/tmp/server.log &
+
+pid=$!
+
+if ! ps -p $pid >/dev/null; then
+    echo "server failed to start. Logs:"
+    cat /tmp/server.log
+    exit 1
+fi
+
+# Wait for a few seconds to let the server start
+sleep 3
+
+# Run the curl commands
+response1=$(curl --connect-timeout 60 -o /tmp/python-file-execution-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/fine_tuning/job" \
+    --header 'Content-Type: application/json' \
+    --data '{
+        "file_execution_path": "'$PYTHON_FILE_EXECUTION_PATH'"
+    }')
+
+error_occurred=0
+
+# Verify the response
+if [[ "$response1" -ne 200 ]]; then
+    echo "The python file execution curl command failed with status code: $response1"
+    cat /tmp/python-file-execution-res.log
+    error_occurred=1
+fi
+
+# Verify the output of the Python file in output.txt
+OUTPUT_FILE="./output.txt"
+EXPECTED_OUTPUT="1 2 3"  # Replace with the expected content
+
+if [[ -f "$OUTPUT_FILE" ]]; then
+    actual_output=$(cat "$OUTPUT_FILE")
+    if [[ "$actual_output" != "$EXPECTED_OUTPUT" ]]; then
+        echo "The output of the Python file does not match the expected output."
+        echo "Expected: $EXPECTED_OUTPUT"
+        echo "Actual: $actual_output"
+        error_occurred=1
+    else
+        echo "The output of the Python file matches the expected output."
+    fi
+else
+    echo "Output file $OUTPUT_FILE does not exist."
+    error_occurred=1
+fi
+
+
+if [[ "$error_occurred" -eq 1 ]]; then
+    echo "Server test run failed!!!!!!!!!!!!!!!!!!!!!!"
+    echo "Server Error Logs:"
+    cat /tmp/server.log
+    kill $pid
+    echo "An error occurred while running the server."
+    exit 1
+fi
+
+echo "----------------------"
+echo "Log server:"
+cat /tmp/server.log
+
+echo "Server test run successfully!"
+
+# Kill the server process
+kill $pid
\ No newline at end of file
diff --git a/.github/scripts/e2e-test-python-windows.bat b/.github/scripts/e2e-test-python-windows.bat
new file mode 100755
index 000000000..2e673f58f
--- /dev/null
+++ b/.github/scripts/e2e-test-python-windows.bat
@@ -0,0 +1,119 @@
+@echo off
+
+setlocal enabledelayedexpansion
+
+set "TEMP=C:\Users\%UserName%\AppData\Local\Temp"
+
+rem Check for required arguments
+if "%~2"=="" (
+    echo Usage: %~0 ^<path_to_binary^> ^<path_to_python_file^>
+    exit /b 1
+)
+
+set "BINARY_PATH=%~1"
+set "PYTHON_FILE_EXECUTION_PATH=%~2"
+
+for %%i in ("%BINARY_PATH%") do set "BINARY_NAME=%%~nxi"
+
+echo BINARY_NAME=%BINARY_NAME%
+
+del %TEMP%\response1.log 2>nul
+del %TEMP%\server.log 2>nul
+
+set /a min=9999
+set /a max=11000
+set /a range=max-min+1
+set /a PORT=%min% + %RANDOM% %% %range%
+
+rem Install numpy for Python
+set "PYTHONHOME=%cd%\engines\cortex.python\python"
+echo Set Python HOME to %PYTHONHOME%
+%PYTHONHOME%\python.exe -m ensurepip
+%PYTHONHOME%\python.exe -m pip install --upgrade pip
+%PYTHONHOME%\python.exe -m pip install numpy --target=%PYTHONHOME%\Lib\site-packages\
+
+rem Start the binary file
+start "" /B "%BINARY_PATH%" 1 "127.0.0.1" %PORT% > "%TEMP%\server.log" 2>&1
+
+ping -n 3 127.0.0.1 > nul
+
+rem Capture the PID of the started process with "server" in its name
+for /f "tokens=2" %%a in ('tasklist /fi "imagename eq %BINARY_NAME%" /fo list ^| findstr /B "PID:"') do (
+    set "pid=%%a"
+)
+
+echo pid=%pid%
+
+if not defined pid (
+    echo server failed to start. Logs:
+    type %TEMP%\server.log
+    echo.
+    exit /b 1
+)
+
+rem Wait for a few seconds to let the server start
+
+rem Define JSON strings for curl data
+call set "PYTHON_FILE_EXECUTION_PATH_STRING=%%PYTHON_FILE_EXECUTION_PATH:\=\\%%"
+set "curl_data1={\"file_execution_path\":\"%PYTHON_FILE_EXECUTION_PATH_STRING%\"}"
+
+rem Print the values of curl_data for debugging
+echo curl_data1=%curl_data1%
+
+rem Run the curl commands and capture the status code
+curl.exe --connect-timeout 60 -o "%TEMP%\response1.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/fine_tuning/job" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1.log 2>&1
+
+set "error_occurred=0"
+
+rem Read the status code directly from the response file
+set "response1="
+for /f %%a in (%TEMP%\response1.log) do set "response1=%%a"
+
+if "%response1%" neq "200" (
+    echo The first curl command failed with status code: %response1%
+    type %TEMP%\response1.log
+    echo.
+    set "error_occurred=1"
+)
+
+echo ----------------------
+echo Log python file execution:
+type %TEMP%\response1.log
+echo.
+
+rem Verification step: Check the contents of output.txt
+set "expected_output=1 2 3"
+set "actual_output="
+if exist "output.txt" (
+    for /f "delims=" %%x in (output.txt) do set "actual_output=%%x"
+    if "!actual_output!"=="!expected_output!" (
+        echo Verification succeeded: output.txt contains the expected data.
+    ) else (
+        echo Verification failed: output.txt does not contain the expected data.
+        echo Expected: !expected_output!
+        echo Actual: !actual_output!
+        set "error_occurred=1"
+    )
+) else (
+    echo Verification failed: output.txt does not exist.
+    set "error_occurred=1"
+)
+
+echo ----------------------
+echo Server logs:
+type %TEMP%\server.log
+echo.
+
+if "%error_occurred%"=="1" (
+    echo Server test run failed!!!!!!!!!!!!!!!!!!!!!!
+    taskkill /f /pid %pid%
+    echo An error occurred while running the server.
+    exit /b 1
+)
+
+echo Server test run successfully!
+ +rem Kill the server process +taskkill /f /im server.exe 2>nul || exit /B 0 + +endlocal \ No newline at end of file diff --git a/.github/scripts/python-file-to-test.py b/.github/scripts/python-file-to-test.py new file mode 100644 index 000000000..e897baf42 --- /dev/null +++ b/.github/scripts/python-file-to-test.py @@ -0,0 +1,9 @@ +import sys; +for path in sys.path: + print(path) + +import numpy as np +print("Numpy version: " + np.__version__) + +with open('output.txt', 'w') as file: + file.write(' '.join(map(str, np.array([1, 2, 3])))) \ No newline at end of file diff --git a/.github/workflows/cortex-cpp-quality-gate.yml b/.github/workflows/cortex-cpp-quality-gate.yml index 33c8a4533..e84efa4e7 100644 --- a/.github/workflows/cortex-cpp-quality-gate.yml +++ b/.github/workflows/cortex-cpp-quality-gate.yml @@ -12,6 +12,7 @@ on: env: LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf + PYTHON_FILE_EXECUTION_PATH: "python-file-to-test.py" jobs: build-and-test: @@ -26,107 +27,126 @@ jobs: runs-on: "ubuntu-18-04" cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" run-e2e: true + run-python-e2e: true - os: "linux" name: "amd64-avx" runs-on: "ubuntu-18-04" cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" run-e2e: false + run-python-e2e: false - os: "linux" name: "amd64-avx512" runs-on: "ubuntu-18-04" cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" run-e2e: false + run-python-e2e: false - os: "linux" name: "amd64-vulkan" runs-on: "ubuntu-18-04-cuda-11-7" cmake-flags: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF" run-e2e: false + run-python-e2e: false - os: "linux" name: "amd64-cuda-11-7" runs-on: "ubuntu-18-04-cuda-11-7" cmake-flags: "-DCUDA_11_7=ON -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON" run-e2e: false + run-python-e2e: false - os: "linux" name: "amd64-cuda-12-0" runs-on: "ubuntu-18-04-cuda-12-0" cmake-flags: "-DCUDA_12_0=ON -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON" run-e2e: false + run-python-e2e: false - os: "mac" name: "amd64" runs-on: "macos-13" cmake-flags: "" run-e2e: true + run-python-e2e: true - os: "mac" name: "arm64" runs-on: "mac-silicon" cmake-flags: "-DMAC_ARM64=ON" run-e2e: true + run-python-e2e: true - os: "windows" name: "amd64-avx2" runs-on: "windows-latest" cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" run-e2e: true + run-python-e2e: true - os: "windows" name: "amd64-avx" runs-on: "windows-latest" cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" run-e2e: false + run-python-e2e: false - os: "windows" name: "amd64-avx512" runs-on: "windows-latest" cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" run-e2e: false + run-python-e2e: false - os: "windows" name: "amd64-vulkan" runs-on: "windows-latest" cmake-flags: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" run-e2e: false + run-python-e2e: false - os: "windows" name: "amd64-avx2-cuda-12-0" runs-on: "windows-cuda-12-0" cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" run-e2e: false + run-python-e2e: false - os: "windows" name: "amd64-avx-cuda-12-0" runs-on: "windows-cuda-12-0" 
cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" run-e2e: false + run-python-e2e: false - os: "windows" name: "amd64-avx512-cuda-12-0" runs-on: "windows-cuda-12-0" cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" run-e2e: false + run-python-e2e: false - os: "windows" name: "amd64-avx2-cuda-11-7" runs-on: "windows-cuda-11-7" cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" run-e2e: false + run-python-e2e: false - os: "windows" name: "amd64-avx-cuda-11-7" runs-on: "windows-cuda-11-7" cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" run-e2e: false + run-python-e2e: false + - os: "windows" name: "amd64-avx512-cuda-11-7" runs-on: "windows-cuda-11-7" cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" run-e2e: false + run-python-e2e: false steps: - name: Clone @@ -161,6 +181,12 @@ jobs: cd cortex-cpp make run-e2e-test RUN_TESTS=true LLM_MODEL_URL=${{ env.LLM_MODEL_URL }} EMBEDDING_MODEL_URL=${{ env.EMBEDDING_MODEL_URL }} + - name: Run python e2e testing + if: ${{ matrix.run-python-e2e }} + run: | + cd cortex-cpp + make run-python-e2e-test RUN_TESTS=true PYTHON_FILE_EXECUTION_PATH=${{ env.PYTHON_FILE_EXECUTION_PATH }} + - name: Upload Artifact uses: actions/upload-artifact@v2 with: diff --git a/cortex-cpp/CMakeLists.txt b/cortex-cpp/CMakeLists.txt index 8c01d2256..4e2092afe 100644 --- a/cortex-cpp/CMakeLists.txt +++ b/cortex-cpp/CMakeLists.txt @@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.5) project(cortex-cpp C CXX) include(engines/cortex.llamacpp/engine.cmake) +if(NOT LLAMA_CUDA AND (LLAMA_AVX2 OR APPLE)) + include(engines/cortex.python/engine.cmake) +endif() include(CheckIncludeFileCXX) check_include_file_cxx(any HAS_ANY) diff --git a/cortex-cpp/Makefile b/cortex-cpp/Makefile index 98486f023..20cd4da7f 100644 --- a/cortex-cpp/Makefile +++ b/cortex-cpp/Makefile @@ -6,6 +6,7 @@ CMAKE_EXTRA_FLAGS ?= "" RUN_TESTS ?= false LLM_MODEL_URL ?= "https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf" EMBEDDING_MODEL_URL ?= "https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf" +PYTHON_FILE_EXECUTION_PATH ?= "python-file-to-test.py" CODE_SIGN ?= false AZURE_KEY_VAULT_URI ?= xxxx AZURE_CLIENT_ID ?= xxxx @@ -97,6 +98,30 @@ else rm -rf uploads/; endif +run-python-e2e-test: +ifeq ($(RUN_TESTS),false) + @echo "Skipping tests" + @exit 0 +endif +ifeq ($(OS),Windows_NT) + @powershell -Command "cp -r build\engines\cortex.python cortex-cpp\engines\;" + @powershell -Command "cd cortex-cpp;..\..\.github\scripts\e2e-test-python-windows.bat cortex-cpp.exe ..\..\.github\scripts\$(PYTHON_FILE_EXECUTION_PATH);" + @powershell -Command "cd .\cortex-cpp\; rm *.txt;" + @powershell -Command "mkdir python-temp; cd .\cortex-cpp\engines; Move-Item -Path .\cortex.python -Destination ..\..\python-temp;" +else ifeq ($(shell uname -s),Linux) + @cp -rf build/engines/cortex.python cortex-cpp/engines/; \ + cd cortex-cpp; \ + chmod +x ../../.github/scripts/e2e-test-python-linux-and-mac.sh && ../../.github/scripts/e2e-test-python-linux-and-mac.sh ./cortex-cpp 
../../.github/scripts/$(PYTHON_FILE_EXECUTION_PATH); \ + rm -rf uploads/; \ + rm -rf ./engines/cortex.python; +else + @cp -rf build/engines/cortex.python cortex-cpp/engines/; \ + cd cortex-cpp; \ + chmod +x ../../.github/scripts/e2e-test-python-linux-and-mac.sh && ../../.github/scripts/e2e-test-python-linux-and-mac.sh ./cortex-cpp ../../.github/scripts/$(PYTHON_FILE_EXECUTION_PATH); \ + rm -rf uploads/; \ + rm -rf ./engines/cortex.python; +endif + clean: ifeq ($(OS),Windows_NT) @powershell -Command "rm -rf build; rm -rf build-deps; rm -rf cortex-cpp; rm -rf cortex-cpp.tar.gz;" diff --git a/cortex-cpp/common/base.h b/cortex-cpp/common/base.h index c6e3aa6a9..382f21b80 100644 --- a/cortex-cpp/common/base.h +++ b/cortex-cpp/common/base.h @@ -8,16 +8,23 @@ class BaseModel { virtual ~BaseModel() {} // Model management - virtual void LoadModel(const HttpRequestPtr& req, - std::function&& callback) = 0; + virtual void LoadModel( + const HttpRequestPtr& req, + std::function&& callback) = 0; virtual void UnloadModel( const HttpRequestPtr& req, std::function&& callback) = 0; virtual void ModelStatus( const HttpRequestPtr& req, std::function&& callback) = 0; - - virtual void GetModels(const HttpRequestPtr& req, + virtual void GetModels( + const HttpRequestPtr& req, + std::function&& callback) = 0; + virtual void GetEngines( + const HttpRequestPtr& req, + std::function&& callback) = 0; + virtual void FineTuning( + const HttpRequestPtr& req, std::function&& callback) = 0; }; diff --git a/cortex-cpp/controllers/server.cc b/cortex-cpp/controllers/server.cc index 1fdb74509..225775d3d 100644 --- a/cortex-cpp/controllers/server.cc +++ b/cortex-cpp/controllers/server.cc @@ -13,24 +13,25 @@ using json = nlohmann::json; namespace inferences { namespace { constexpr static auto kLlamaEngine = "cortex.llamacpp"; -constexpr static auto kLlamaLibPath = "/engines/cortex.llamacpp"; +constexpr static auto kPythonRuntimeEngine = "cortex.python"; } // namespace -server::server() - : engine_{nullptr} { +server::server(){ - // Some default values for now below - // log_disable(); // Disable the log to file feature, reduce bloat for - // target - // system () - }; + // Some default values for now below + // log_disable(); // Disable the log to file feature, reduce bloat for + // target + // system () +}; server::~server() {} void server::ChatCompletion( const HttpRequestPtr& req, std::function&& callback) { - if (!IsEngineLoaded()) { + auto engine_type = + (*(req->getJsonObject())).get("engine", kLlamaEngine).asString(); + if (!IsEngineLoaded(engine_type)) { Json::Value res; res["message"] = "Engine is not loaded yet"; auto resp = cortex_utils::nitroHttpJsonResponse(res); @@ -44,10 +45,11 @@ void server::ChatCompletion( auto json_body = req->getJsonObject(); bool is_stream = (*json_body).get("stream", false).asBool(); auto q = std::make_shared(); - engine_->HandleChatCompletion(json_body, - [q](Json::Value status, Json::Value res) { - q->push(std::make_pair(status, res)); - }); + std::get(engines_[engine_type].engine) + ->HandleChatCompletion(json_body, + [q](Json::Value status, Json::Value res) { + q->push(std::make_pair(status, res)); + }); LOG_TRACE << "Wait to chat completion responses"; if (is_stream) { ProcessStreamRes(std::move(callback), q); @@ -59,7 +61,9 @@ void server::ChatCompletion( void server::Embedding(const HttpRequestPtr& req, std::function&& callback) { - if (!IsEngineLoaded()) { + auto engine_type = + (*(req->getJsonObject())).get("engine", kLlamaEngine).asString(); + if (!IsEngineLoaded(engine_type)) { 
Json::Value res; res["message"] = "Engine is not loaded yet"; auto resp = cortex_utils::nitroHttpJsonResponse(res); @@ -71,10 +75,11 @@ void server::Embedding(const HttpRequestPtr& req, LOG_TRACE << "Start embedding"; SyncQueue q; - engine_->HandleEmbedding(req->getJsonObject(), - [&q](Json::Value status, Json::Value res) { - q.push(std::make_pair(status, res)); - }); + std::get(engines_[engine_type].engine) + ->HandleEmbedding(req->getJsonObject(), + [&q](Json::Value status, Json::Value res) { + q.push(std::make_pair(status, res)); + }); LOG_TRACE << "Wait to embedding"; ProcessNonStreamRes(std::move(callback), q); LOG_TRACE << "Done embedding"; @@ -83,7 +88,9 @@ void server::Embedding(const HttpRequestPtr& req, void server::UnloadModel( const HttpRequestPtr& req, std::function&& callback) { - if (!IsEngineLoaded()) { + auto engine_type = + (*(req->getJsonObject())).get("engine", kLlamaEngine).asString(); + if (!IsEngineLoaded(engine_type)) { Json::Value res; res["message"] = "Engine is not loaded yet"; auto resp = cortex_utils::nitroHttpJsonResponse(res); @@ -93,21 +100,24 @@ void server::UnloadModel( return; } LOG_TRACE << "Start unload model"; - engine_->UnloadModel( - req->getJsonObject(), - [cb = std::move(callback)](Json::Value status, Json::Value res) { - auto resp = cortex_utils::nitroHttpJsonResponse(res); - resp->setStatusCode( - static_cast(status["status_code"].asInt())); - cb(resp); - }); + std::get(engines_[engine_type].engine) + ->UnloadModel( + req->getJsonObject(), + [cb = std::move(callback)](Json::Value status, Json::Value res) { + auto resp = cortex_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(static_cast( + status["status_code"].asInt())); + cb(resp); + }); LOG_TRACE << "Done unload model"; } void server::ModelStatus( const HttpRequestPtr& req, std::function&& callback) { - if (!IsEngineLoaded()) { + auto engine_type = + (*(req->getJsonObject())).get("engine", kLlamaEngine).asString(); + if (!IsEngineLoaded(engine_type)) { Json::Value res; res["message"] = "Engine is not loaded yet"; auto resp = cortex_utils::nitroHttpJsonResponse(res); @@ -118,20 +128,23 @@ void server::ModelStatus( } LOG_TRACE << "Start to get model status"; - engine_->GetModelStatus( - req->getJsonObject(), - [cb = std::move(callback)](Json::Value status, Json::Value res) { - auto resp = cortex_utils::nitroHttpJsonResponse(res); - resp->setStatusCode( - static_cast(status["status_code"].asInt())); - cb(resp); - }); + std::get(engines_[engine_type].engine) + ->GetModelStatus( + req->getJsonObject(), + [cb = std::move(callback)](Json::Value status, Json::Value res) { + auto resp = cortex_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(static_cast( + status["status_code"].asInt())); + cb(resp); + }); LOG_TRACE << "Done get model status"; } void server::GetModels(const HttpRequestPtr& req, std::function&& callback) { - if (!IsEngineLoaded()) { + auto engine_type = + (*(req->getJsonObject())).get("engine", kLlamaEngine).asString(); + if (!IsEngineLoaded(engine_type)) { Json::Value res; res["message"] = "Engine is not loaded yet"; auto resp = cortex_utils::nitroHttpJsonResponse(res); @@ -142,8 +155,9 @@ void server::GetModels(const HttpRequestPtr& req, } LOG_TRACE << "Start to get models"; - if (engine_->IsSupported("GetModels")) { - engine_->GetModels( + auto& en = std::get(engines_[engine_type].engine); + if (en->IsSupported("GetModels")) { + en->GetModels( req->getJsonObject(), [cb = std::move(callback)](Json::Value status, Json::Value res) { auto resp = 
cortex_utils::nitroHttpJsonResponse(res); @@ -163,52 +177,127 @@ void server::GetModels(const HttpRequestPtr& req, LOG_TRACE << "Done get models"; } +void server::GetEngines( + const HttpRequestPtr& req, + std::function&& callback) { + Json::Value res; + Json::Value engine_array(Json::arrayValue); + for (const auto& [s, _] : engines_) { + Json::Value val; + val["id"] = s; + val["object"] = "engine"; + engine_array.append(val); + } + + res["object"] = "list"; + res["data"] = engine_array; + + auto resp = cortex_utils::nitroHttpJsonResponse(res); + callback(resp); +} + +void server::FineTuning( + const HttpRequestPtr& req, + std::function&& callback) { + auto engine_type = + (*(req->getJsonObject())).get("engine", kPythonRuntimeEngine).asString(); + + if (engines_.find(engine_type) == engines_.end()) { + try { + std::string abs_path = + cortex_utils::GetCurrentPath() + cortex_utils::kPythonRuntimeLibPath; + engines_[engine_type].dl = + std::make_unique(abs_path, "engine"); + } catch (const cortex_cpp::dylib::load_error& e) { + + LOG_ERROR << "Could not load engine: " << e.what(); + engines_.erase(engine_type); + + Json::Value res; + res["message"] = "Could not load engine " + engine_type; + auto resp = cortex_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k500InternalServerError); + callback(resp); + return; + } + + auto func = engines_[engine_type].dl->get_function( + "get_engine"); + engines_[engine_type].engine = func(); + LOG_INFO << "Loaded engine: " << engine_type; + } + + LOG_TRACE << "Start to fine-tuning"; + auto& en = std::get(engines_[engine_type].engine); + if (en->IsSupported("HandlePythonFileExecutionRequest")) { + en->HandlePythonFileExecutionRequest( + req->getJsonObject(), + [cb = std::move(callback)](Json::Value status, Json::Value res) { + auto resp = cortex_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(static_cast( + status["status_code"].asInt())); + cb(resp); + }); + } else { + Json::Value res; + res["message"] = "Method is not supported yet"; + auto resp = cortex_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k500InternalServerError); + callback(resp); + LOG_WARN << "Method is not supported yet"; + } + LOG_TRACE << "Done fine-tuning"; +} + void server::LoadModel(const HttpRequestPtr& req, std::function&& callback) { auto engine_type = (*(req->getJsonObject())).get("engine", kLlamaEngine).asString(); - if (!dylib_ || engine_type != cur_engine_name_) { - cur_engine_name_ = engine_type; - // TODO: change this when we get more engines + + // We have not loaded engine yet, should load it before using it + if (engines_.find(engine_type) == engines_.end()) { + // TODO(sang) we cannot run cortex.llamacpp and cortex.tensorrt-llm at the same time. + // So need an unload engine machanism to handle. 
auto get_engine_path = [](std::string_view e) { if (e == kLlamaEngine) { - return kLlamaLibPath; + return cortex_utils::kLlamaLibPath; } - return kLlamaLibPath; + return cortex_utils::kLlamaLibPath; }; try { std::string abs_path = - cortex_utils::GetCurrentPath() + get_engine_path(cur_engine_name_); - dylib_ = std::make_unique(abs_path, "engine"); + cortex_utils::GetCurrentPath() + get_engine_path(engine_type); + engines_[engine_type].dl = + std::make_unique(abs_path, "engine"); + } catch (const cortex_cpp::dylib::load_error& e) { LOG_ERROR << "Could not load engine: " << e.what(); - dylib_.reset(); - engine_ = nullptr; - } + engines_.erase(engine_type); - if (!dylib_) { Json::Value res; - res["message"] = "Could not load engine " + cur_engine_name_; + res["message"] = "Could not load engine " + engine_type; auto resp = cortex_utils::nitroHttpJsonResponse(res); resp->setStatusCode(k500InternalServerError); callback(resp); return; } - auto func = dylib_->get_function("get_engine"); - engine_ = func(); - LOG_INFO << "Loaded engine: " << cur_engine_name_; + + auto func = + engines_[engine_type].dl->get_function("get_engine"); + engines_[engine_type].engine = func(); + LOG_INFO << "Loaded engine: " << engine_type; } LOG_TRACE << "Load model"; - engine_->LoadModel( - req->getJsonObject(), - [cb = std::move(callback)](Json::Value status, Json::Value res) { - auto resp = cortex_utils::nitroHttpJsonResponse(res); - resp->setStatusCode( - static_cast(status["status_code"].asInt())); - cb(resp); - }); + auto& en = std::get(engines_[engine_type].engine); + en->LoadModel(req->getJsonObject(), [cb = std::move(callback)]( + Json::Value status, Json::Value res) { + auto resp = cortex_utils::nitroHttpJsonResponse(res); + resp->setStatusCode( + static_cast(status["status_code"].asInt())); + cb(resp); + }); LOG_TRACE << "Done load model"; } @@ -255,8 +344,8 @@ void server::ProcessNonStreamRes(std::function cb, cb(resp); } -bool server::IsEngineLoaded() { - return !!engine_; +bool server::IsEngineLoaded(const std::string& e) { + return engines_.find(e) != engines_.end(); } } // namespace inferences \ No newline at end of file diff --git a/cortex-cpp/controllers/server.h b/cortex-cpp/controllers/server.h index c6d67ee30..3f214a9f0 100644 --- a/cortex-cpp/controllers/server.h +++ b/cortex-cpp/controllers/server.h @@ -14,9 +14,11 @@ #include #include #include +#include #include "common/base.h" #include "cortex-common/EngineI.h" +#include "cortex-common/cortexpythoni.h" #include "trantor/utils/SerialTaskQueue.h" #include "utils/dylib.h" #include "utils/json.hpp" @@ -31,9 +33,9 @@ using namespace drogon; namespace inferences { class server : public drogon::HttpController, - public BaseModel, - public BaseChatCompletion, - public BaseEmbedding { + public BaseModel, + public BaseChatCompletion, + public BaseEmbedding { struct SyncQueue; public: @@ -47,11 +49,16 @@ class server : public drogon::HttpController, METHOD_ADD(server::UnloadModel, "unloadmodel", Post); METHOD_ADD(server::ModelStatus, "modelstatus", Post); METHOD_ADD(server::GetModels, "models", Get); - + METHOD_ADD(server::GetEngines, "engines", Get); + + // cortex.python API + METHOD_ADD(server::FineTuning, "finetuning", Post); // Openai compatible path ADD_METHOD_TO(server::ChatCompletion, "/v1/chat/completions", Post); ADD_METHOD_TO(server::GetModels, "/v1/models", Get); + ADD_METHOD_TO(server::FineTuning, "/v1/fine_tuning/job", Post); + // ADD_METHOD_TO(server::handlePrelight, "/v1/chat/completions", Options); // NOTE: prelight will be added back when 
browser support is properly planned @@ -78,13 +85,19 @@ class server : public drogon::HttpController, void GetModels( const HttpRequestPtr& req, std::function&& callback) override; + void GetEngines( + const HttpRequestPtr& req, + std::function&& callback) override; + void FineTuning( + const HttpRequestPtr& req, + std::function&& callback) override; private: void ProcessStreamRes(std::function cb, std::shared_ptr q); void ProcessNonStreamRes(std::function cb, SyncQueue& q); - bool IsEngineLoaded(); + bool IsEngineLoaded(const std::string& e); private: struct SyncQueue { @@ -126,8 +139,11 @@ class server : public drogon::HttpController, }; private: - std::unique_ptr dylib_; - EngineI* engine_; - std::string cur_engine_name_; + using EngineV = std::variant; + struct EngineInfo { + std::unique_ptr dl; + EngineV engine; + }; + std::unordered_map engines_; }; }; // namespace inferences \ No newline at end of file diff --git a/cortex-cpp/cortex-common/EngineI.h b/cortex-cpp/cortex-common/EngineI.h index 4246c8ade..c5dcc8afe 100644 --- a/cortex-cpp/cortex-common/EngineI.h +++ b/cortex-cpp/cortex-common/EngineI.h @@ -10,20 +10,21 @@ class EngineI { public: virtual ~EngineI() {} + // cortex.llamacpp interface virtual void HandleChatCompletion( - std::shared_ptr jsonBody, + std::shared_ptr json_body, std::function&& callback) = 0; virtual void HandleEmbedding( - std::shared_ptr jsonBody, + std::shared_ptr json_body, std::function&& callback) = 0; virtual void LoadModel( - std::shared_ptr jsonBody, + std::shared_ptr json_body, std::function&& callback) = 0; virtual void UnloadModel( - std::shared_ptr jsonBody, + std::shared_ptr json_body, std::function&& callback) = 0; virtual void GetModelStatus( - std::shared_ptr jsonBody, + std::shared_ptr json_body, std::function&& callback) = 0; // For backward compatible checking diff --git a/cortex-cpp/cortex-common/cortexpythoni.h b/cortex-cpp/cortex-common/cortexpythoni.h new file mode 100644 index 000000000..06a79838f --- /dev/null +++ b/cortex-cpp/cortex-common/cortexpythoni.h @@ -0,0 +1,22 @@ +#pragma once + +#include +#include + +#include "json/value.h" + +class CortexPythonEngineI { + public: + virtual ~CortexPythonEngineI() {} + + virtual bool IsSupported(const std::string& f) = 0; + + virtual void ExecutePythonFile(std::string binary_execute_path, + std::string file_execution_path, + std::string python_library_path) = 0; + + virtual void HandlePythonFileExecutionRequest( + std::shared_ptr json_body, + std::function&& callback) = 0; +}; + diff --git a/cortex-cpp/cortex-cpp-deps/CMakeLists.txt b/cortex-cpp/cortex-cpp-deps/CMakeLists.txt index 4e080a026..d6feb5991 100644 --- a/cortex-cpp/cortex-cpp-deps/CMakeLists.txt +++ b/cortex-cpp/cortex-cpp-deps/CMakeLists.txt @@ -66,7 +66,7 @@ ExternalProject_Add( ExternalProject_Add( drogon GIT_REPOSITORY https://github.com/drogonframework/drogon - GIT_TAG v1.9.2 + GIT_TAG v1.9.4 CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DOPENSSL_USE_STATIC_LIBS=TRUE diff --git a/cortex-cpp/engines/cortex.python/engine.cmake b/cortex-cpp/engines/cortex.python/engine.cmake new file mode 100644 index 000000000..fa6705fde --- /dev/null +++ b/cortex-cpp/engines/cortex.python/engine.cmake @@ -0,0 +1,38 @@ +# cortex.python release version +set(VERSION 0.1.5) +set(ENGINE_VERSION v${VERSION}) +set(ENGINE_NAME cortex.python) + +# MESSAGE("ENGINE_VERSION=" ${ENGINE_VERSION}) + +# Download library based on instructions +if(UNIX AND NOT APPLE) + set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-linux-amd64.tar.gz) +elseif(UNIX) + 
if(MAC_ARM64) + set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-mac-arm64.tar.gz) + else() + set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-mac-amd64.tar.gz) + endif() +else() + set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-windows-amd64.tar.gz) +endif() + + +set(LIBPYTHONRUNTIME_ENGINE_URL https://github.com/janhq/cortex.python/releases/download/${ENGINE_VERSION}/${LIBRARY_NAME}) +MESSAGE("LIBPYTHONRUNTIME_ENGINE_URL=" ${LIBPYTHONRUNTIME_ENGINE_URL}) +MESSAGE("LIBARRY_NAME=" ${LIBRARY_NAME}) +set(LIBPYTHONRUNTIME_ENGINE_PATH ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME}) + +# MESSAGE("CMAKE_BINARY_DIR = " ${CMAKE_BINARY_DIR}) + +file(DOWNLOAD ${LIBPYTHONRUNTIME_ENGINE_URL} ${LIBPYTHONRUNTIME_ENGINE_PATH} STATUS LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS) +list(GET LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS 0 LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS_NO) +# MESSAGE("file = " ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME}) + +if(LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS_NO) + message(STATUS "Pre-built library not downloaded. (${LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS})") +else() + message(STATUS "Linking downloaded pre-built library.") + file(ARCHIVE_EXTRACT INPUT ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME} DESTINATION ${CMAKE_BINARY_DIR}/engines/) +endif() \ No newline at end of file diff --git a/cortex-cpp/main.cc b/cortex-cpp/main.cc index 12cabeb0c..04190d92d 100644 --- a/cortex-cpp/main.cc +++ b/cortex-cpp/main.cc @@ -2,7 +2,9 @@ #include #include // for PATH_MAX #include +#include "cortex-common/cortexpythoni.h" #include "utils/cortex_utils.h" +#include "utils/dylib.h" #if defined(__APPLE__) && defined(__MACH__) #include // for dirname() @@ -18,6 +20,27 @@ #endif int main(int argc, char* argv[]) { + // Check if this process is for python execution + if (argc > 1) { + if (strcmp(argv[1], "--run_python_file") == 0) { + std::string py_home_path = (argc > 3) ? argv[3] : ""; + std::unique_ptr dl; + try { + std::string abs_path = cortex_utils::GetCurrentPath() + + cortex_utils::kPythonRuntimeLibPath; + dl = std::make_unique(abs_path, "engine"); + } catch (const cortex_cpp::dylib::load_error& e) { + LOG_ERROR << "Could not load engine: " << e.what(); + return 1; + } + + auto func = dl->get_function("get_engine"); + auto e = func(); + e->ExecutePythonFile(argv[0], argv[2], py_home_path); + return 0; + } + } + int thread_num = 1; std::string host = "127.0.0.1"; int port = 3928; diff --git a/cortex-cpp/utils/cortex_utils.h b/cortex-cpp/utils/cortex_utils.h index 3c4fdd2dd..c0670a431 100644 --- a/cortex-cpp/utils/cortex_utils.h +++ b/cortex-cpp/utils/cortex_utils.h @@ -25,6 +25,8 @@ #endif namespace cortex_utils { +constexpr static auto kLlamaLibPath = "/engines/cortex.llamacpp"; +constexpr static auto kPythonRuntimeLibPath = "/engines/cortex.python"; inline std::string models_folder = "./models";
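
Taken together, the e2e scripts and the new --run_python_file branch in main.cc give two ways to run a Python file through cortex.python. The commands below are an illustrative sketch only: the port, script path, and PYTHONHOME location are placeholders, and the binary is assumed to sit next to its engines/ directory as the Makefile targets arrange.

# 1. Direct execution, mirroring the argv handling added in main.cc:
#    argv[1] = --run_python_file, argv[2] = script, argv[3] = optional Python home.
./cortex-cpp --run_python_file ./python-file-to-test.py ./engines/cortex.python/python

# 2. Through the HTTP API, as the e2e scripts do; the "engine" field defaults
#    to cortex.python in the FineTuning handler, so it can be omitted here.
./cortex-cpp 1 127.0.0.1 3928 &
curl -s --location "http://127.0.0.1:3928/v1/fine_tuning/job" \
  --header 'Content-Type: application/json' \
  --data '{"file_execution_path": "./python-file-to-test.py"}'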
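On the server side, the single dylib_/engine_ pair is replaced by a map from engine name to an EngineInfo that holds the loaded dylib and a variant of engine-interface pointers; each handler looks the engine up by the request's "engine" field and dispatches with std::get. A minimal, self-contained sketch of that pattern follows; the engine structs, their methods, and main() are stand-ins, and the real code obtains the pointers from each dylib's get_engine symbol and passes drogon request/response callbacks.

// Simplified sketch of the engines_ registry introduced in server.h/server.cc (C++17).
#include <iostream>
#include <string>
#include <unordered_map>
#include <variant>

struct EngineI {                      // stand-in for the llama.cpp inference interface
  void HandleChatCompletion() { std::cout << "chat completion\n"; }
};
struct CortexPythonEngineI {          // stand-in for the Python runtime interface
  void HandlePythonFileExecutionRequest() { std::cout << "run python file\n"; }
};

using EngineV = std::variant<CortexPythonEngineI*, EngineI*>;

struct EngineInfo {
  // The real EngineInfo also owns the cortex_cpp::dylib the engine was loaded from.
  EngineV engine;
};

std::unordered_map<std::string, EngineInfo> engines_;

bool IsEngineLoaded(const std::string& e) {
  return engines_.find(e) != engines_.end();
}

int main() {
  static EngineI llama;               // the real code gets these pointers from
  static CortexPythonEngineI python;  // each dylib's exported get_engine()
  engines_["cortex.llamacpp"].engine = &llama;
  engines_["cortex.python"].engine = &python;

  // Dispatch the way ChatCompletion and FineTuning do in the diff:
  std::get<EngineI*>(engines_["cortex.llamacpp"].engine)->HandleChatCompletion();
  std::get<CortexPythonEngineI*>(engines_["cortex.python"].engine)
      ->HandlePythonFileExecutionRequest();

  // Not loaded in this sketch, so the lookup-based check reports false.
  std::cout << std::boolalpha << IsEngineLoaded("cortex.tensorrt-llm") << "\n";
}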