108 changes: 81 additions & 27 deletions cpp/tensorrt_llm/nitro/controllers/tensorrtllm.cc
@@ -26,11 +26,52 @@ void removeId(std::vector<int>& vec, int id)
struct inferenceState
{
int prevPos{0};
std::string prevText;
bool isFinished;
std::queue<std::string> textsToStream;
std::mutex queueMutex; // Mutex to protect access to textsToStream

+ size_t stopWordMatchLen = 0;
+ std::vector<std::string> sequence{"<", "|", "im", "_", "end", "|", ">"};
+
+ void reset()
+ {
+     stopWordMatchLen = 0;
+     prevText = "";
+ }
+
+ bool isComplete() const
+ {
+     return stopWordMatchLen >= sequence.size();
+ }
};

+ bool handleMatch(const std::string& rawText, std::shared_ptr<inferenceState> inferState)
+ {
+     if (inferState->isComplete())
+     {
+         return true;
+     }
+
+     if (rawText == inferState->sequence[inferState->stopWordMatchLen])
+     {
+         inferState->stopWordMatchLen++; // Move to next state
+         inferState->prevText = rawText;
+         return true;
+     }
+     else if (inferState->stopWordMatchLen > 0 && rawText == inferState->sequence[0])
+     {
+         inferState->stopWordMatchLen = 1; // Restart from first match if sequence breaks but matches start
+         inferState->prevText = rawText;
+         return true;
+     }
+     else
+     {
+         inferState->reset();
+         return false; // Reset to start if sequence breaks
+     }
+ }
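Reader's note: `handleMatch` is an incremental matcher. It advances one state per decoded fragment, and `isComplete()` only reports true once every piece of `<|im_end|>` has arrived in order. A minimal harness showing the intended consumption pattern (the fragment list and `main` are illustrative only and assume the definitions above are in scope):

```cpp
// Illustrative harness for the incremental stop-word matcher above.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

int main()
{
    auto inferState = std::make_shared<inferenceState>();
    // Fragments as a detokenizer might emit them, ending in ChatML's stop marker.
    std::vector<std::string> fragments{"Hello", ",", " world", "<", "|", "im", "_", "end", "|", ">"};
    for (std::string const& frag : fragments)
    {
        if (handleMatch(frag, inferState))
            continue; // Possible stop-word piece: hold it back instead of streaming it.
        std::cout << frag; // Not part of a stop-word match: safe to stream.
    }
    if (inferState->isComplete())
        std::cout << "\n[stop word matched]\n";
    return 0;
}
```

One caveat a reviewer might flag: fragments swallowed during a partial match that later breaks (say `<` followed by plain text) are dropped rather than replayed, since only the last fragment is kept in `prevText`.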

// Only support single token stopping point now
std::string create_return_json(const std::string& id, const std::string& model, const std::string& content,
Json::Value finish_reason = Json::Value())
@@ -67,6 +108,13 @@ GenerationInput::TensorPtr tensorrtllm::getTensorSingleStopWordList(int stopToken)
return gptSession->getBufferManager().copyFrom(stopWordsTokens, ITensor::makeShape({1, 2, 2}), MemoryType::kGPU);
}

+ GenerationInput::TensorPtr tensorrtllm::getTensorChatMLStopWordList()
+ {
+     std::vector<int32_t> stopWordsTokens = {28789, 28766, 321, 28730, 416, 28766, 28767, 32000, 6, 8, -1, -1, -1, -1,
+         -1, -1}; // Pad with -1 up to the fixed row length
+     return gptSession->getBufferManager().copyFrom(stopWordsTokens, ITensor::makeShape({1, 2, 8}), MemoryType::kGPU);
+ }
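For context on the magic numbers: in my reading of TensorRT-LLM's stop-words convention, the tensor has shape [batchSize, 2, maxStopWordsLen], where row 0 holds the token IDs of all stop sequences concatenated, row 1 holds each sequence's cumulative end offset, and -1 pads both rows. Under a cumulative-end reading, {7, 8} would describe a 7-token `<|im_end|>` plus the single token 32000; the committed {6, 8} instead splits the 8 tokens as 6 + 2, which seems worth double-checking against the tokenizer's actual encoding. A hedged sketch of a generic builder for that layout (the function name is mine, not part of this PR):

```cpp
// Sketch of a builder for the assumed [1, 2, maxLen] stop-words layout:
// row 0 = concatenated stop-sequence token IDs,
// row 1 = cumulative end offsets, both rows padded with -1.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int32_t> buildStopWordsFlat(std::vector<std::vector<int32_t>> const& stopSeqs)
{
    std::vector<int32_t> tokens;
    std::vector<int32_t> offsets;
    for (auto const& seq : stopSeqs)
    {
        tokens.insert(tokens.end(), seq.begin(), seq.end());
        offsets.push_back(static_cast<int32_t>(tokens.size())); // end offset of this sequence
    }
    std::size_t const maxLen = tokens.size();
    offsets.resize(maxLen, -1); // pad the offsets row to the tokens row's width
    tokens.insert(tokens.end(), offsets.begin(), offsets.end());
    return tokens; // to be copied into an ITensor of shape {1, 2, maxLen} on the GPU
}
```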

GenerationInput tensorrtllm::createGenerationInput(std::vector<int32_t> inputIdsHost)
{
int inputLen = inputIdsHost.size();
@@ -78,7 +126,7 @@ GenerationInput tensorrtllm::createGenerationInput(std::vector<int32_t> inputIdsHost)

GenerationInput generationInput{0, 0, inputIds, inputLengths, modelConfig->usePackedInput()};

- generationInput.stopWordsList = getTensorSingleStopWordList(32000);
+ generationInput.stopWordsList = getTensorChatMLStopWordList();
return generationInput;
}

Expand Down Expand Up @@ -117,35 +165,35 @@ void inferenceThread(std::shared_ptr<inferenceState> inferState, std::vector<int
generationOutput.onTokenGenerated = [&inferState, inputLen, outputLen, self, &generationOutput](
GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished)
{
- if (!finished)
+ // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens
+ int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape
+ // Copy output IDs from GPU to host for printing
+ std::vector<int32_t> outputIdsHost(outputLength);
+ self->gptSession->getBufferManager().copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU);
+ // Find the last non-zero value in the output IDs starting from the end of the input sequence
+ std::vector<int> outputIdsHostDecode(outputIdsHost.begin() + inputLen, outputIdsHost.end());
+ removeId(outputIdsHostDecode, 0);
+ std::string text = self->nitro_tokenizer->decode(outputIdsHostDecode);
+
+ if (inferState->prevPos > 0 && inferState->prevPos < text.size())
+ {
+     // Valid prevPos, proceed with slicing the string from prevPos to the end
+     std::string stringTok(text.begin() + inferState->prevPos, text.end());
+     std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
+     inferState->textsToStream.push(stringTok);
+ }
+ else if (inferState->prevPos >= text.size())
+ {
- // Assuming the shape of outputIds tensor is (1, 1, 160), where 160 is the number of tokens
- int outputLength = outputIds->getShape().d[2]; // Get the length of output IDs based on the tensor shape
- // Copy output IDs from GPU to host for printing
- std::vector<int32_t> outputIdsHost(outputLength);
- self->gptSession->getBufferManager().copy(*outputIds, outputIdsHost.data(), MemoryType::kCPU);
- // Find the last non-zero value in the output IDs starting from the end of the input sequence
- std::vector<int> outputIdsHostDecode(outputIdsHost.begin() + inputLen, outputIdsHost.end());
- removeId(outputIdsHostDecode, 0);
- removeId(outputIdsHostDecode, 32000);
- std::string text = self->nitro_tokenizer->decode(outputIdsHostDecode);
-
- if (inferState->prevPos > 0 && inferState->prevPos < text.size())
- {
-     // Valid prevPos, proceed with slicing the string from prevPos to the end
-     std::string stringTok(text.begin() + inferState->prevPos, text.end());
-     std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
-     inferState->textsToStream.push(stringTok);
- }
- else if (inferState->prevPos >= text.size())
- {
inferState->prevPos = text.size();
}
- inferState->prevPos = text.size();
- }
+ inferState->prevPos = text.size();
if (finished)
{

-     std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
-     inferState->textsToStream.push("[DONE]");
+     std::lock_guard<std::mutex> guard(inferState->queueMutex); // Protect access with a lock
+     inferState->textsToStream.push("[DONE]");
+     return;
  }
};
// The rest of the logic inside the `chat_completion` remains unchanged...
// After finishing the setup, call the inference logic
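The callback above streams deltas rather than tokens: each step re-decodes the full output and pushes only the text past `prevPos`, presumably so multi-byte characters are never split across chunks. A reduced model of that logic (standalone; `nextDelta` is an illustrative name), which also shows why the very first decoded chunk is skipped while `prevPos` is still 0:

```cpp
// Reduced model of the prevPos delta logic in onTokenGenerated.
#include <cstddef>
#include <string>

// Returns the newly decoded suffix. Mirrors the callback's rule that
// nothing is streamed until prevPos has been advanced at least once.
std::string nextDelta(std::string const& fullText, std::size_t& prevPos)
{
    std::string delta;
    if (prevPos > 0 && prevPos < fullText.size())
        delta = fullText.substr(prevPos);
    prevPos = fullText.size();
    return delta;
}
```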
@@ -243,6 +291,12 @@ void tensorrtllm::chat_completion(
{

std::string rawText = inferState->textsToStream.front();
+ inferState->textsToStream.pop();
+ if (handleMatch(rawText, inferState))
+ {
+     continue; // Swallow fragments that are part of a pending stop-word match
+ }
+
if (rawText == "[DONE]")
{
LOG_INFO << "End of result";
@@ -257,14 +311,14 @@
}
const std::string textToStream
= "data: " + create_return_json(nitro_utils::generate_random_string(20), "_", rawText) + "\n\n";
- inferState->textsToStream.pop();
lock.unlock(); // Unlock as soon as possible

// Ensure we do not exceed the buffer size. Truncate if necessary.
std::size_t bytesToWrite = std::min(nBuffSize, textToStream.size());

// Copy the text to the provided buffer
std::memcpy(pBuffer, textToStream.data(), bytesToWrite);
+ inferState->prevText = rawText;
return bytesToWrite; // Return the number of bytes written to the buffer
}
else
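On the consumer side, the stream callback in `chat_completion` drains `textsToStream` under `queueMutex`, filters stop-word fragments through `handleMatch`, and copies at most `nBuffSize` bytes into the response buffer. A reduced sketch of the queue-draining pattern (the helper name `popNext` is mine):

```cpp
// Reduced consumer pattern: take one fragment under the lock and
// release the lock before formatting or copying the data out.
#include <mutex>
#include <optional>
#include <queue>
#include <string>

std::optional<std::string> popNext(std::queue<std::string>& q, std::mutex& m)
{
    std::lock_guard<std::mutex> guard(m);
    if (q.empty())
        return std::nullopt;
    std::string frag = std::move(q.front());
    q.pop();
    return frag;
}
```

Moving the `pop()` ahead of `handleMatch` (as this PR does) ensures a fragment is consumed exactly once even when it is swallowed as a stop-word piece; previously the pop happened only later, after the JSON chunk was built.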
1 change: 1 addition & 0 deletions cpp/tensorrt_llm/nitro/controllers/tensorrtllm.h
@@ -100,6 +100,7 @@ class tensorrtllm : public drogon::HttpController<tensorrtllm>
GenerationInput createGenerationInput(std::vector<int32_t> inputIds);
GenerationOutput createGenerationOutput();
std::unique_ptr<Tokenizer> nitro_tokenizer;
+ GenerationInput::TensorPtr getTensorChatMLStopWordList();

private:
GptSession::Config sessionConfig{1, 1, 1};