
Conversation


@louis-jan louis-jan commented Feb 29, 2024

Description

Move struct definitions out of the controller - llamacpp

#454

#pragma once

#include <string>

#if defined(_WIN32)
#define NOMINMAX
#endif

#define LOG_TARGET stdout

#include <drogon/HttpController.h>

#include "stb_image.h"
#include "context/llama_server_context.h"

#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
#define CPPHTTPLIB_NO_EXCEPTIONS 1
#endif

#include <trantor/utils/ConcurrentTaskQueue.h>
#include "common/base.h"
#include "utils/json.hpp"

// auto generated files (update with ./deps.sh)

#include <cstddef>
#include <thread>

#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif


using json = nlohmann::json;

using namespace drogon;

namespace inferences {
class llamaCPP : public drogon::HttpController<llamaCPP>,
                 public BaseModel,
                 public BaseChatCompletion,
                 public BaseEmbedding {
 public:
  llamaCPP();
  ~llamaCPP();
  METHOD_LIST_BEGIN
  // list path definitions here;
  METHOD_ADD(llamaCPP::ChatCompletion, "chat_completion", Post);
  METHOD_ADD(llamaCPP::Embedding, "embedding", Post);
  METHOD_ADD(llamaCPP::LoadModel, "loadmodel", Post);
  METHOD_ADD(llamaCPP::UnloadModel, "unloadmodel", Get);
  METHOD_ADD(llamaCPP::ModelStatus, "modelstatus", Get);

  // OpenAI-compatible path
  ADD_METHOD_TO(llamaCPP::ChatCompletion, "/v1/chat/completions", Post);
  // ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/chat/completions", Options);
  // NOTE: preflight handling will be added back when browser support is properly planned

  ADD_METHOD_TO(llamaCPP::Embedding, "/v1/embeddings", Post);
  // ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/embeddings", Options);

  // PATH_ADD("/llama/chat_completion", Post);
  METHOD_LIST_END
  void ChatCompletion(
      const HttpRequestPtr& req,
      std::function<void(const HttpResponsePtr&)>&& callback) override;
  void Embedding(
      const HttpRequestPtr& req,
      std::function<void(const HttpResponsePtr&)>&& callback) override;
  void LoadModel(
      const HttpRequestPtr& req,
      std::function<void(const HttpResponsePtr&)>&& callback) override;
  void UnloadModel(
      const HttpRequestPtr& req,
      std::function<void(const HttpResponsePtr&)>&& callback) override;
  void ModelStatus(
      const HttpRequestPtr& req,
      std::function<void(const HttpResponsePtr&)>&& callback) override;

 private:
  llama_server_context llama;
  // std::atomic<bool> model_loaded = false;
  size_t sent_count = 0;
  size_t sent_token_probs_index = 0;
  std::thread backgroundThread;
  std::string user_prompt;
  std::string ai_prompt;
  std::string system_prompt;
  std::string pre_prompt;
  int repeat_last_n;
  bool caching_enabled;
  std::atomic<int> no_of_chats = 0;
  int clean_cache_threshold;
  std::string grammar_file_content;

  /**
   * Queue to handle the inference tasks
   */
  trantor::ConcurrentTaskQueue* queue;

  bool LoadModelImpl(std::shared_ptr<Json::Value> jsonBody);
  void InferenceImpl(std::shared_ptr<Json::Value> jsonBody,
                     std::function<void(const HttpResponsePtr&)>& callback);
  void EmbeddingImpl(std::shared_ptr<Json::Value> jsonBody,
                     std::function<void(const HttpResponsePtr&)>& callback);
  bool CheckModelLoaded(std::function<void(const HttpResponsePtr&)>& callback);
  void WarmupModel();
  void BackgroundTask();
  void StopBackgroundTask();
};
}  // namespace inferences
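
The extracted header that this controller now includes, context/llama_server_context.h, is not shown in this view. As a rough orientation only, here is a sketch of the kind of declarations it might carry; the member names are assumptions made for illustration, not the actual contents of the file:

// Illustrative sketch only: not the actual context/llama_server_context.h.
#pragma once

#include <vector>

// Hypothetical per-request slot state.
struct llama_client_slot {
  int id = -1;
  std::vector<int> cache_tokens;
};

// Hypothetical shared server state that the controller now owns through its
// `llama_server_context llama;` member instead of defining it inline.
struct llama_server_context {
  bool model_loaded = false;
  std::vector<llama_client_slot> slots;
  // model loading, tokenization, and batching helpers would live here
};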

@louis-jan louis-jan requested a review from tikikun February 29, 2024 01:26
        prompt_tokens.begin() + slot.params.n_keep);
    new_tokens.insert(new_tokens.end(),
                      prompt_tokens.begin() + slot.params.n_keep +
                          erased_blocks * n_block_size,

Check failure (Code scanning / CodeQL): Multiplication result converted to larger type
Multiplication result may overflow 'int' before it is converted to 'difference_type'.
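
A common way to clear this class of warning is to widen one operand before the multiplication so the offset is computed in the iterator's difference type rather than in int. A minimal sketch under that assumption follows; plain int stands in for llama_token, and shift_prompt_cache is a hypothetical name, not a function in this codebase:

#include <cstddef>
#include <vector>

// Hypothetical illustration of the fix: cast one operand up front so the
// offset is computed as std::ptrdiff_t instead of overflowing in int.
void shift_prompt_cache(std::vector<int>& new_tokens,
                        const std::vector<int>& prompt_tokens,
                        int n_keep, int erased_blocks, int n_block_size) {
  const std::ptrdiff_t offset =
      static_cast<std::ptrdiff_t>(erased_blocks) * n_block_size;  // widened multiply
  new_tokens.insert(new_tokens.end(),
                    prompt_tokens.begin() + n_keep + offset,
                    prompt_tokens.end());
}

Casting either operand before multiplying keeps the arithmetic in the wider type, which is typically enough to satisfy the CodeQL query.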
@tikikun tikikun left a comment

LGTM

@tikikun tikikun merged commit 71824b9 into main Feb 29, 2024
@louis-jan louis-jan deleted the refactor/llama-server-context-should-not-be-in-controller branch May 24, 2024 03:11