[Inference] First PR for rebuild colossal-infer #5143

Merged
2 commits merged on Dec 1, 2023
229 changes: 0 additions & 229 deletions colossalai/inference/README.md

This file was deleted.

4 changes: 0 additions & 4 deletions colossalai/inference/__init__.py
@@ -1,4 +0,0 @@
from .engine import InferenceEngine
from .engine.policies import BloomModelInferPolicy, ChatGLM2InferPolicy, LlamaModelInferPolicy

__all__ = ["InferenceEngine", "LlamaModelInferPolicy", "BloomModelInferPolicy", "ChatGLM2InferPolicy"]
Empty file.
73 changes: 73 additions & 0 deletions colossalai/inference/core/engine.py
@@ -0,0 +1,73 @@
from logging import Logger
from typing import Optional

from .request_handler import RequestHandler


class InferEngine:
    """
    InferEngine is the core component for inference.

    It is responsible for launching the inference process, including:
    - Initializing the model and the distributed environment (if needed)
    - Launching the request_handler and the corresponding KV cache manager
    - Receiving requests and generating texts
    - Logging the generation process

    Args:
        colossal_config: A unified config API that wraps all of the configs below; you can pass it instead of the individual configs.
        model_config: The configuration of the model.
        parallel_config: The configuration for parallelizing the model.
        cache_config: The configuration for initializing and managing the KV cache.
        tokenizer (Tokenizer): The tokenizer to be used for inference.
        use_logger (bool): Whether to log the generation process.
    """

    def __init__(
        self,
        model_config,
        cache_config,
        parallel_config,
        tokenizer,
        use_logger: bool = False,
        colossal_config: Optional["ColossalInferConfig"] = None,
    ) -> None:
        assert colossal_config or (
            model_config and cache_config and parallel_config
        ), "Please provide colossal_config or model_config, cache_config, parallel_config"
        if colossal_config:
            model_config, cache_config, parallel_config = colossal_config

        self.model_config = model_config
        self.cache_config = cache_config
        self.parallel_config = parallel_config
        self._verify_config()

        self._init_model()
        self.request_handler = RequestHandler(cache_config)
        if use_logger:
            self.logger = Logger("InferEngine")

    def _init_model(self):
        """
        Initialize the model and the distributed environment (if needed).
        Two different initialization paths may be needed:
        1. User-defined model (from a local path)
        2. Load from a checkpoint (Hugging Face)
        """

    def _verify_config(self):
        """
        Verify the configuration to avoid potential bugs.
        """

    def generate(self):
        pass

    def step(self):
        """
        In each step, do the following:
        1. Run the request_handler to update the KV cache and the running input_ids
        2. Run the model to generate the next token
        3. Check whether any request has finished and decode it
        """
10 changes: 10 additions & 0 deletions colossalai/inference/core/request_handler.py
@@ -0,0 +1,10 @@
class RequestHandler:
    def __init__(self, cache_config) -> None:
        self.cache_config = cache_config
        self._init_cache()

    def _init_cache(self):
        pass

    def schedule(self, request):
        pass
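
Similarly, a hedged sketch of driving the `RequestHandler` on its own. `schedule` is still a stub here, and the request dict below is purely illustrative, since the PR does not yet define a request type.

```python
from colossalai.inference.core.request_handler import RequestHandler

# Placeholder cache config; _init_cache() does nothing yet in this PR.
handler = RequestHandler(cache_config={"block_size": 16, "max_num_blocks": 256})

# schedule() currently returns None; per the engine.step() docstring it is
# intended to pick the requests that run in the next generation step.
handler.schedule(request={"prompt": "Hello, world", "max_new_tokens": 8})
```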
3 changes: 0 additions & 3 deletions colossalai/inference/engine/__init__.py

This file was deleted.
