Changing the waiting_served_ratio default (stack more aggressively by default). #1820

Merged: 2 commits, Apr 28, 2024
2 changes: 1 addition & 1 deletion docs/source/basic_tutorials/launcher.md
@@ -162,7 +162,7 @@ Options:
This setting is only applied if there is room in the batch as defined by `max_batch_total_tokens`.

[env: WAITING_SERVED_RATIO=]
-[default: 1.2]
+[default: 0.3]

```
## MAX_BATCH_PREFILL_TOKENS
2 changes: 1 addition & 1 deletion launcher/src/main.rs
@@ -251,7 +251,7 @@ struct Args {
///
/// This setting is only applied if there is room in the batch
/// as defined by `max_batch_total_tokens`.
#[clap(default_value = "1.2", long, env)]
#[clap(default_value = "0.3", long, env)]
waiting_served_ratio: f32,

/// Limits the number of tokens for the prefill operation.
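For context on what this default changes: `waiting_served_ratio` compares the number of waiting requests to the number currently being served, and once that ratio is reached the router considers pausing decoding to prefill the waiting requests and stack them into the running batch (still only if there is room as defined by `max_batch_total_tokens`). Dropping the default from 1.2 to 0.3 therefore triggers stacking far earlier. The real scheduling lives in the Rust router; the snippet below is only a minimal Python sketch of the threshold check, and `should_try_prefill` and its arguments are illustrative names rather than TGI internals.

```python
# Minimal sketch (not the actual router code) of a waiting/served threshold check.
def should_try_prefill(
    num_waiting: int,
    num_running: int,
    waiting_served_ratio: float = 0.3,
) -> bool:
    """Return True when the waiting queue is large enough, relative to the
    running batch, to justify pausing decoding and prefilling new requests."""
    if num_running == 0:
        # Nothing is being served, so any waiting request should be prefilled.
        return num_waiting > 0
    return num_waiting / num_running >= waiting_served_ratio


# With the old 1.2 default, 4 running requests needed at least 5 waiting ones
# before stacking; with 0.3, 2 waiting against 4 running already qualify.
assert not should_try_prefill(num_waiting=4, num_running=4, waiting_served_ratio=1.2)
assert should_try_prefill(num_waiting=2, num_running=4, waiting_served_ratio=0.3)
```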
13 changes: 10 additions & 3 deletions server/text_generation_server/models/flash_causal_lm.py
@@ -33,7 +33,12 @@
from text_generation_server.utils.dist import MEMORY_FRACTION

tracer = trace.get_tracer(__name__)
-from text_generation_server.utils.import_utils import IS_CUDA_SYSTEM, IS_ROCM_SYSTEM, IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import (
+    IS_CUDA_SYSTEM,
+    IS_ROCM_SYSTEM,
+    IS_XPU_SYSTEM,
+)


@dataclass
class FlashCausalLMBatch(Batch):
@@ -788,14 +793,16 @@ def warmup(self, batch: FlashCausalLMBatch):

if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
total_free_memory, _ = torch.cuda.mem_get_info(self.device)
-total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
+total_gpu_memory = torch.cuda.get_device_properties(
+    self.device
+).total_memory

free_memory = max(
0, total_free_memory - (1 - MEMORY_FRACTION) * total_gpu_memory
)
elif IS_XPU_SYSTEM:
total_gpu_memory = torch.xpu.get_device_properties(self.device).total_memory
-free_memory = int(total_gpu_memory *0.5)
+free_memory = int(total_gpu_memory * 0.5)
else:
raise NotImplementedError("FlashModel is only available on GPU")

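The warmup arithmetic being reformatted above is easier to sanity-check with concrete numbers. The sketch below plugs in made-up values (an 80 GiB card with 70 GiB reported free and an assumed `MEMORY_FRACTION` of 0.9); it is a worked example of the formula, not code from the repository.

```python
# Worked example of the CUDA/ROCm branch above, with illustrative numbers.
# (1 - MEMORY_FRACTION) of the card is held back for other processes, and the
# warmup only budgets against what remains of the currently free memory.
GIB = 1024**3
MEMORY_FRACTION = 0.9             # assumed value; it is read from the environment
total_gpu_memory = 80 * GIB       # e.g. an 80 GiB device
total_free_memory = 70 * GIB      # what torch.cuda.mem_get_info would report

free_memory = max(0, total_free_memory - (1 - MEMORY_FRACTION) * total_gpu_memory)
print(free_memory / GIB)          # 62.0 GiB left for the KV-cache warmup

# The XPU branch has no equivalent probe in this code path, so it simply
# assumes half of the device memory is usable:
xpu_free_memory = int(total_gpu_memory * 0.5)   # 40 GiB on the same card
```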
1 change: 1 addition & 0 deletions server/text_generation_server/models/flash_llama.py
@@ -20,6 +20,7 @@

from text_generation_server.utils.import_utils import IS_XPU_SYSTEM


class FlashLlama(FlashCausalLM):
def __init__(
self,
1 change: 1 addition & 0 deletions server/text_generation_server/models/flash_neox.py
@@ -15,6 +15,7 @@
Weights,
)
from text_generation_server.utils.import_utils import IS_XPU_SYSTEM

tracer = trace.get_tracer(__name__)


1 change: 1 addition & 0 deletions server/text_generation_server/models/flash_rw.py
@@ -16,6 +16,7 @@
Weights,
)
from text_generation_server.utils.import_utils import IS_XPU_SYSTEM

tracer = trace.get_tracer(__name__)


1 change: 1 addition & 0 deletions server/text_generation_server/models/flash_santacoder.py
@@ -19,6 +19,7 @@
)

from text_generation_server.utils.import_utils import IS_XPU_SYSTEM

tracer = trace.get_tracer(__name__)


2 changes: 2 additions & 0 deletions server/text_generation_server/utils/import_utils.py
@@ -1,5 +1,6 @@
import torch


def is_xpu_available():
try:
import intel_extension_for_pytorch
@@ -8,6 +9,7 @@ def is_xpu_available():

return hasattr(torch, "xpu") and torch.xpu.is_available()


IS_ROCM_SYSTEM = torch.version.hip is not None
IS_CUDA_SYSTEM = torch.version.cuda is not None
IS_XPU_SYSTEM = is_xpu_available()
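The blank lines added in this file are purely formatting, but since `import_utils` is the module the other touched files import their device flags from, a short usage sketch may help. The helper below is hypothetical (it simply mirrors the dispatch pattern from `flash_causal_lm.warmup` above); only the imported flags come from the repository.

```python
# Hypothetical helper illustrating how the module-level flags are used for
# device dispatch; `free_device_memory` is not part of the repository.
import torch

from text_generation_server.utils.import_utils import (
    IS_CUDA_SYSTEM,
    IS_ROCM_SYSTEM,
    IS_XPU_SYSTEM,
)


def free_device_memory(device: torch.device) -> int:
    """Estimate the device memory available for the KV cache."""
    if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
        free_memory, _ = torch.cuda.mem_get_info(device)
        return free_memory
    if IS_XPU_SYSTEM:
        # Mirrors the warmup path above: assume half the card is usable.
        return int(torch.xpu.get_device_properties(device).total_memory * 0.5)
    raise NotImplementedError("Flash models are only available on GPU")
```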