Skip to content

Commit

Permalink
feat(model)!: adopt containerized model serving (#542)
Browse files Browse the repository at this point in the history
Because

- we are moving entirely to serving models in the container image format

This commit

- retire `controller-model`
- retire `model-repository`
- retire caching mechanism
- retire `github` `huggingface` and `artivc` model definitions
- retire github PAT
- refactor `create` and `deploy/undeploy` methods to be synchronous calls
- add model version instance under namespace
- refactor create/deploy/undeploy/trigger methods to use the version
instance concept
- move deploy/undeploy endpoints to private
- support accelerator type
- add detail message for model instance status
- support async model trigger with temporal

resolves INS-3724
resolves INS-3715
resolves INS-3714
resolves INS-3713
resolves INS-4050
  • Loading branch information
heiruwu committed Apr 8, 2024
1 parent d4ed219 commit 3c80f39
Show file tree
Hide file tree
Showing 55 changed files with 3,383 additions and 7,207 deletions.
38 changes: 0 additions & 38 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,42 +18,8 @@ RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=typ
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS CGO_ENABLED=0 GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-worker ./cmd/worker
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS CGO_ENABLED=0 GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-init-model ./cmd/model

# ArtiVC to work with cloud storage
ARG TARGETOS TARGETARCH ARTIVC_VERSION
ADD https://github.com/InfuseAI/ArtiVC/releases/download/v${ARTIVC_VERSION}/ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz
RUN tar -xf ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz -C /usr/local/bin

# Mounting points
RUN mkdir /model-repository
RUN mkdir /.cache

FROM golang:${GOLANG_VERSION}

ENV DEBIAN_FRONTEND noninteractive

# tools to work with versatile model import
RUN apt-get update && apt-get install -y \
python3 \
python3-setuptools \
python3-pip \
python3-venv \
git \
git-lfs \
curl \
&& rm -rf /var/lib/apt/lists/*

ENV VENV=/opt/venv
RUN python3 -m venv $VENV
ENV PATH="$VENV/bin:$PATH"

# RUN export PIP_DEFAULT_TIMEOUT=10000
RUN pip install --upgrade pip setuptools wheel
RUN pip install dvc[gs]==2.34.2
# RUN pip install jsonschema pyyaml
# RUN pip install --no-cache-dir opencv-contrib-python-headless transformers pillow torch torchvision onnxruntime dvc[gs]==2.34.2
# RUN pip install --no-cache-dir ray[serve] scikit-image
# RUN pip install --no-cache-dir instill-sdk==0.3.2rc7

# The /tmp folder must be writable by internal processes, e.g. to store temporary files.
RUN chown -R nobody:nogroup /tmp
# Need permission of /nonexistent folder for HuggingFace internal process.
Expand All @@ -77,7 +43,3 @@ COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-init ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME} ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-worker ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-init-model ./
COPY --from=build --chown=nobody:nogroup /usr/local/bin/avc /usr/local/bin/avc

COPY --from=build --chown=nobody:nogroup /model-repository /model-repository
COPY --from=build --chown=nobody:nogroup /.cache /.cache
22 changes: 0 additions & 22 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -21,29 +21,10 @@ ARG K6_VERSION
ADD https://github.com/grafana/k6/releases/download/v${K6_VERSION}/k6-v${K6_VERSION}-linux-${TARGETARCH}.tar.gz k6-v${K6_VERSION}-linux-${TARGETARCH}.tar.gz
RUN tar -xf k6-v${K6_VERSION}-linux-$TARGETARCH.tar.gz --strip-components 1 -C /usr/bin

# ArtiVC to work with cloud storage
ADD https://github.com/InfuseAI/ArtiVC/releases/download/v${ARTIVC_VERSION}/ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz
RUN tar -xf ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz -C /bin

# tools to work with versatile model import
RUN apt-get update && apt-get install -y \
python3 \
python3-setuptools \
python3-pip \
python3-venv \
git \
git-lfs \
curl \
&& rm -rf /var/lib/apt/lists/*

ENV VENV=/opt/venv
RUN python3 -m venv $VENV
ENV PATH="$VENV/bin:$PATH"

# RUN export PIP_DEFAULT_TIMEOUT=10000
RUN pip install --upgrade pip setuptools wheel
RUN pip install dvc[gs]==2.34.2

# -- set up Go
COPY go.mod go.sum ./
RUN go mod download
Expand All @@ -58,9 +39,6 @@ RUN mkdir /nonexistent > /dev/null && chown -R nobody:nogroup /nonexistent
ENV GOCACHE /go/.cache/go-build
ENV GOENV /go/.config/go/env

# -- set up mounting points
RUN mkdir /model-repository && chown -R nobody:nogroup /model-repository

USER nobody:nogroup

ENTRYPOINT ["tail", "-f", "/dev/null"]
103 changes: 84 additions & 19 deletions assets/ray/proto/ray_server.proto
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
syntax = "proto3";
option cc_enable_arenas = true;

package ray.serve;

Expand All @@ -13,9 +14,7 @@ message AutoscalingConfig {
// to min_replicas.
uint32 max_replicas = 2;

// Target number of in flight requests per replicas. This is the primary configuration
// knob for replica autoscaler. Lower the number, the more rapidly will the replicas
// being scaled up. Must be a non-negative integer.
// [DEPRECATED] Use `target_ongoing_requests` instead.
double target_num_ongoing_requests_per_replica = 3;

// How often each replica sends metrics to the autoscaler.
Expand All @@ -24,7 +23,7 @@ message AutoscalingConfig {
// The window (in seconds) for autoscaler to calculate rolling average of metrics on.
double look_back_period_s = 5;

// The multiplicative "gain" factor to limit scaling decisions.
// [DEPRECATED] Use `upscaling_factor` and/or `downscaling_factor` instead.
double smoothing_factor = 6;

// How long to wait before scaling down replicas.
Expand All @@ -36,52 +35,90 @@ message AutoscalingConfig {
// Initial number of replicas deployment should start with. Must be non-negative.
optional uint32 initial_replicas = 9;

// The multiplicative "gain" factor to limit upscale.
// [DEPRECATED] Use `upscaling_factor` instead.
optional double upscale_smoothing_factor = 10;

// The multiplicative "gain" factor to limit downscale.
// [DEPRECATED] Use `downscaling_factor` instead.
optional double downscale_smoothing_factor = 11;

// The cloudpickled policy definition.
bytes _serialized_policy_def = 12;

// The import path of the policy if user passed a string. Will be the concatenation
// of the policy module and the policy name if user passed a callable.
string _policy = 13;

// Target number of in-flight requests per replica. This is the primary configuration
// knob for the replica autoscaler. The lower the number, the more rapidly the replicas
// scale up. Must be a non-negative integer.
double target_ongoing_requests = 14;

// The multiplicative "gain" factor to limit upscale.
optional double upscaling_factor = 15;

// The multiplicative "gain" factor to limit downscale.
optional double downscaling_factor = 16;
}

//[Begin] LOGGING CONFIG
// Encoding type referenced by LoggingConfig.encoding.
// NOTE(review): presumably selects the format in which log records are
// written -- confirm against the Ray Serve logging documentation.
enum EncodingType {
// Zero value: under proto3 this is what a reader sees when the field is unset.
TEXT = 0;
JSON = 1;
}

// Logging options carried by DeploymentConfig.logging_config.
// NOTE(review): the per-field descriptions below are inferred from the field
// names only -- confirm against the Ray Serve logging documentation.
message LoggingConfig {
// Log encoding; one of the EncodingType values.
EncodingType encoding = 1;
// Presumably a log level name -- TODO confirm the accepted values.
string log_level = 2;
// Presumably the directory that log files are written to -- TODO confirm.
string logs_dir = 3;
// Presumably toggles access logging -- TODO confirm.
bool enable_access_log = 4;
}

//[End] LOGGING CONFIG

// Configuration options for a deployment, to be set by the user.
message DeploymentConfig {
// The number of processes to start up that will handle requests to this deployment.
// Defaults to 1.
int32 num_replicas = 1;

// The maximum number of queries that will be sent to a replica of this deployment
// without receiving a response. Defaults to 100.
int32 max_concurrent_queries = 2;
// without receiving a response.
int32 max_ongoing_requests = 2;

// The maximum number of requests that will be queued in deployment handles.
int32 max_queued_requests = 3;

// Arguments to pass to the reconfigure method of the deployment. The reconfigure method
// is called if user_config is not None.
bytes user_config = 3;
bytes user_config = 4;

// Duration that deployment replicas will wait until there is no more work to be done
// before shutting down.
double graceful_shutdown_wait_loop_s = 4;
double graceful_shutdown_wait_loop_s = 5;

// Controller waits for this duration to forcefully kill the replica for shutdown.
double graceful_shutdown_timeout_s = 5;
double graceful_shutdown_timeout_s = 6;

// Frequency at which the controller health checks replicas.
double health_check_period_s = 6;
double health_check_period_s = 7;

// Timeout after which a replica is marked unhealthy without a response.
double health_check_timeout_s = 7;
double health_check_timeout_s = 8;

// Whether the deployment is constructed cross-language.
bool is_cross_language = 8;
bool is_cross_language = 9;

// The deployment's programming language.
DeploymentLanguage deployment_language = 9;
DeploymentLanguage deployment_language = 10;

// The deployment's autoscaling configuration.
AutoscalingConfig autoscaling_config = 10;
AutoscalingConfig autoscaling_config = 11;

string version = 12;

string version = 11;
repeated string user_configured_option_names = 13;

repeated string user_configured_option_names = 12;
LoggingConfig logging_config = 14;
}

// Deployment language.
Expand All @@ -98,6 +135,10 @@ message RequestMetadata {
string call_method = 3;

map<string, string> context = 4;

string multiplexed_model_id = 5;

string route = 6;
}

message RequestWrapper {
Expand Down Expand Up @@ -155,6 +196,12 @@ message ReplicaConfig {
int32 max_replicas_per_node = 8;
}

// Value type of DeploymentInfo.target_capacity_direction.
// NOTE(review): presumably records whether the target capacity is being
// raised or lowered -- confirm against the Ray Serve controller.
enum TargetCapacityDirection {
// Zero value: under proto3 this is what a reader sees when the field is unset.
UNSET = 0;
UP = 1;
DOWN = 2;
}

message DeploymentInfo {
string name = 1;
DeploymentConfig deployment_config = 2;
Expand All @@ -163,6 +210,8 @@ message DeploymentInfo {
string actor_name = 5;
string version = 6;
int64 end_time_ms = 7;
double target_capacity = 8;
TargetCapacityDirection target_capacity_direction = 9;
}

// Wrap DeploymentInfo and route. The "" route value needs to be converted to None/null.
Expand All @@ -182,12 +231,28 @@ enum DeploymentStatus {
DEPLOYMENT_STATUS_UPDATING = 0;
DEPLOYMENT_STATUS_HEALTHY = 1;
DEPLOYMENT_STATUS_UNHEALTHY = 2;
DEPLOYMENT_STATUS_UPSCALING = 3;
DEPLOYMENT_STATUS_DOWNSCALING = 4;
}

// Value type of DeploymentStatusInfo.status_trigger, reported next to the
// DeploymentStatus value in the same message.
// NOTE(review): presumably identifies the event that led to the reported
// status -- confirm against the Ray Serve controller.
enum DeploymentStatusTrigger {
// Zero value: under proto3 this is what a reader sees when the field is unset.
DEPLOYMENT_STATUS_TRIGGER_UNSPECIFIED = 0;
DEPLOYMENT_STATUS_TRIGGER_CONFIG_UPDATE_STARTED = 1;
DEPLOYMENT_STATUS_TRIGGER_CONFIG_UPDATE_COMPLETED = 2;
DEPLOYMENT_STATUS_TRIGGER_UPSCALE_COMPLETED = 3;
DEPLOYMENT_STATUS_TRIGGER_DOWNSCALE_COMPLETED = 4;
DEPLOYMENT_STATUS_TRIGGER_AUTOSCALING = 5;
DEPLOYMENT_STATUS_TRIGGER_REPLICA_STARTUP_FAILED = 6;
DEPLOYMENT_STATUS_TRIGGER_HEALTH_CHECK_FAILED = 7;
DEPLOYMENT_STATUS_TRIGGER_INTERNAL_ERROR = 8;
DEPLOYMENT_STATUS_TRIGGER_DELETING = 9;
}

// Status record pairing a name with a DeploymentStatus, a message and a
// DeploymentStatusTrigger.
message DeploymentStatusInfo {
// NOTE(review): presumably the name of the deployment being described -- confirm.
string name = 1;
// One of the DeploymentStatus values.
DeploymentStatus status = 2;
// NOTE(review): presumably free-form detail accompanying the status -- confirm.
string message = 3;
// One of the DeploymentStatusTrigger values.
DeploymentStatusTrigger status_trigger = 4;
}

// Wrap a list for DeploymentStatusInfo.
Expand Down
37 changes: 0 additions & 37 deletions assets/scripts/query_model_onnx.py

This file was deleted.

5 changes: 0 additions & 5 deletions assets/tiny-vit-random/README.md

This file was deleted.

Loading

0 comments on commit 3c80f39

Please sign in to comment.