feat(model)!: adopt containerized model serving (#542)

Because - we are completely moving to serving models in container image format. This commit - retire `controller-model` - retire `model-repository` - retire caching mechanism - retire `github`, `huggingface` and `artivc` model definitions - retire github PAT - refactor `create` and `deploy/undeploy` methods to be sync calls - add model version instance under namespace - refactor create/deploy/undeploy/trigger methods to have version instance concept - move deploy/undeploy endpoints to private - support accelerator type - add detail message for model instance status - support async model trigger with Temporal. Resolves INS-3724, resolves INS-3715, resolves INS-3714, resolves INS-3713, resolves INS-4050.
instill-ai · Apr 8, 2024 · 3c80f39 · 3c80f39
1 parent d4ed219
commit 3c80f39
Show file tree

Hide file tree

Showing 55 changed files with 3,383 additions and 7,207 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -18,42 +18,8 @@ RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=typ
 RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS CGO_ENABLED=0 GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-worker ./cmd/worker
 RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS CGO_ENABLED=0 GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-init-model ./cmd/model
 
-# ArtiVC to work with cloud storage
-ARG TARGETOS TARGETARCH ARTIVC_VERSION
-ADD https://github.com/InfuseAI/ArtiVC/releases/download/v${ARTIVC_VERSION}/ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz
-RUN tar -xf ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz -C /usr/local/bin
-
-# Mounting points
-RUN mkdir /model-repository
-RUN mkdir /.cache
-
 FROM golang:${GOLANG_VERSION}
 
-ENV DEBIAN_FRONTEND noninteractive
-
-# tools to work with versatile model import
-RUN apt-get update && apt-get install -y \
-    python3 \
-    python3-setuptools \
-    python3-pip \
-    python3-venv \
-    git \
-    git-lfs \
-    curl \
-    && rm -rf /var/lib/apt/lists/*
-
-ENV VENV=/opt/venv
-RUN python3 -m venv $VENV
-ENV PATH="$VENV/bin:$PATH"
-
-# RUN export PIP_DEFAULT_TIMEOUT=10000
-RUN pip install --upgrade pip setuptools wheel
-RUN pip install dvc[gs]==2.34.2
-# RUN pip install jsonschema pyyaml
-# RUN pip install --no-cache-dir opencv-contrib-python-headless transformers pillow torch torchvision onnxruntime dvc[gs]==2.34.2
-# RUN pip install --no-cache-dir ray[serve] scikit-image
-# RUN pip install --no-cache-dir instill-sdk==0.3.2rc7
-
 # Need permission on the /tmp folder for internal processes, such as storing temporary files.
 RUN chown -R nobody:nogroup /tmp
 # Need permission of /nonexistent folder for HuggingFace internal process.
@@ -77,7 +43,3 @@ COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-init ./
 COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME} ./
 COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-worker ./
 COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-init-model ./
-COPY --from=build --chown=nobody:nogroup /usr/local/bin/avc /usr/local/bin/avc
-
-COPY --from=build --chown=nobody:nogroup /model-repository /model-repository
-COPY --from=build --chown=nobody:nogroup /.cache /.cache
diff --git a/Dockerfile.dev b/Dockerfile.dev
@@ -21,29 +21,10 @@ ARG K6_VERSION
 ADD https://github.com/grafana/k6/releases/download/v${K6_VERSION}/k6-v${K6_VERSION}-linux-${TARGETARCH}.tar.gz k6-v${K6_VERSION}-linux-${TARGETARCH}.tar.gz
 RUN tar -xf k6-v${K6_VERSION}-linux-$TARGETARCH.tar.gz --strip-components 1 -C /usr/bin
 
-# ArtiVC to work with cloud storage
-ADD https://github.com/InfuseAI/ArtiVC/releases/download/v${ARTIVC_VERSION}/ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz
-RUN tar -xf ArtiVC-v${ARTIVC_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz -C /bin
-
-# tools to work with versatile model import
 RUN apt-get update && apt-get install -y \
-    python3 \
-    python3-setuptools \
-    python3-pip \
-    python3-venv \
-    git \
-    git-lfs \
     curl \
     && rm -rf /var/lib/apt/lists/*
 
-ENV VENV=/opt/venv
-RUN python3 -m venv $VENV
-ENV PATH="$VENV/bin:$PATH"
-
-# RUN export PIP_DEFAULT_TIMEOUT=10000
-RUN pip install --upgrade pip setuptools wheel
-RUN pip install dvc[gs]==2.34.2
-
 # -- set up Go
 COPY go.mod go.sum ./
 RUN go mod download
@@ -58,9 +39,6 @@ RUN mkdir /nonexistent > /dev/null && chown -R nobody:nogroup /nonexistent
 ENV GOCACHE /go/.cache/go-build
 ENV GOENV /go/.config/go/env
 
-# -- set up mounting points
-RUN mkdir /model-repository && chown -R nobody:nogroup /model-repository
-
 USER nobody:nogroup
 
 ENTRYPOINT ["tail", "-f", "/dev/null"]
diff --git a/assets/ray/proto/ray_server.proto b/assets/ray/proto/ray_server.proto
@@ -1,4 +1,5 @@
 syntax = "proto3";
+option cc_enable_arenas = true;
 
 package ray.serve;
 
@@ -13,9 +14,7 @@ message AutoscalingConfig {
   // to min_replicas.
   uint32 max_replicas = 2;
 
-  // Target number of in flight requests per replicas. This is the primary configuration
-  // knob for replica autoscaler. Lower the number, the more rapidly will the replicas
-  // being scaled up. Must be a non-negative integer.
+  // [DEPRECATED] Use `target_ongoing_requests` instead.
   double target_num_ongoing_requests_per_replica = 3;
 
   // How often each replica sends metrics to the autoscaler.
@@ -24,7 +23,7 @@ message AutoscalingConfig {
   // The window (in seconds) for autoscaler to calculate rolling average of metrics on.
   double look_back_period_s = 5;
 
-  // The multiplicative "gain" factor to limit scaling decisions.
+  // [DEPRECATED] Use `upscaling_factor` and/or `downscaling_factor` instead.
   double smoothing_factor = 6;
 
   // How long to wait before scaling down replicas.
@@ -36,52 +35,90 @@ message AutoscalingConfig {
   // Initial number of replicas deployment should start with. Must be non-negative.
   optional uint32 initial_replicas = 9;
 
-  // The multiplicative "gain" factor to limit upscale.
+  // [DEPRECATED] Use `upscaling_factor` instead.
   optional double upscale_smoothing_factor = 10;
 
-  // The multiplicative "gain" factor to limit downscale.
+  // [DEPRECATED] Use `downscaling_factor` instead.
   optional double downscale_smoothing_factor = 11;
+
+  // The cloudpickled policy definition.
+  bytes _serialized_policy_def = 12;
+
+  // The import path of the policy if user passed a string. Will be the concatenation
+  // of the policy module and the policy name if user passed a callable.
+  string _policy = 13;
+
+  // Target number of in-flight requests per replica. This is the primary configuration
+  // knob for the replica autoscaler. The lower the number, the more rapidly the
+  // replicas scale up. Must be a non-negative integer.
+  double target_ongoing_requests = 14;
+
+  // The multiplicative "gain" factor to limit upscale.
+  optional double upscaling_factor = 15;
+
+  // The multiplicative "gain" factor to limit downscale.
+  optional double downscaling_factor = 16;
+}
+
+//[Begin] LOGGING CONFIG
+// Encoding type
+enum EncodingType {
+  TEXT = 0;
+  JSON = 1;
 }
 
+message LoggingConfig {
+  EncodingType encoding = 1;
+  string log_level = 2;
+  string logs_dir = 3;
+  bool enable_access_log = 4;
+}
+
+//[End] Logging Config
+
 // Configuration options for a deployment, to be set by the user.
 message DeploymentConfig {
   // The number of processes to start up that will handle requests to this deployment.
-  // Defaults to 1.
   int32 num_replicas = 1;
 
   // The maximum number of queries that will be sent to a replica of this deployment
-  // without receiving a response. Defaults to 100.
-  int32 max_concurrent_queries = 2;
+  // without receiving a response.
+  int32 max_ongoing_requests = 2;
+
+  // The maximum number of requests that will be queued in deployment handles.
+  int32 max_queued_requests = 3;
 
   // Arguments to pass to the reconfigure method of the deployment. The reconfigure method
   // is called if user_config is not None.
-  bytes user_config = 3;
+  bytes user_config = 4;
 
   // Duration that deployment replicas will wait until there is no more work to be done
   // before shutting down.
-  double graceful_shutdown_wait_loop_s = 4;
+  double graceful_shutdown_wait_loop_s = 5;
 
   // Controller waits for this duration to forcefully kill the replica for shutdown.
-  double graceful_shutdown_timeout_s = 5;
+  double graceful_shutdown_timeout_s = 6;
 
   // Frequency at which the controller health checks replicas.
-  double health_check_period_s = 6;
+  double health_check_period_s = 7;
 
   // Timeout after which a replica is marked unhealthy without a response.
-  double health_check_timeout_s = 7;
+  double health_check_timeout_s = 8;
 
   // Whether the deployment is constructed cross-language.
-  bool is_cross_language = 8;
+  bool is_cross_language = 9;
 
   // The deployment's programming language.
-  DeploymentLanguage deployment_language = 9;
+  DeploymentLanguage deployment_language = 10;
 
   // The deployment's autoscaling configuration.
-  AutoscalingConfig autoscaling_config = 10;
+  AutoscalingConfig autoscaling_config = 11;
+
+  string version = 12;
 
-  string version = 11;
+  repeated string user_configured_option_names = 13;
 
-  repeated string user_configured_option_names = 12;
+  LoggingConfig logging_config = 14;
 }
 
 // Deployment language.
@@ -98,6 +135,10 @@ message RequestMetadata {
   string call_method = 3;
 
   map<string, string> context = 4;
+
+  string multiplexed_model_id = 5;
+
+  string route = 6;
 }
 
 message RequestWrapper {
@@ -155,6 +196,12 @@ message ReplicaConfig {
   int32 max_replicas_per_node = 8;
 }
 
+enum TargetCapacityDirection {
+  UNSET = 0;
+  UP = 1;
+  DOWN = 2;
+}
+
 message DeploymentInfo {
   string name = 1;
   DeploymentConfig deployment_config = 2;
@@ -163,6 +210,8 @@ message DeploymentInfo {
   string actor_name = 5;
   string version = 6;
   int64 end_time_ms = 7;
+  double target_capacity = 8;
+  TargetCapacityDirection target_capacity_direction = 9;
 }
 
 // Wrap DeploymentInfo and route. The "" route value needs to be converted to None/null.
@@ -182,12 +231,28 @@ enum DeploymentStatus {
   DEPLOYMENT_STATUS_UPDATING = 0;
   DEPLOYMENT_STATUS_HEALTHY = 1;
   DEPLOYMENT_STATUS_UNHEALTHY = 2;
+  DEPLOYMENT_STATUS_UPSCALING = 3;
+  DEPLOYMENT_STATUS_DOWNSCALING = 4;
+}
+
+enum DeploymentStatusTrigger {
+  DEPLOYMENT_STATUS_TRIGGER_UNSPECIFIED = 0;
+  DEPLOYMENT_STATUS_TRIGGER_CONFIG_UPDATE_STARTED = 1;
+  DEPLOYMENT_STATUS_TRIGGER_CONFIG_UPDATE_COMPLETED = 2;
+  DEPLOYMENT_STATUS_TRIGGER_UPSCALE_COMPLETED = 3;
+  DEPLOYMENT_STATUS_TRIGGER_DOWNSCALE_COMPLETED = 4;
+  DEPLOYMENT_STATUS_TRIGGER_AUTOSCALING = 5;
+  DEPLOYMENT_STATUS_TRIGGER_REPLICA_STARTUP_FAILED = 6;
+  DEPLOYMENT_STATUS_TRIGGER_HEALTH_CHECK_FAILED = 7;
+  DEPLOYMENT_STATUS_TRIGGER_INTERNAL_ERROR = 8;
+  DEPLOYMENT_STATUS_TRIGGER_DELETING = 9;
 }
 
 message DeploymentStatusInfo {
   string name = 1;
   DeploymentStatus status = 2;
   string message = 3;
+  DeploymentStatusTrigger status_trigger = 4;
 }
 
 // Wrap a list for DeploymentStatusInfo.

diff --git a/assets/scripts/query_model_onnx.py b/assets/scripts/query_model_onnx.py
diff --git a/assets/tiny-vit-random/README.md b/assets/tiny-vit-random/README.md