From 8f565827856c91ea5efca33562cd4445a4203c1e Mon Sep 17 00:00:00 2001
From: HR Wu <5631010+heiruwu@users.noreply.github.com>
Date: Tue, 19 Mar 2024 16:20:22 +0800
Subject: [PATCH] feat(helm,docker-compose): add registry service (#435)

Because

- Instill core will be incorporating an image registry service

This commit

- add `docker registry` service in docker-compose deployment
- add `docker registry` service in helm deployment

Resolves INS-3818
Resolves INS-3819
Resolves INS-3934
---
 .env                                          |   6 +
 Makefile                                      |   4 +-
 charts/core/templates/_helpers.tpl            |  19 +++
 .../templates/model-backend/configmap.yaml    |   3 +
 charts/core/templates/pvc.yaml                |  31 +++++
 .../templates/ray-service/ray-service.yaml    |  58 ++++++++
 charts/core/templates/registry/configmap.yaml |  28 ++++
 charts/core/templates/registry/cronjob.yaml   |  70 ++++++++++
 .../core/templates/registry/deployment.yaml   | 131 ++++++++++++++++++
 charts/core/templates/registry/hpa.yaml       |  33 +++++
 charts/core/templates/registry/service.yaml   |  22 +++
 charts/core/values.yaml                       | 103 +++++++++++++-
 configs/registry/config.yaml                  |  73 ++++++++++
 configs/registry/registries.conf              |  82 +++++++++++
 docker-compose-latest.yml                     |   4 +
 docker-compose.yml                            |  22 ++-
 16 files changed, 683 insertions(+), 6 deletions(-)
 create mode 100644 charts/core/templates/registry/configmap.yaml
 create mode 100644 charts/core/templates/registry/cronjob.yaml
 create mode 100644 charts/core/templates/registry/deployment.yaml
 create mode 100644 charts/core/templates/registry/hpa.yaml
 create mode 100644 charts/core/templates/registry/service.yaml
 create mode 100644 configs/registry/config.yaml
 create mode 100644 configs/registry/registries.conf

diff --git a/.env b/.env
index dbba5af2..57cb7952 100644
--- a/.env
+++ b/.env
@@ -192,3 +192,9 @@ SOCAT_IMAGE=alpine/socat
 SOCAT_VERSION=1.7.4.4-r0
 SOCAT_HOST=socat
 SOCAT_PORT=2375
+
+# registry
+REGISTRY_IMAGE=registry
+REGISTRY_VERSION=2.8.3
+REGISTRY_HOST=registry
+REGISTRY_PORT=5000
diff --git a/Makefile b/Makefile
index b817f4d2..45311098 100644
--- a/Makefile
+++ b/Makefile
@@ -47,13 +47,13 @@ HELM_RELEASE_NAME := core
 
 .PHONY: all
 all:			## Launch all services with their up-to-date release version
+	@docker inspect --type=image instill/ray:${RAY_RELEASE_TAG} >/dev/null 2>&1 || { printf "\033[1;33mINFO:\033[0m This may take a while due to the enormous size of the Ray server image, but the image pulling process should be just a one-time effort.\n"; sleep 5; }
 	@make build-release BUILD=${BUILD}
 	@if [ ! -f "$$(echo ${SYSTEM_CONFIG_PATH}/user_uid)" ]; then \
 		mkdir -p ${SYSTEM_CONFIG_PATH} && \
 		docker run --rm --name uuidgen ${INSTILL_CORE_IMAGE_NAME}:${INSTILL_CORE_VERSION} uuidgen > ${SYSTEM_CONFIG_PATH}/user_uid; \
 	fi
 ifeq (${NVIDIA_GPU_AVAILABLE}, true)
-	@docker inspect --type=image instill/ray:${RAY_SERVER_VERSION} >/dev/null 2>&1 || printf "\033[1;33mINFO:\033[0m This may take a while due to the enormous size of the Ray server image, but the image pulling process should be just a one-time effort.\n" && sleep 5
 	@cat docker-compose-nvidia.yml | yq '.services.ray_server.deploy.resources.reservations.devices[0].device_ids |= (strenv(NVIDIA_VISIBLE_DEVICES) | split(",")) | ..style="double"' | \
 		EDITION=$${EDITION:=local-ce} DEFAULT_USER_UID=$$(cat ${SYSTEM_CONFIG_PATH}/user_uid) RAY_RELEASE_TAG=${RAY_RELEASE_TAG} docker compose ${COMPOSE_FILES} -f - up -d --quiet-pull
 else
@@ -62,13 +62,13 @@ endif
 
 .PHONY: latest
 latest:			## Lunch all dependent services with their latest codebase
+	@docker inspect --type=image instill/ray:${RAY_LATEST_TAG} >/dev/null 2>&1 || { printf "\033[1;33mINFO:\033[0m This may take a while due to the enormous size of the Ray server image, but the image pulling process should be just a one-time effort.\n"; sleep 5; }
 	@make build-latest PROFILE=${PROFILE} BUILD=${BUILD}
 	@if [ ! -f "$$(echo ${SYSTEM_CONFIG_PATH}/user_uid)" ]; then \
 		mkdir -p ${SYSTEM_CONFIG_PATH} && \
 		docker run --rm --name uuidgen ${INSTILL_CORE_IMAGE_NAME}:latest uuidgen > ${SYSTEM_CONFIG_PATH}/user_uid; \
 	fi
 ifeq (${NVIDIA_GPU_AVAILABLE}, true)
-	@docker inspect --type=image instill/ray:${RAY_LATEST_TAG} >/dev/null 2>&1 || printf "\033[1;33mINFO:\033[0m This may take a while due to the enormous size of the Ray server image, but the image pulling process should be just a one-time effort.\n" && sleep 5
 	@cat docker-compose-nvidia.yml | yq '.services.ray_server.deploy.resources.reservations.devices[0].device_ids |= (strenv(NVIDIA_VISIBLE_DEVICES) | split(",")) | ..style="double"' | \
 		COMPOSE_PROFILES=${PROFILE} EDITION=$${EDITION:=local-ce:latest} DEFAULT_USER_UID=$$(cat ${SYSTEM_CONFIG_PATH}/user_uid) RAY_LATEST_TAG=${RAY_LATEST_TAG} docker compose ${COMPOSE_FILES} -f docker-compose-latest.yml -f - up -d --quiet-pull
 else
diff --git a/charts/core/templates/_helpers.tpl b/charts/core/templates/_helpers.tpl
index c09a2f32..c250eb60 100644
--- a/charts/core/templates/_helpers.tpl
+++ b/charts/core/templates/_helpers.tpl
@@ -384,6 +384,21 @@ openfga
   {{- printf "%s-openfga" (include "core.fullname" .) -}}
 {{- end -}}
 
+{{/*
+registry
+*/}}
+{{- define "core.registry" -}}
+  {{- printf "%s-registry" (include "core.fullname" .) -}}
+{{- end -}}
+
+{{- define "core.registry.port" -}}
+  {{- printf "5000" -}}
+{{- end -}}
+
+{{- define "core.registry.metricsPort" -}}
+  {{- printf "5001" -}}
+{{- end -}}
+
 {{/*
 internal TLS secret names
 */}}
@@ -446,6 +461,10 @@ Persistent Volume Claims
   {{- printf "%s-ray-conda-data-volume" (include "core.fullname" .) -}}
 {{- end -}}
 
+{{- define "core.registryDataVolume" -}}
+  {{- printf "%s-registry-data-volume" (include "core.fullname" .) -}}
+{{- end -}}
+
 {{- define "core.databaseDataVolume" -}}
   {{- printf "%s-database-data-volume" (include "core.fullname" .) -}}
 {{- end -}}
diff --git a/charts/core/templates/model-backend/configmap.yaml b/charts/core/templates/model-backend/configmap.yaml
index b46b9345..7aab44ea 100644
--- a/charts/core/templates/model-backend/configmap.yaml
+++ b/charts/core/templates/model-backend/configmap.yaml
@@ -95,3 +95,6 @@ data:
     openfga:
       host: {{ template "core.openfga" . }}
       port: 8080
+    registry:
+      host: {{ template "core.registry" . }}
+      port: {{ template "core.registry.port" . }}
diff --git a/charts/core/templates/pvc.yaml b/charts/core/templates/pvc.yaml
index 031299fe..399ba3c5 100644
--- a/charts/core/templates/pvc.yaml
+++ b/charts/core/templates/pvc.yaml
@@ -57,6 +57,37 @@ spec:
     {{- end }}
   {{- end }}
 {{- end }}
+{{- $registry := .Values.persistence.persistentVolumeClaim.registry -}}
+{{- if not $registry.existingClaim }}
+---
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: {{ template "core.registryDataVolume" . }}
+  annotations:
+  {{- range $key, $value := $registry.annotations }}
+    {{ $key }}: {{ $value | quote }}
+  {{- end }}
+  {{- if eq .Values.persistence.resourcePolicy "keep" }}
+    helm.sh/resource-policy: keep
+  {{- end }}
+  labels:
+    {{- include "core.labels" . | nindent 4 }}
+    app.kubernetes.io/component: registry
+spec:
+  accessModes:
+    - {{ $registry.accessMode }}
+  resources:
+    requests:
+      storage: {{ $registry.size }}
+  {{- if $registry.storageClass }}
+    {{- if eq "-" $registry.storageClass }}
+  storageClassName: ""
+    {{- else }}
+  storageClassName: {{ $registry.storageClass }}
+    {{- end }}
+  {{- end }}
+{{- end }}
 {{- $database := .Values.persistence.persistentVolumeClaim.database -}}
 {{- if and (not $database.existingClaim) .Values.database.enabled }}
 ---
diff --git a/charts/core/templates/ray-service/ray-service.yaml b/charts/core/templates/ray-service/ray-service.yaml
index 58760672..24cdb197 100644
--- a/charts/core/templates/ray-service/ray-service.yaml
+++ b/charts/core/templates/ray-service/ray-service.yaml
@@ -55,9 +55,24 @@ spec:
               items:
                 - key: cp_conda_env.sh
                   path: cp_conda_env.sh
+          - name: podman-configmap
+            configMap:
+              name: podman
+              defaultMode: 0666
+              items:
+                - key: registries.conf
+                  path: registries.conf
+                - key: policy.json
+                  path: policy.json
+                - key: storage.conf
+                  path: storage.conf
         containers:
           - name: ray-head
             image: {{ .Values.rayService.image.repository }}:{{ .Values.rayService.image.tag }}
+            securityContext:
+              # for mounting /dev/fuse
+              # TODO: maybe implement a fuse-device-plugin-daemonset
+              privileged: true
             imagePullPolicy: Always
             {{- if .Values.rayService.headGroupSpec.resources }}
             resources:
@@ -70,11 +85,15 @@ spec:
                 value: http://core-grafana:80
               - name: RAY_PROMETHEUS_HOST
                 value: http://core-prometheus:9090
+              - name: RAY_worker_register_timeout_seconds
+                value: "360"
             volumeMounts:
               - mountPath: /ray-conda-pack
                 name: ray-conda
               - mountPath: /home/ray/script
                 name: cp-conda-env-configmap
+              - mountPath: /etc/containers/
+                name: podman-configmap
             ports:
               - containerPort: 6379
                 name: gcs-server
@@ -134,6 +153,15 @@ spec:
                 items:
                   - key: start_ray_serve.sh
                     path: start_ray_serve.sh
+            - name: podman-configmap
+              configMap:
+                name: podman
+                defaultMode: 0666
+                items:
+                  - key: registries.conf
+                    path: registries.conf
+                  - key: policy.json
+                    path: policy.json
           containers:
             - name: ray-worker
               image: {{ $.Values.rayService.image.repository }}:{{ $.Values.rayService.image.tag }}
@@ -162,6 +190,8 @@ spec:
                   name: start-ray-serve-configmap
                 - mountPath: /model-repository
                   name: model-repository
+                - mountPath: /etc/containers/
+                  name: podman-configmap
   {{- end }}
 ---
 apiVersion: v1
@@ -212,3 +242,31 @@ data:
     serve start --http-host=0.0.0.0 --grpc-port 9000 --grpc-servicer-functions ray_pb2_grpc.add_RayServiceServicer_to_server
 
     echo "INFO: Start ray serve"
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: podman
+data:
+  registries.conf: |
+    unqualified-search-registries = ["{{ template "core.registry" . }}:{{ template "core.registry.port" . }}", "docker.io", "quay.io"]
+
+    [[registry]]
+    location = "{{ template "core.registry" . }}:{{ template "core.registry.port" . }}"
+    insecure = true
+  policy.json: |
+    {
+      "default": [
+        {
+          "type": "insecureAcceptAnything"
+        }
+      ],
+      "transports": {
+        "docker-daemon": {
+          "": [{ "type": "insecureAcceptAnything" }]
+        }
+      }
+    }
+  storage.conf: |
+    [storage]
+    driver = "overlay"
diff --git a/charts/core/templates/registry/configmap.yaml b/charts/core/templates/registry/configmap.yaml
new file mode 100644
index 00000000..0dde83c3
--- /dev/null
+++ b/charts/core/templates/registry/configmap.yaml
@@ -0,0 +1,28 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ template "core.registry" . }}
+  labels:
+    {{- include "core.labels" . | nindent 4 }}
+    app.kubernetes.io/component: registry
+data:
+  config.yaml: |+
+    version: {{ .Values.registry.config.version }}
+    log:
+      {{- toYaml .Values.registry.config.log | nindent 6 }}
+    storage:
+      {{- toYaml .Values.registry.config.storage | nindent 6 }}
+    http:
+      {{- toYaml .Values.registry.config.http | nindent 6 }}
+    redis:
+      {{- toYaml .Values.registry.config.redis | nindent 6 }}
+    health:
+      storagedriver:
+        enabled: true
+        interval: 10s
+        threshold: 3
+      tcp:
+        - addr: {{ template "core.redis.addr" . }}
+          timeout: 3s
+          interval: 30s
+          threshold: 3
diff --git a/charts/core/templates/registry/cronjob.yaml b/charts/core/templates/registry/cronjob.yaml
new file mode 100644
index 00000000..969c9d05
--- /dev/null
+++ b/charts/core/templates/registry/cronjob.yaml
@@ -0,0 +1,75 @@
+{{- if .Values.registry.garbageCollect.enabled }}
+# Periodically runs `registry garbage-collect` using the registry config that
+# is mounted from the registry ConfigMap at .Values.registry.configPath.
+# NOTE(review): only the config ConfigMap is mounted below; the registry data
+# volume is not, so garbage-collect may not see the stored blobs - confirm.
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: {{ template "core.registry" . }}-garbage-collector
+  labels:
+    {{- include "core.labels" . | nindent 4 }}
+    app.kubernetes.io/component: registry
+spec:
+  concurrencyPolicy: Forbid
+  schedule: {{ .Values.registry.garbageCollect.schedule | quote }}
+  jobTemplate:
+    metadata:
+      labels:
+        {{- include "core.matchLabels" . | nindent 8 }}
+        app.kubernetes.io/component: registry
+      annotations:
+        checksum/config: {{ include (print $.Template.BasePath "/registry/configmap.yaml") . | sha256sum }}
+        {{- with .Values.registry.podAnnotations }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+    spec:
+      template:
+        spec:
+          {{- if .Values.registry.serviceAccountName }}
+          serviceAccountName: {{ .Values.registry.serviceAccountName }}
+          {{- end }}
+          {{- with .Values.imagePullSecrets }}
+          imagePullSecrets:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          # fsGroup is a pod-level field; it is not valid on a container securityContext
+          securityContext:
+            runAsUser: 65534
+            runAsGroup: 65534
+            fsGroup: 65534
+          containers:
+            - name: garbage-collect
+              image: {{ .Values.registry.image.repository }}:{{ .Values.registry.image.tag }}
+              imagePullPolicy: {{ .Values.registry.image.pullPolicy }}
+              command:
+              - /bin/registry
+              - garbage-collect
+              - --delete-untagged={{ .Values.registry.garbageCollect.deleteUntagged }}
+              # must match the mountPath of the config volume below
+              - {{ .Values.registry.configPath }}
+              securityContext:
+                runAsUser: 65534
+                runAsGroup: 65534
+              volumeMounts:
+                - name: config
+                  mountPath: {{ .Values.registry.configPath }}
+                  subPath: config.yaml
+          restartPolicy: OnFailure
+          {{- with .Values.registry.nodeSelector }}
+          nodeSelector:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.registry.affinity }}
+          affinity:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.registry.tolerations }}
+          tolerations:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          volumes:
+            - name: config
+              configMap:
+                name: {{ template "core.registry" . }}
+{{- end }}
\ No newline at end of file
diff --git a/charts/core/templates/registry/deployment.yaml b/charts/core/templates/registry/deployment.yaml
new file mode 100644
index 00000000..bc121a0f
--- /dev/null
+++ b/charts/core/templates/registry/deployment.yaml
@@ -0,0 +1,131 @@
+{{- $registry := .Values.persistence.persistentVolumeClaim.registry -}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ template "core.registry" . }}
+  labels:
+    {{- include "core.labels" . | nindent 4 }}
+    app.kubernetes.io/component: registry
+  annotations:
+    rollme: {{ randAlphaNum 5 | quote }}
+spec:
+  strategy:
+    type: {{ .Values.updateStrategy.type }}
+    {{- if eq .Values.updateStrategy.type "RollingUpdate" }}
+    rollingUpdate:
+      maxSurge: {{ .Values.updateStrategy.rollingUpdate.maxSurge }}
+      maxUnavailable: {{ .Values.updateStrategy.rollingUpdate.maxUnavailable }}
+    {{- else}}
+    rollingUpdate: null
+    {{- end }}
+  {{- if not .Values.registry.autoscaling.enabled }}
+  replicas: {{ .Values.registry.replicaCount }}
+  {{- end }}
+  selector:
+    matchLabels:
+      {{- include "core.matchLabels" . | nindent 6 }}
+      app.kubernetes.io/component: registry
+  template:
+    metadata:
+      labels:
+        {{- include "core.matchLabels" . | nindent 8 }}
+        app.kubernetes.io/component: registry
+      annotations:
+        checksum/config: {{ include (print $.Template.BasePath "/registry/configmap.yaml") . | sha256sum }}
+        {{- with .Values.registry.podAnnotations }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+    spec:
+      securityContext:
+        runAsUser: 65534
+        runAsGroup: 65534
+        fsGroup: 65534
+      {{- if .Values.registry.serviceAccountName }}
+      serviceAccountName: {{ .Values.registry.serviceAccountName }}
+      {{- end }}
+      automountServiceAccountToken: {{ .Values.registry.automountServiceAccountToken | default false }}
+      terminationGracePeriodSeconds: 120
+      {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      initContainers:
+        - name: wait-for-redis
+          image:  {{ .Values.redis.image.repository }}:{{ .Values.redis.image.tag }}
+          command: ['redis-cli', '-h']
+          args:
+            - '$(REDIS_HOST)'
+            - 'ping'
+          env:
+            - name: REDIS_HOST
+              value: "{{ template "core.redis" . }}"
+      containers:
+        - name: registry
+          image: {{ .Values.registry.image.repository }}:{{ .Values.registry.image.tag }}
+          imagePullPolicy: {{ .Values.registry.image.pullPolicy }}
+          command:
+          - /bin/registry
+          - serve
+          - {{ .Values.registry.configPath }}
+          readinessProbe:
+            httpGet:
+              path: /
+              scheme: {{ ternary "https" "http" .Values.internalTLS.enabled | upper }}
+              port: {{ template "core.registry.port" . }}
+            periodSeconds: 10
+          livenessProbe:
+            httpGet:
+              path: /
+              scheme: {{ ternary "https" "http" .Values.internalTLS.enabled | upper }}
+              port: {{ template "core.registry.port" . }}
+            periodSeconds: 10
+          {{- if .Values.registry.resources }}
+          resources:
+            {{- toYaml .Values.registry.resources | nindent 12 }}
+          {{- end }}
+          ports:
+            - name: {{ ternary "https" "http" .Values.internalTLS.enabled }}
+              containerPort: {{ template "core.registry.port" . }}
+              protocol: TCP
+            - name: metrics
+              containerPort: {{ template "core.registry.metricsPort" . }}
+              protocol: TCP
+          volumeMounts:
+            - name: config
+              mountPath: {{ .Values.registry.configPath }}
+              subPath: config.yaml
+            - name: data-volume
+              mountPath: /var/lib/registry
+          {{- with .Values.registry.extraEnv }}
+          env:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+      volumes:
+        - name: config
+          configMap:
+            name: {{ template "core.registry" . }}
+        - name: data-volume
+        {{- if not .Values.persistence.enabled }}
+          emptyDir: {}
+        {{- else if $registry.existingClaim }}
+          persistentVolumeClaim:
+            claimName: {{ $registry.existingClaim }}
+        {{- else }}
+          persistentVolumeClaim:
+            claimName: {{ template "core.registryDataVolume" . }}
+        {{- end }}
+        {{- with .Values.registry.extraVolumes }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- with .Values.registry.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.registry.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.registry.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
\ No newline at end of file
diff --git a/charts/core/templates/registry/hpa.yaml b/charts/core/templates/registry/hpa.yaml
new file mode 100644
index 00000000..f286411a
--- /dev/null
+++ b/charts/core/templates/registry/hpa.yaml
@@ -0,0 +1,36 @@
+{{- if .Values.registry.autoscaling.enabled }}
+# Autoscaler for the registry Deployment; a metric entry is rendered only for
+# the targets that are set under .Values.registry.autoscaling.
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ template "core.registry" . }}
+  labels:
+    {{- include "core.labels" . | nindent 4 }}
+    app.kubernetes.io/component: registry
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ template "core.registry" . }}
+  minReplicas: {{ .Values.registry.autoscaling.minReplicas }}
+  maxReplicas: {{ .Values.registry.autoscaling.maxReplicas }}
+  metrics:
+{{- with .Values.registry.autoscaling.targetCPUUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: {{ . }}
+{{- end }}
+{{- with .Values.registry.autoscaling.targetAverageMemoryUtilization }}
+    # rendered as an AverageValue target, despite "Utilization" in the values key
+    - type: Resource
+      resource:
+        name: memory
+        target:
+          type: AverageValue
+          averageValue: {{ . }}
+{{- end }}
+{{- end }}
diff --git a/charts/core/templates/registry/service.yaml b/charts/core/templates/registry/service.yaml
new file mode 100644
index 00000000..32a4b164
--- /dev/null
+++ b/charts/core/templates/registry/service.yaml
@@ -0,0 +1,22 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ template "core.registry" . }}
+  labels:
+    {{- include "core.labels" . | nindent 4 }}
+    app.kubernetes.io/component: registry
+  {{- with .Values.registry.serviceAnnotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  ports:
+    - name: {{ ternary "https" "http" .Values.internalTLS.enabled }}
+      port: {{ template "core.registry.port" . }}
+      targetPort: {{ template "core.registry.port" . }}
+    - name: metrics
+      port: {{ template "core.registry.metricsPort" . }}
+      targetPort: {{ template "core.registry.metricsPort" . }}
+  selector:
+    {{- include "core.matchLabels" . | nindent 4 }}
+    app.kubernetes.io/component: registry
diff --git a/charts/core/values.yaml b/charts/core/values.yaml
index f725dd94..e2df1a59 100644
--- a/charts/core/values.yaml
+++ b/charts/core/values.yaml
@@ -245,6 +245,13 @@ persistence:
       accessMode: ReadWriteOnce
       size: 128Mi
       annotations: {}
+    registry:
+      existingClaim: ""
+      storageClass: ""
+      subPath: ""
+      accessMode: ReadWriteOnce
+      size: 250Gi
+      annotations: {}
 # -- The usage collector configuration
 usage:
   usageidentifieruid:
@@ -1080,7 +1087,7 @@ etcd:
 # -- The configuration of Redis
 redis:
   # -- If external Redis is used, set "enabled" to false
-  # and fill the connection informations in "external" section
+  # and fill the connection information in "external" section
   enabled: true
   # -- The image of redis
   image:
@@ -1109,7 +1116,7 @@ redis:
 # -- The configuration of Elasticsearch
 elasticsearch:
   # -- If external Elasticsearch is used, set "enabled" to false
-  # and fill the connection informations in "external" section
+  # and fill the connection information in "external" section
   enabled: true
   clusterName: core-elasticsearch
   replicas: 1
@@ -1177,7 +1184,6 @@ jaeger:
         args:
           - >
             while [[ "$(curl -s -o /dev/null -w ''%{http_code}'' http://core-elasticsearch-master:9200)" != "200" ]]; do echo waiting for elasticsearch; sleep 1; done
-
 # -- The configuration of OpenTelemetry Collector
 opentelemetry-collector:
   mode: deployment
@@ -1379,6 +1385,97 @@ openfga:
     spec:
       minAvailable:
       maxUnavailable:
+# -- The configuration of registry
+registry:
+  image:
+    repository: registry
+    tag: 2.8.3
+    pullPolicy: IfNotPresent
+  configPath: /etc/docker/registry/config.yml
+  serviceAccountName: ""
+  automountServiceAccountToken: false
+  replicaCount: 1
+  extraEnv: []
+  extraVolumes: []
+  extraVolumeMounts: []
+  extraInitContainers: []
+  sidecarContainers: {}
+  podAnnotations: {}
+  serviceAnnotations: {}
+  resources: {}
+  nodeSelector: {}
+  tolerations: []
+  affinity: {}
+  autoscaling:
+    enabled: false
+    minReplicas:
+    maxReplicas:
+    targetCPUUtilizationPercentage:
+    targetAverageMemoryUtilization:
+  podDisruptionBudget:
+    enabled: false
+    spec:
+      minAvailable:
+      maxUnavailable:
+  garbageCollect:
+    enabled: false
+    deleteUntagged: true
+    schedule: "@midnight"
+  config:
+    version: 0.1
+    log:
+      level: Info
+      formatter: text
+      fields:
+        service: registry
+        environment: k8s:ce
+    storage:
+      filesystem:
+        rootdirectory: /var/lib/registry
+        maxthreads: 100
+      delete:
+        enabled: true
+      redirect:
+        disable: false
+      cache:
+        blobdescriptor: redis
+        blobdescriptorsize: 10000
+      maintenance:
+        uploadpurging:
+          enabled: true
+          age: 168h
+          interval: 24h
+          dryrun: false
+        readonly:
+          enabled: false
+    http:
+      addr: 0.0.0.0:5000
+      prefix: /
+      secret: i-love-instill-ai
+      relativeurls: false
+      draintimeout: 60s
+      debug:
+        addr: 0.0.0.0:5001
+        prometheus:
+          enabled: true
+          path: /metrics
+      headers:
+        X-Content-Type-Options: [nosniff]
+      http2:
+        disabled: false
+      h2c:
+        enabled: false
+    redis:
+      addr: core-redis:6379
+      dialtimeout: 100ms
+      readtimeout: 100ms
+      writetimeout: 100ms
+      pool:
+        maxidle: 16
+        maxactive: 128
+        idletimeout: 300s
+      tls:
+        enabled: false
 tags:
   observability: true
   prometheusStack: false
diff --git a/configs/registry/config.yaml b/configs/registry/config.yaml
new file mode 100644
index 00000000..d00822ee
--- /dev/null
+++ b/configs/registry/config.yaml
@@ -0,0 +1,73 @@
+version: 0.1
+log:
+  level: debug
+  formatter: text
+  fields:
+    service: registry
+    environment: local:ce
+storage:
+  filesystem:
+    rootdirectory: /var/lib/registry
+    maxthreads: 100
+  delete:
+    enabled: true
+  redirect:
+    disable: false
+  cache:
+    blobdescriptor: redis
+    blobdescriptorsize: 10000
+  maintenance:
+    uploadpurging:
+      enabled: false
+      age: 168h
+      interval: 24h
+      dryrun: false
+    readonly:
+      enabled: false
+http:
+  addr: 0.0.0.0:5000
+  prefix: /
+  secret: asecretforlocaldevelopment
+  relativeurls: false
+  draintimeout: 60s
+  debug:
+    addr: 0.0.0.0:5001
+    prometheus:
+      enabled: true
+      path: /metrics
+  headers:
+    X-Content-Type-Options: [nosniff]
+  http2:
+    disabled: false
+  h2c:
+    enabled: false
+redis:
+  addr: redis:6379
+  dialtimeout: 100ms
+  readtimeout: 100ms
+  writetimeout: 100ms
+  pool:
+    maxidle: 16
+    maxactive: 128
+    idletimeout: 300s
+  tls:
+    enabled: false
+health:
+  storagedriver:
+    enabled: true
+    interval: 10s
+    threshold: 3
+  http:
+    #  Check Instill Artifact
+    # - uri: http://server.to.check/must/return/200
+    #   headers:
+    #     Authorization: [Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ==]
+    #   statuscode: 200
+    #   timeout: 3s
+    #   interval: 10s
+    #   threshold: 3
+  tcp:
+    - addr: redis:6379
+      timeout: 3s
+      interval: 30s
+      threshold: 3
diff --git a/configs/registry/registries.conf b/configs/registry/registries.conf
new file mode 100644
index 00000000..7478fbd1
--- /dev/null
+++ b/configs/registry/registries.conf
@@ -0,0 +1,82 @@
+# For more information on this configuration file, see containers-registries.conf(5).
+#
+# NOTE: RISK OF USING UNQUALIFIED IMAGE NAMES
+# We recommend always using fully qualified image names including the registry
+# server (full dns name), namespace, image name, and tag
+# (e.g., registry.redhat.io/ubi8/ubi:latest). Pulling by digest (i.e.,
+# quay.io/repository/name@digest) further eliminates the ambiguity of tags.
+# When using short names, there is always an inherent risk that the image being
+# pulled could be spoofed. For example, a user wants to pull an image named
+# `foobar` from a registry and expects it to come from myregistry.com. If
+# myregistry.com is not first in the search list, an attacker could place a
+# different `foobar` image at a registry earlier in the search list. The user
+# would accidentally pull and run the attacker's image and code rather than the
+# intended content. We recommend only adding registries which are completely
+# trusted (i.e., registries which don't allow unknown or anonymous users to
+# create accounts with arbitrary names). This will prevent an image from being
+# spoofed, squatted or otherwise made insecure.  If it is necessary to use one
+# of these registries, it should be added at the end of the list.
+#
+# # An array of host[:port] registries to try when pulling an unqualified image, in order.
+unqualified-search-registries = ["registry:5000", "docker.io", "quay.io"]
+
+[[registry]]
+location = "registry:5000"
+insecure = true
+
+#
+# [[registry]]
+# # The "prefix" field is used to choose the relevant [[registry]] TOML table;
+# # (only) the TOML table with the longest match for the input image name
+# # (taking into account namespace/repo/tag/digest separators) is used.
+# #
+# # The prefix can also be of the form: *.example.com for wildcard subdomain
+# # matching.
+# #
+# # If the prefix field is missing, it defaults to be the same as the "location" field.
+# prefix = "example.com/foo"
+#
+# # If true, unencrypted HTTP as well as TLS connections with untrusted
+# # certificates are allowed.
+# insecure = false
+#
+# # If true, pulling images with matching names is forbidden.
+# blocked = false
+#
+# # The physical location of the "prefix"-rooted namespace.
+# #
+# # By default, this is equal to "prefix" (in which case "prefix" can be omitted
+# # and the [[registry]] TOML table can only specify "location").
+# #
+# # Example: Given
+# #   prefix = "example.com/foo"
+# #   location = "internal-registry-for-example.net/bar"
+# # requests for the image example.com/foo/myimage:latest will actually work with the
+# # internal-registry-for-example.net/bar/myimage:latest image.
+#
+# # The location can be empty iff prefix is in a
+# # wildcarded format: "*.example.com". In this case, the input reference will
+# # be used as-is without any rewrite.
+# location = "internal-registry-for-example.net/bar"
+#
+# # (Possibly-partial) mirrors for the "prefix"-rooted namespace.
+# #
+# # The mirrors are attempted in the specified order; the first one that can be
+# # contacted and contains the image will be used (and if none of the mirrors contains the image,
+# # the primary location specified by the "registry.location" field, or using the unmodified
+# # user-specified reference, is tried last).
+# #
+# # Each TOML table in the "mirror" array can contain the following fields, with the same semantics
+# # as if specified in the [[registry]] TOML table directly:
+# # - location
+# # - insecure
+# [[registry.mirror]]
+# location = "example-mirror-0.local/mirror-for-foo"
+# [[registry.mirror]]
+# location = "example-mirror-1.local/mirrors/foo"
+# insecure = true
+# # Given the above, a pull of example.com/foo/image:latest will try:
+# # 1. example-mirror-0.local/mirror-for-foo/image:latest
+# # 2. example-mirror-1.local/mirrors/foo/image:latest
+# # 3. internal-registry-for-example.net/bar/image:latest
+# # in order, and use the first one that exists.
\ No newline at end of file
diff --git a/docker-compose-latest.yml b/docker-compose-latest.yml
index a7f42a0b..1b24caa8 100644
--- a/docker-compose-latest.yml
+++ b/docker-compose-latest.yml
@@ -167,3 +167,7 @@ services:
   etcd:
     ports:
       - ${ETCD_CLIENT_PORT}:${ETCD_CLIENT_PORT}
+
+  registry:
+    ports:
+      - ${REGISTRY_PORT}:5000
diff --git a/docker-compose.yml b/docker-compose.yml
index f09693a6..c647f171 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -411,6 +411,10 @@ services:
   ray_server:
     container_name: ${RAY_SERVER_HOST}
     image: ${RAY_SERVER_IMAGE}:${RAY_RELEASE_TAG}
+    # for dind podman
+    privileged: true
+    devices:
+      - "/dev/fuse"
     restart: unless-stopped
     environment:
       - RAY_ADDRESS=0.0.0.0:6379
@@ -418,14 +422,20 @@ services:
       - RAY_GRAFANA_HOST=http://${GRAFANA_HOST}:${GRAFANA_PORT}
       - RAY_PROMETHEUS_HOST=http://${PROMETHEUS_HOST}:${PROMETHEUS_PORT}
       - RAY_GRAFANA_IFRAME_HOST=http://localhost:${GRAFANA_PORT}
+      - RAY_worker_register_timeout_seconds=360
     entrypoint: ["/bin/bash", "-c"]
     command: |
-      'ray start --head --node-ip-address=0.0.0.0 --dashboard-host=0.0.0.0 --metrics-export-port ${RAY_SERVER_PROMETHEUS_PORT} --disable-usage-stats &&
+      'if [[ $${NVIDIA_VISIBLE_DEVICES} == all ]]; then
+        sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml;
+      fi &&
+      ray start --head --node-ip-address=0.0.0.0 --dashboard-host=0.0.0.0 --metrics-export-port ${RAY_SERVER_PROMETHEUS_PORT} --disable-usage-stats &&
       serve start --http-host=0.0.0.0 --grpc-port ${RAY_SERVER_SERVE_GRPC_PORT} --grpc-servicer-functions ray_pb2_grpc.add_RayServiceServicer_to_server &&
       tail -f /dev/null'
     volumes:
       - model_repository:/model-repository
       - ray_conda:/home/ray/anaconda3/
+      - /var/lib/containers:/var/lib/containers
+      - ${CONFIG_DIR_PATH}/registry/registries.conf:/etc/containers/registries.conf
     healthcheck:
       test: ["CMD", "ray", "status"]
       start_period: 20s
@@ -576,3 +586,13 @@ services:
       - type: bind
         source: /var/run/docker.sock
         target: /var/run/docker.sock
+
+  registry:
+    container_name: ${REGISTRY_HOST}
+    image: ${REGISTRY_IMAGE}:${REGISTRY_VERSION}
+    restart: unless-stopped
+    volumes:
+      - ${CONFIG_DIR_PATH}/registry/config.yaml:/etc/docker/registry/config.yml
+    depends_on:
+      redis:
+        condition: service_healthy