From 9390c55e37db496710e8ce5f36f95f5d768cafcc Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 27 May 2024 12:18:51 +0000
Subject: [PATCH 01/11] wip vertex ai

---
 README.md                                     | 29 +++++++++-
 dockerfiles/inference-endpoints/Dockerfile    | 48 ++++++++++++++++
 requirements.txt                              |  0
 scripts/entrypoint.sh                         | 18 ++++--
 src/huggingface_inference_toolkit/handler.py  | 40 ++++++++++++-
 .../vertex_ai_utils.py                        | 46 +++++++++++++++
 .../webservice_robyn.py                       | 57 -------------------
 .../webservice_starlette.py                   | 49 ++++++++++++----
 8 files changed, 211 insertions(+), 76 deletions(-)
 create mode 100644 dockerfiles/inference-endpoints/Dockerfile
 delete mode 100644 requirements.txt
 create mode 100644 src/huggingface_inference_toolkit/vertex_ai_utils.py
 delete mode 100644 src/huggingface_inference_toolkit/webservice_robyn.py

diff --git a/README.md b/README.md
index f2f66b40..7137f225 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ HF_MODEL_ID=hf-internal-testing/tiny-random-distilbert HF_MODEL_DIR=tmp2 HF_TASK
 ### Container
 
 
-1. build the preferred container for either CPU or GPU for PyTorch or TensorFlow.
+1. build the preferred container for either CPU or GPU for PyTorch.
 
 _cpu images_
 ```bash
@@ -58,6 +58,32 @@ curl --request POST \
 }'
 ```
 
+### Vertex AI Support
+
+The Hugging Face Inference Toolkit is also supported on Vertex AI, based on [Custom container requirements for prediction](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements). [Environment variables set by Vertex AI](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#aip-variables) are automatically detected and used by the toolkit. 
+
+#### Local run with HF_MODEL_ID and HF_TASK
+
+Start Hugging Face Inference Toolkit with the following environment variables. 
+
+```bash
+mkdir tmp2/
+AIP_MODE=PREDICTION AIP_PORT=8080 AIP_PREDICT_ROUTE=/pred AIP_HEALTH_ROUTE=/h HF_MODEL_DIR=tmp2 HF_MODEL_ID=distilbert/distilbert-base-uncased-finetuned-sst-2-english HF_TASK=text-classification uvicorn src.huggingface_inference_toolkit.webservice_starlette:app  --port 8080
+```
+
+Send request. The API schema is the same as from the [inference API](https://huggingface.co/docs/api-inference/detailed_parameters)
+
+```bash
+curl --request POST \
+  --url http://localhost:8080/pred \
+  --header 'Content-Type: application/json' \
+  --data '{
+	"instances": ["I love this product", "I hate this product"],
+	"parameters": { "top_k": 2 }
+}'
+```
+
+
 
 ---
 
@@ -176,6 +202,7 @@ Below you ll find a list of supported and tested transformers and sentence trans
 ##  ⚙ Supported Frontend
 
 - [x] Starlette (HF Endpoints)
+- [ ] Starlette (Vertex AI)
 - [ ] Starlette (Azure ML)
 - [ ] Starlette (SageMaker)
 
diff --git a/dockerfiles/inference-endpoints/Dockerfile b/dockerfiles/inference-endpoints/Dockerfile
new file mode 100644
index 00000000..8e4c4d35
--- /dev/null
+++ b/dockerfiles/inference-endpoints/Dockerfile
@@ -0,0 +1,48 @@
+ARG BASE_IMAGE=nvidia/cuda:12.1.0-devel-ubuntu22.04
+
+FROM $BASE_IMAGE
+SHELL ["/bin/bash", "-c"]
+
+LABEL maintainer="Hugging Face"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install software-properties-common -y && \
+    add-apt-repository ppa:deadsnakes/ppa && \
+    apt-get -y upgrade --only-upgrade systemd openssl cryptsetup && \
+    apt-get install -y \
+        build-essential \
+        bzip2 \
+        curl \
+        git \
+        git-lfs \
+        tar \
+        gcc \
+        g++ \
+        cmake \
+        libprotobuf-dev \
+        protobuf-compiler \
+        python3-dev \
+        python3-pip \
+        python3.11 \
+        libsndfile1-dev \
+        ffmpeg \
+    && apt-get clean autoremove --yes \
+    && rm -rf /var/lib/{apt,dpkg,cache,log}
+# Copying only necessary files as filtered by .dockerignore
+COPY . .
+
+# install wheel and setuptools
+RUN pip install --no-cache-dir -U pip ".[torch, st, diffusers]"
+
+# copy application
+COPY src/huggingface_inference_toolkit huggingface_inference_toolkit
+COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starlette.py
+
+# copy entrypoint and change permissions
+COPY --chmod=0755  scripts/entrypoint.sh entrypoint.sh
+
+ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index e69de29b..00000000
diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
index 8544a63c..3e62536b 100644
--- a/scripts/entrypoint.sh
+++ b/scripts/entrypoint.sh
@@ -1,13 +1,21 @@
-# /bin/bash
+#!/bin/bash
 
-# check if HF_MODEL_DIR is set and if not skip installing custom dependencies
+# Define the default port
+PORT=5000
+
+# Check if AIP_MODE is set and use the Vertex AI port (AIP_HTTP_PORT, falling back to its documented default 8080)
+if [[ ! -z "${AIP_MODE}" ]]; then
+  PORT=${AIP_HTTP_PORT:-8080}
+fi
+
+# Check if HF_MODEL_DIR is set and if not skip installing custom dependencies
 if [[ ! -z "${HF_MODEL_DIR}" ]]; then
-  # check if requirements.txt exists and if so install dependencies
+  # Check if requirements.txt exists and if so install dependencies
   if [ -f "${HF_MODEL_DIR}/requirements.txt" ]; then
     echo "Installing custom dependencies from ${HF_MODEL_DIR}/requirements.txt"
     pip install -r ${HF_MODEL_DIR}/requirements.txt --no-cache-dir;
   fi
 fi
 
-# start the server
-uvicorn webservice_starlette:app --host 0.0.0.0 --port 5000
\ No newline at end of file
+# Start the server
+uvicorn webservice_starlette:app --host 0.0.0.0 --port ${PORT}
diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index 08368326..c8bd4952 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from pathlib import Path
 from typing import Optional, Union
 
@@ -40,15 +41,52 @@ def __call__(self, data):
         return prediction
 
 
+class VertexAIHandler(HuggingFaceHandler):
+    """
+    A Default Vertex AI Hugging Face Inference Handler which abstracts the
+    Vertex AI specific logic for inference.
+    """
+    def __init__(self, model_dir: Union[str, Path], task=None, framework="pt"):
+        super().__init__(model_dir, task, framework)
+    
+    def __call__(self, data):
+        """
+        Handles an inference request with input data and makes a prediction.
+        Args:
+            :data: (obj): the raw request body data.
+        :return: prediction output
+        """
+        if "instances" not in data:
+            raise ValueError("The request body must contain a key 'instances' with a list of instances.")
+        parameters = data.pop("parameters", None)
+        
+        predictions = []
+        # iterate over all instances and make predictions
+        for inputs in data["instances"]:
+            payload = {"inputs": inputs, "parameters": parameters}
+            predictions.append(super().__call__(payload))
+        
+        # reutrn predictions
+        return {"predictions": predictions}
+
 def get_inference_handler_either_custom_or_default_handler(
     model_dir: Path,
     task: Optional[str] = None
 ):
     """
-    get inference handler either custom or default Handler
+    Returns the appropriate inference handler based on the given model directory and task.
+    
+    Args:
+        model_dir (Path): The directory path where the model is stored.
+        task (Optional[str]): The task for which the inference handler is required. Defaults to None.
+    
+    Returns:
+        InferenceHandler: The appropriate inference handler based on the given model directory and task.
     """
     custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
     if custom_pipeline:
         return custom_pipeline
+    elif os.environ.get("AIP_MODE", None) == "PREDICTION": 
+        return VertexAIHandler(model_dir=model_dir, task=task)
     else:
         return HuggingFaceHandler(model_dir=model_dir, task=task)
diff --git a/src/huggingface_inference_toolkit/vertex_ai_utils.py b/src/huggingface_inference_toolkit/vertex_ai_utils.py
new file mode 100644
index 00000000..cf2bedad
--- /dev/null
+++ b/src/huggingface_inference_toolkit/vertex_ai_utils.py
@@ -0,0 +1,46 @@
+import logging
+from pathlib import Path
+import re
+from typing import Union
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
+
+from google.cloud import storage
+
+_logger = logging.getLogger(__name__)
+
+
+GCS_URI_PREFIX = "gs://"
+
+
+# copied from https://github.com/googleapis/python-aiplatform/blob/94d838d8cfe1599bc2d706e66080c05108821986/google/cloud/aiplatform/utils/prediction_utils.py#L121
+def _load_repository_from_gcs(artifact_uri: str, target_dir: Union[str, Path]="/tmp"):
+    """
+    Load files from GCS path to target_dir
+    """
+    _logger.info(f"Loading model artifacts from {artifact_uri} to {target_dir}")
+    target_dir = Path(target_dir)
+    
+    if artifact_uri.startswith(GCS_URI_PREFIX):
+        matches = re.match(f"{GCS_URI_PREFIX}(.*?)/(.*)", artifact_uri)
+        bucket_name, prefix = matches.groups()
+
+        gcs_client = storage.Client()
+        blobs = gcs_client.list_blobs(bucket_name, prefix=prefix)
+        for blob in blobs:
+            name_without_prefix = blob.name[len(prefix) :]
+            name_without_prefix = (
+                name_without_prefix[1:]
+                if name_without_prefix.startswith("/")
+                else name_without_prefix
+            )
+            file_split = name_without_prefix.split("/")
+            directory = target_dir.joinpath(*file_split[0:-1])  # Path has no .join(); mirror the blob's sub-directories
+            directory.mkdir(parents=True, exist_ok=True)
+            if name_without_prefix and not name_without_prefix.endswith("/"):
+                blob.download_to_filename(str(target_dir / name_without_prefix))  # write inside target_dir, not the CWD
+
+    return str(target_dir.absolute())
+
diff --git a/src/huggingface_inference_toolkit/webservice_robyn.py b/src/huggingface_inference_toolkit/webservice_robyn.py
deleted file mode 100644
index 5aeaf605..00000000
--- a/src/huggingface_inference_toolkit/webservice_robyn.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import logging
-import os
-
-from robyn import Robyn
-
-from huggingface_inference_toolkit.serialization.base import ContentType
-from huggingface_inference_toolkit.serialization.json_utils import Jsoner
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
-
-
-app = Robyn(__file__)
-
-HF_MODEL_DIR = os.environ.get("HF_MODEL_DIR", "/opt/huggingface/model")
-HF_TASK = os.environ.get("HF_TASK", None)
-
-# @app.startup_handler
-# async def startup_event():
-# global inference_handler
-
-# if empty_directory_or_not_hf_remote_id is None or task is None:
-#     raise ValueError(
-#         f"""Can't initialize model.
-#             Please set correct model id and task.
-#             Provided values are model_id:
-#             {model_id_or_path} and task:{task}"""
-#     )
-
-# logger.info(f"Initializing model with model_id:{model_id_or_path} and task:{task}")
-# # create inference handler
-# inference_handler = HuggingFaceHandler(HF_MODEL_ID)
-# logger.info(f"Model initialized successfully on device: {inference_handler.model.device}")
-# return inference_handler
-
-
-@app.get("/health")
-async def health():
-    return "OK"
-
-
-@app.post("/predict")
-async def predict(request):
-    try:
-        logger.info(request)
-        content_type = request.headers.get("Content-Type", None)
-        body = ContentType.get_deserializer(content_type).deserialize(request["body"])
-        logger.info(body)
-
-        # pred = inference_handler(body["inputs"])
-        return Jsoner.serialize(body)
-    except Exception as e:
-        logger.error(e)
-        return Jsoner.serialize({"error": str(e)})
-
-
-app.start(port=5000)
diff --git a/src/huggingface_inference_toolkit/webservice_starlette.py b/src/huggingface_inference_toolkit/webservice_starlette.py
index 8bc68b2e..b749237b 100644
--- a/src/huggingface_inference_toolkit/webservice_starlette.py
+++ b/src/huggingface_inference_toolkit/webservice_starlette.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from pathlib import Path
 from time import perf_counter
 
@@ -20,6 +21,7 @@
 from huggingface_inference_toolkit.serialization.base import ContentType
 from huggingface_inference_toolkit.serialization.json_utils import Jsoner
 from huggingface_inference_toolkit.utils import _load_repository_from_hf, convert_params_to_int_or_bool
+from huggingface_inference_toolkit.vertex_ai_utils import _load_repository_from_gcs
 
 
 def config_logging(level=logging.INFO):
@@ -35,10 +37,11 @@ def config_logging(level=logging.INFO):
 logger = logging.getLogger(__name__)
 
 
-async def some_startup_task():
+async def prepare_model_artifacts():
     global inference_handler
     # 1. check if model artifacts available in HF_MODEL_DIR
     if len(list(Path(HF_MODEL_DIR).glob("**/*"))) <= 0:
+        # 2. if not available, try to load from HF_MODEL_ID
         if HF_MODEL_ID is not None:
             _load_repository_from_hf(
                 repository_id=HF_MODEL_ID,
@@ -47,6 +50,11 @@ async def some_startup_task():
                 revision=HF_REVISION,
                 hf_hub_token=HF_HUB_TOKEN,
             )
+        # 3. check if in Vertex AI environment and load from GCS
+        # If artifactUri not on Model Creation not set returns an empty string
+        elif len(os.environ.get("AIP_STORAGE_URI", '')) > 0: 
+            _load_repository_from_gcs(os.environ["AIP_STORAGE_URI"], target_dir=HF_MODEL_DIR)
+        # 4. if not available, raise error
         else:
             raise ValueError(
                 f"""Can't initialize model.
@@ -72,7 +80,7 @@ async def predict(request):
         # try to deserialize payload
         deserialized_body = ContentType.get_deserializer(content_type).deserialize(await request.body())
         # checks if input schema is correct
-        if "inputs" not in deserialized_body:
+        if "inputs" not in deserialized_body and "instances" not in deserialized_body:
             raise ValueError(f"Body needs to provide a inputs key, recieved: {orjson.dumps(deserialized_body)}")
 
         # check for query parameter and add them to the body
@@ -97,14 +105,31 @@ async def predict(request):
         logger.error(e)
         return Response(Jsoner.serialize({"error": str(e)}), status_code=400, media_type="application/json")
 
-
-app = Starlette(
-    debug=True,
-    routes=[
-        Route("/", health, methods=["GET"]),
-        Route("/health", health, methods=["GET"]),
-        Route("/", predict, methods=["POST"]),
-        Route("/predict", predict, methods=["POST"]),
-    ],
-    on_startup=[some_startup_task],
+# Create app based on which cloud environment is used
+if os.getenv("AIP_MODE", None) == "PREDICTION":
+    logger.info("Running in Vertex AI environment")
+    # extract routes from environment variables
+    _predict_route = os.getenv("AIP_PREDICT_ROUTE", None)
+    _health_route = os.getenv("AIP_HEALTH_ROUTE", None)
+    if _predict_route is None or _health_route is None:
+        raise ValueError("AIP_PREDICT_ROUTE and AIP_HEALTH_ROUTE need to be set in Vertex AI environment")    
+    
+    app = Starlette(
+        debug=False,
+        routes=[
+            Route(_health_route, health, methods=["GET"]),
+            Route(_predict_route, predict, methods=["POST"]),
+        ],
+        on_startup=[prepare_model_artifacts],
+    )    
+else:
+    app = Starlette(
+        debug=False,
+        routes=[
+            Route("/", health, methods=["GET"]),
+            Route("/health", health, methods=["GET"]),
+            Route("/", predict, methods=["POST"]),
+            Route("/predict", predict, methods=["POST"]),
+        ],
+        on_startup=[prepare_model_artifacts],
 )

From 0b91c65bf8aeca73d49ad587d269cbf51ca8c4bf Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 27 May 2024 13:25:21 +0000
Subject: [PATCH 02/11] updated versions

---
 README.md                                   | 27 +++++++++-
 dockerfiles/inference-endpoints/Dockerfile  | 48 -----------------
 dockerfiles/pytorch/Dockerfile              | 10 +++-
 dockerfiles/tensorflow/cpu/Dockerfile       | 53 ------------------
 dockerfiles/tensorflow/cpu/environment.yaml |  8 ---
 dockerfiles/tensorflow/gpu/Dockerfile       | 59 ---------------------
 dockerfiles/tensorflow/gpu/environment.yaml |  9 ----
 makefile                                    |  6 +++
 setup.py                                    | 17 ++----
 9 files changed, 45 insertions(+), 192 deletions(-)
 delete mode 100644 dockerfiles/inference-endpoints/Dockerfile
 delete mode 100644 dockerfiles/tensorflow/cpu/Dockerfile
 delete mode 100644 dockerfiles/tensorflow/cpu/environment.yaml
 delete mode 100644 dockerfiles/tensorflow/gpu/Dockerfile
 delete mode 100644 dockerfiles/tensorflow/gpu/environment.yaml

diff --git a/README.md b/README.md
index 7137f225..62f49723 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ mkdir tmp2/
 AIP_MODE=PREDICTION AIP_PORT=8080 AIP_PREDICT_ROUTE=/pred AIP_HEALTH_ROUTE=/h HF_MODEL_DIR=tmp2 HF_MODEL_ID=distilbert/distilbert-base-uncased-finetuned-sst-2-english HF_TASK=text-classification uvicorn src.huggingface_inference_toolkit.webservice_starlette:app  --port 8080
 ```
 
-Send request. The API schema is the same as from the [inference API](https://huggingface.co/docs/api-inference/detailed_parameters)
+Send request
 
 ```bash
 curl --request POST \
@@ -83,6 +83,31 @@ curl --request POST \
 }'
 ```
 
+#### Container run with HF_MODEL_ID and HF_TASK
+
+1. build the preferred container for either CPU or GPU for PyTorch.
+
+```bash
+docker build -t vertex -f dockerfiles/pytorch/Dockerfile -t vertex-test-pytorch:gpu .
+```
+
+2. Run the container and provide either environment variables to the HUB model you want to use or mount a volume to the container, where your model is stored.
+
+```bash
+docker run -ti -p 8080:8080 -e AIP_MODE=PREDICTION -e AIP_HTTP_PORT=8080 -e AIP_PREDICT_ROUTE=/pred -e AIP_HEALTH_ROUTE=/h -e HF_MODEL_ID=distilbert/distilbert-base-uncased-finetuned-sst-2-english -e HF_TASK=text-classification vertex-test-pytorch:gpu
+```
+
+1. Send request
+
+```bash
+curl --request POST \
+	--url http://localhost:8080/pred \
+	--header 'Content-Type: application/json' \
+	--data '{
+	"instances": ["I love this product", "I hate this product"],
+	"parameters": { "top_k": 2 }
+}'
+```
 
 
 ---
diff --git a/dockerfiles/inference-endpoints/Dockerfile b/dockerfiles/inference-endpoints/Dockerfile
deleted file mode 100644
index 8e4c4d35..00000000
--- a/dockerfiles/inference-endpoints/Dockerfile
+++ /dev/null
@@ -1,48 +0,0 @@
-ARG BASE_IMAGE=nvidia/cuda:12.1.0-devel-ubuntu22.04
-
-FROM $BASE_IMAGE
-SHELL ["/bin/bash", "-c"]
-
-LABEL maintainer="Hugging Face"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-WORKDIR /app
-
-RUN apt-get update && \
-    apt-get install software-properties-common -y && \
-    add-apt-repository ppa:deadsnakes/ppa && \
-    apt-get -y upgrade --only-upgrade systemd openssl cryptsetup && \
-    apt-get install -y \
-        build-essential \
-        bzip2 \
-        curl \
-        git \
-        git-lfs \
-        tar \
-        gcc \
-        g++ \
-        cmake \
-        libprotobuf-dev \
-        protobuf-compiler \
-        python3-dev \
-        python3-pip \
-        python3.11 \
-        libsndfile1-dev \
-        ffmpeg \
-    && apt-get clean autoremove --yes \
-    && rm -rf /var/lib/{apt,dpkg,cache,log}
-# Copying only necessary files as filtered by .dockerignore
-COPY . .
-
-# install wheel and setuptools
-RUN pip install --no-cache-dir -U pip ".[torch, st, diffusers]"
-
-# copy application
-COPY src/huggingface_inference_toolkit huggingface_inference_toolkit
-COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starlette.py
-
-# copy entrypoint and change permissions
-COPY --chmod=0755  scripts/entrypoint.sh entrypoint.sh
-
-ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
\ No newline at end of file
diff --git a/dockerfiles/pytorch/Dockerfile b/dockerfiles/pytorch/Dockerfile
index 8e4c4d35..c554ce59 100644
--- a/dockerfiles/pytorch/Dockerfile
+++ b/dockerfiles/pytorch/Dockerfile
@@ -1,6 +1,6 @@
 ARG BASE_IMAGE=nvidia/cuda:12.1.0-devel-ubuntu22.04
 
-FROM $BASE_IMAGE
+FROM $BASE_IMAGE as base 
 SHELL ["/bin/bash", "-c"]
 
 LABEL maintainer="Hugging Face"
@@ -45,4 +45,10 @@ COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starle
 # copy entrypoint and change permissions
 COPY --chmod=0755  scripts/entrypoint.sh entrypoint.sh
 
-ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
\ No newline at end of file
+ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
+
+
+from base as vertex
+
+# Install the packages required for Vertex AI
+RUN pip install --no-cache-dir google-cloud-storage
diff --git a/dockerfiles/tensorflow/cpu/Dockerfile b/dockerfiles/tensorflow/cpu/Dockerfile
deleted file mode 100644
index d16010bb..00000000
--- a/dockerfiles/tensorflow/cpu/Dockerfile
+++ /dev/null
@@ -1,53 +0,0 @@
-FROM ubuntu:22.04
-
-LABEL maintainer="Hugging Face"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get update \
-    && apt-get -y upgrade --only-upgrade systemd openssl cryptsetup \
-    && apt-get install -y \
-    bzip2 \
-    curl \
-    git \
-    git-lfs \
-    tar \
-    gcc \
-    g++ \
-    cmake \
-    # audio
-    libsndfile1-dev \
-    ffmpeg \
-    && apt-get clean autoremove --yes \
-    && rm -rf /var/lib/{apt,dpkg,cache,log}
-
-# install micromamba
-ENV MAMBA_ROOT_PREFIX=/opt/conda
-ENV PATH=/opt/conda/bin:$PATH
-RUN curl -L https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xj "bin/micromamba" \
-    && touch /root/.bashrc \
-    && ./bin/micromamba shell init -s bash -p /opt/conda  \
-    && grep -v '[ -z "\$PS1" ] && return' /root/.bashrc  > /opt/conda/bashrc
-
-WORKDIR /app
-
-# install base python dependencies
-COPY dockerfiles/tensorflow/cpu/environment.yaml /app/environment.yaml
-RUN micromamba install -y -n base -f environment.yaml \
-    && rm environment.yaml \
-    && micromamba clean --all --yes
-
-# install huggingface inference toolkit
-COPY requirements.txt /tmp/requirements.txt
-RUN pip install  --no-cache-dir -r /tmp/requirements.txt && rm /tmp/requirements.txt
-
-# copy application
-COPY src/huggingface_inference_toolkit huggingface_inference_toolkit
-COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starlette.py
-
-# copy entrypoint and change permissions
-COPY scripts/entrypoint.sh entrypoint.sh
-RUN chmod +x entrypoint.sh
-
-# run app
-ENTRYPOINT ["/bin/bash", "entrypoint.sh"]
\ No newline at end of file
diff --git a/dockerfiles/tensorflow/cpu/environment.yaml b/dockerfiles/tensorflow/cpu/environment.yaml
deleted file mode 100644
index a370380c..00000000
--- a/dockerfiles/tensorflow/cpu/environment.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-name: base
-channels:
-- conda-forge
-dependencies:
-- python=3.9.13
-- tensorflow=2.9.1=*cpu*py39*
-- pip:
-  - transformers[sklearn,sentencepiece,audio,vision]==4.27.2
\ No newline at end of file
diff --git a/dockerfiles/tensorflow/gpu/Dockerfile b/dockerfiles/tensorflow/gpu/Dockerfile
deleted file mode 100644
index c42a33c0..00000000
--- a/dockerfiles/tensorflow/gpu/Dockerfile
+++ /dev/null
@@ -1,59 +0,0 @@
-FROM nvidia/cuda:11.2.2-base-ubuntu20.04
-
-LABEL maintainer="Hugging Face"
-
-ENV DEBIAN_FRONTEND=noninteractive
-ENV CONDA_OVERRIDE_CUDA="11.2"
-
-RUN apt-get update \
-    && apt-get -y upgrade --only-upgrade systemd openssl cryptsetup \
-    && apt-get install -y \
-    bzip2 \
-    curl \
-    git \
-    git-lfs \
-    tar \
-    gcc \
-    g++ \
-    cmake \
-    # audio
-    libsndfile1-dev \
-    ffmpeg \
-    && apt-get clean autoremove --yes \
-    && rm -rf /var/lib/{apt,dpkg,cache,log}
-
-# install micromamba
-ENV MAMBA_ROOT_PREFIX=/opt/conda
-ENV PATH=/opt/conda/bin:$PATH
-ENV LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}"
-
-RUN curl -L https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xj "bin/micromamba" \
-    && touch /root/.bashrc \
-    && ./bin/micromamba shell init -s bash -p /opt/conda  \
-    && grep -v '[ -z "\$PS1" ] && return' /root/.bashrc  > /opt/conda/bashrc
-
-WORKDIR /app
-
-# install base python dependencies
-COPY dockerfiles/tensorflow/gpu/environment.yaml /app/environment.yaml
-RUN micromamba install -y -n base -f environment.yaml \
-    && rm environment.yaml \
-    && micromamba clean --all --yes
-
-# install huggingface inference toolkit
-COPY requirements.txt /tmp/requirements.txt
-RUN pip install  --no-cache-dir -r /tmp/requirements.txt && rm /tmp/requirements.txt
-
-# copy tests
-COPY . /tmp/hf-inference-test
-
-# copy application
-COPY src/huggingface_inference_toolkit huggingface_inference_toolkit
-COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starlette.py
-
-# copy entrypoint and change permissions
-COPY scripts/entrypoint.sh entrypoint.sh
-RUN chmod +x entrypoint.sh
-
-# run app
-ENTRYPOINT ["/bin/bash", "entrypoint.sh"]
\ No newline at end of file
diff --git a/dockerfiles/tensorflow/gpu/environment.yaml b/dockerfiles/tensorflow/gpu/environment.yaml
deleted file mode 100644
index 1d886795..00000000
--- a/dockerfiles/tensorflow/gpu/environment.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-name: base
-channels:
-- conda-forge
-dependencies:
-- python=3.9.13
-- nvidia::cudatoolkit=11.7
-- tensorflow=2.9.1=*cuda112*py39*
-- pip:
-  - transformers[sklearn,sentencepiece,audio,vision]==4.27.2
\ No newline at end of file
diff --git a/makefile b/makefile
index a9490428..3502d83e 100644
--- a/makefile
+++ b/makefile
@@ -26,5 +26,11 @@ inference-pytorch-gpu:
 inference-pytorch-cpu:
 	docker build --build-arg="BASE_IMAGE=ubuntu:22.04" -f dockerfiles/pytorch/Dockerfile -t integration-test-pytorch:cpu .
 
+vertex-pytorch-gpu:
+	docker build -t vertex -f dockerfiles/pytorch/Dockerfile -t integration-test-pytorch:gpu .
+
+vertex-pytorch-cpu:
+	docker build  -t vertex --build-arg="BASE_IMAGE=ubuntu:22.04" -f dockerfiles/pytorch/Dockerfile -t integration-test-pytorch:cpu .
+
 stop-all:
 	docker stop $$(docker ps -a -q) && docker container prune --force
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 5e99df02..192cd5c5 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,4 @@
 from __future__ import absolute_import
-from datetime import date
 from setuptools import find_packages, setup
 
 # We don't declare our dependency on transformers here because we build with
@@ -7,18 +6,13 @@
 
 VERSION = "0.3.0"
 
-
 # Ubuntu packages
 # libsndfile1-dev: torchaudio requires the development version of the libsndfile package which can be installed via a system package manager. On Ubuntu it can be installed as follows: apt install libsndfile1-dev
 # ffmpeg: ffmpeg is required for audio processing. On Ubuntu it can be installed as follows: apt install ffmpeg
 # libavcodec-extra : libavcodec-extra  inculdes additional codecs for ffmpeg
 
 install_requires = [
-    "wheel==0.42.0",
-    "setuptools==69.1.0",
-    "cmake==3.28.3",
-    "transformers[sklearn,sentencepiece, audio, vision]==4.38.2",
-    "huggingface_hub==0.20.3",
+    "transformers[sklearn,sentencepiece, audio,vision]==4.41.1",
     "orjson",
     # vision
     "Pillow",
@@ -31,15 +25,14 @@
     "starlette",
     "uvicorn",
     "pandas",
-    "peft==0.9.0"
+    "peft==0.11.1"
 ]
 
 extras = {}
 
-extras["st"] = ["sentence_transformers==2.4.0"]
-extras["diffusers"] = ["diffusers==0.26.3", "accelerate==0.27.2"]
-extras["torch"] = ["torch==2.2.0", "torchvision", "torchaudio"]
-extras["tensorflow"] = ["tensorflow"]
+extras["st"] = ["sentence_transformers==2.7.0"]
+extras["diffusers"] = ["diffusers==0.28.0", "accelerate==0.30.1"]
+extras["torch"] = ["torch==2.3.0", "torchvision", "torchaudio"]
 extras["test"] = [
     "pytest==7.2.1",
     "pytest-xdist",

From e78be25d10d8dd776b977d85b0c7042f0f0be83e Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 27 May 2024 13:26:05 +0000
Subject: [PATCH 03/11] 0.4.0

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 192cd5c5..035d29fc 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 # We don't declare our dependency on transformers here because we build with
 # different packages for different variants
 
-VERSION = "0.3.0"
+VERSION = "0.4.0"
 
 # Ubuntu packages
 # libsndfile1-dev: torchaudio requires the development version of the libsndfile package which can be installed via a system package manager. On Ubuntu it can be installed as follows: apt install libsndfile1-dev

From 0c79f58446e477449d98c67846dc5cd11322aade Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 27 May 2024 13:26:44 +0000
Subject: [PATCH 04/11] x

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 62f49723..6b2182b6 100644
--- a/README.md
+++ b/README.md
@@ -227,7 +227,7 @@ Below you ll find a list of supported and tested transformers and sentence trans
 ##  ⚙ Supported Frontend
 
 - [x] Starlette (HF Endpoints)
-- [ ] Starlette (Vertex AI)
+- [x] Starlette (Vertex AI)
 - [ ] Starlette (Azure ML)
 - [ ] Starlette (SageMaker)
 

From a2317c829f3233df64dc83cb5d82412a86a46e40 Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 27 May 2024 13:35:12 +0000
Subject: [PATCH 05/11] style

---
 README.md                                            |  2 +-
 src/huggingface_inference_toolkit/handler.py         | 12 ++++++------
 src/huggingface_inference_toolkit/vertex_ai_utils.py |  8 ++++----
 .../webservice_starlette.py                          |  8 ++++----
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 6b2182b6..f8196973 100644
--- a/README.md
+++ b/README.md
@@ -97,7 +97,7 @@ docker build -t vertex -f dockerfiles/pytorch/Dockerfile -t vertex-test-pytorch:
 docker run -ti -p 8080:8080 -e AIP_MODE=PREDICTION -e AIP_HTTP_PORT=8080 -e AIP_PREDICT_ROUTE=/pred -e AIP_HEALTH_ROUTE=/h -e HF_MODEL_ID=distilbert/distilbert-base-uncased-finetuned-sst-2-english -e HF_TASK=text-classification vertex-test-pytorch:gpu
 ```
 
-1. Send request
+3. Send request
 
 ```bash
 curl --request POST \
diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index c8bd4952..0a8c93b8 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -48,7 +48,7 @@ class VertexAIHandler(HuggingFaceHandler):
     """
     def __init__(self, model_dir: Union[str, Path], task=None, framework="pt"):
         super().__init__(model_dir, task, framework)
-    
+
     def __call__(self, data):
         """
         Handles an inference request with input data and makes a prediction.
@@ -59,13 +59,13 @@ def __call__(self, data):
         if "instances" not in data:
             raise ValueError("The request body must contain a key 'instances' with a list of instances.")
         parameters = data.pop("parameters", None)
-        
+
         predictions = []
         # iterate over all instances and make predictions
         for inputs in data["instances"]:
             payload = {"inputs": inputs, "parameters": parameters}
             predictions.append(super().__call__(payload))
-        
+
         # reutrn predictions
         return {"predictions": predictions}
 
@@ -75,18 +75,18 @@ def get_inference_handler_either_custom_or_default_handler(
 ):
     """
     Returns the appropriate inference handler based on the given model directory and task.
-    
+
     Args:
         model_dir (Path): The directory path where the model is stored.
         task (Optional[str]): The task for which the inference handler is required. Defaults to None.
-    
+
     Returns:
         InferenceHandler: The appropriate inference handler based on the given model directory and task.
     """
     custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
     if custom_pipeline:
         return custom_pipeline
-    elif os.environ.get("AIP_MODE", None) == "PREDICTION": 
+    elif os.environ.get("AIP_MODE", None) == "PREDICTION":
         return VertexAIHandler(model_dir=model_dir, task=task)
     else:
         return HuggingFaceHandler(model_dir=model_dir, task=task)
diff --git a/src/huggingface_inference_toolkit/vertex_ai_utils.py b/src/huggingface_inference_toolkit/vertex_ai_utils.py
index cf2bedad..19dd41e2 100644
--- a/src/huggingface_inference_toolkit/vertex_ai_utils.py
+++ b/src/huggingface_inference_toolkit/vertex_ai_utils.py
@@ -1,13 +1,12 @@
 import logging
-from pathlib import Path
 import re
+from pathlib import Path
 from typing import Union
 
-
 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
 
-from google.cloud import storage
+
 
 _logger = logging.getLogger(__name__)
 
@@ -20,9 +19,10 @@ def _load_repository_from_gcs(artifact_uri: str, target_dir: Union[str, Path]="/
     """
     Load files from GCS path to target_dir
     """
+    from google.cloud import storage
     _logger.info(f"Loading model artifacts from {artifact_uri} to {target_dir}")
     target_dir = Path(target_dir)
-    
+
     if artifact_uri.startswith(GCS_URI_PREFIX):
         matches = re.match(f"{GCS_URI_PREFIX}(.*?)/(.*)", artifact_uri)
         bucket_name, prefix = matches.groups()
diff --git a/src/huggingface_inference_toolkit/webservice_starlette.py b/src/huggingface_inference_toolkit/webservice_starlette.py
index b749237b..862560dc 100644
--- a/src/huggingface_inference_toolkit/webservice_starlette.py
+++ b/src/huggingface_inference_toolkit/webservice_starlette.py
@@ -52,7 +52,7 @@ async def prepare_model_artifacts():
             )
         # 3. check if in Vertex AI environment and load from GCS
         # If artifactUri not on Model Creation not set returns an empty string
-        elif len(os.environ.get("AIP_STORAGE_URI", '')) > 0: 
+        elif len(os.environ.get("AIP_STORAGE_URI", '')) > 0:
             _load_repository_from_gcs(os.environ["AIP_STORAGE_URI"], target_dir=HF_MODEL_DIR)
         # 4. if not available, raise error
         else:
@@ -112,8 +112,8 @@ async def predict(request):
     _predict_route = os.getenv("AIP_PREDICT_ROUTE", None)
     _health_route = os.getenv("AIP_HEALTH_ROUTE", None)
     if _predict_route is None or _health_route is None:
-        raise ValueError("AIP_PREDICT_ROUTE and AIP_HEALTH_ROUTE need to be set in Vertex AI environment")    
-    
+        raise ValueError("AIP_PREDICT_ROUTE and AIP_HEALTH_ROUTE need to be set in Vertex AI environment")
+
     app = Starlette(
         debug=False,
         routes=[
@@ -121,7 +121,7 @@ async def predict(request):
             Route(_predict_route, predict, methods=["POST"]),
         ],
         on_startup=[prepare_model_artifacts],
-    )    
+    )
 else:
     app = Starlette(
         debug=False,

From f993c05e1d356adc7a3a084a5519810fdcc308a5 Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 27 May 2024 14:00:45 +0000
Subject: [PATCH 06/11] revert diffusers version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 035d29fc..c7d26da9 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,7 @@
 extras = {}
 
 extras["st"] = ["sentence_transformers==2.7.0"]
-extras["diffusers"] = ["diffusers==0.28.0", "accelerate==0.30.1"]
+extras["diffusers"] = ["diffusers==0.26.3", "accelerate==0.30.1"]
 extras["torch"] = ["torch==2.3.0", "torchvision", "torchaudio"]
 extras["test"] = [
     "pytest==7.2.1",

From bde90156f82454a5bcdf2f4cd7d563a47add47b8 Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 27 May 2024 14:13:12 +0000
Subject: [PATCH 07/11] pt 2.2.2

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c7d26da9..81628fa1 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@
 
 extras["st"] = ["sentence_transformers==2.7.0"]
 extras["diffusers"] = ["diffusers==0.26.3", "accelerate==0.30.1"]
-extras["torch"] = ["torch==2.3.0", "torchvision", "torchaudio"]
+extras["torch"] = ["torch==2.2.2", "torchvision", "torchaudio"]
 extras["test"] = [
     "pytest==7.2.1",
     "pytest-xdist",

From b3a65867af627a9f1d5adac3a278fa338a552922 Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 27 May 2024 15:10:05 +0000
Subject: [PATCH 08/11] update model repo to fix tests

---
 .github/workflows/docker-build-action.yaml | 1 +
 setup.py                                   | 4 ++--
 tests/unit/test_diffusers.py               | 6 +++---
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/docker-build-action.yaml b/.github/workflows/docker-build-action.yaml
index 62cba961..fe644056 100644
--- a/.github/workflows/docker-build-action.yaml
+++ b/.github/workflows/docker-build-action.yaml
@@ -63,6 +63,7 @@ jobs:
           push: true
           context: ${{ inputs.context }}
           build-args: ${{ inputs.build_args }}
+          target: base
           file:  ${{ inputs.context }}/${{ inputs.dockerfile }}
           tags: ${{ inputs.repository }}/${{ inputs.image }}:sha-${{ env.GITHUB_SHA_SHORT }},${{ inputs.repository }}/${{ inputs.image }}:latest
      
diff --git a/setup.py b/setup.py
index 81628fa1..fde51c5e 100644
--- a/setup.py
+++ b/setup.py
@@ -31,8 +31,8 @@
 extras = {}
 
 extras["st"] = ["sentence_transformers==2.7.0"]
-extras["diffusers"] = ["diffusers==0.26.3", "accelerate==0.30.1"]
-extras["torch"] = ["torch==2.2.2", "torchvision", "torchaudio"]
+extras["diffusers"] = ["diffusers==0.26.3", "accelerate==0.27.2"]
+extras["torch"] = ["torch==2.2.0", "torchvision", "torchaudio"]
 extras["test"] = [
     "pytest==7.2.1",
     "pytest-xdist",
diff --git a/tests/unit/test_diffusers.py b/tests/unit/test_diffusers.py
index 0f2890a8..4384cd4e 100644
--- a/tests/unit/test_diffusers.py
+++ b/tests/unit/test_diffusers.py
@@ -15,7 +15,7 @@
 def test_get_diffusers_pipeline():
     with tempfile.TemporaryDirectory() as tmpdirname:
         storage_dir = _load_repository_from_hf(
-            "hf-internal-testing/tiny-stable-diffusion-torch",
+            "echarlaix/tiny-random-stable-diffusion-xl",
             tmpdirname,
             framework="pytorch"
         )
@@ -28,7 +28,7 @@ def test_get_diffusers_pipeline():
 def test_pipe_on_gpu():
     with tempfile.TemporaryDirectory() as tmpdirname:
         storage_dir = _load_repository_from_hf(
-            "hf-internal-testing/tiny-stable-diffusion-torch",
+            "echarlaix/tiny-random-stable-diffusion-xl",
             tmpdirname,
             framework="pytorch"
         )
@@ -44,7 +44,7 @@ def test_pipe_on_gpu():
 def test_text_to_image_task():
     with tempfile.TemporaryDirectory() as tmpdirname:
         storage_dir = _load_repository_from_hf(
-            "hf-internal-testing/tiny-stable-diffusion-torch",
+            "echarlaix/tiny-random-stable-diffusion-xl",
             tmpdirname,
             framework="pytorch"
         )

From 0cdeee73d32dc0f9dd7db30b8f40f0c3f04dafea Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 27 May 2024 15:19:40 +0000
Subject: [PATCH 09/11] update torch pin back to 2.2.2

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index fde51c5e..deffb557 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@
 
 extras["st"] = ["sentence_transformers==2.7.0"]
 extras["diffusers"] = ["diffusers==0.26.3", "accelerate==0.27.2"]
-extras["torch"] = ["torch==2.2.0", "torchvision", "torchaudio"]
+extras["torch"] = ["torch==2.2.2", "torchvision", "torchaudio"]
 extras["test"] = [
     "pytest==7.2.1",
     "pytest-xdist",

From 66afdb01a526b659624f6443675eca55c8ac71c7 Mon Sep 17 00:00:00 2001
From: Philipp Schmid <32632186+philschmid@users.noreply.github.com>
Date: Tue, 28 May 2024 10:23:55 +0200
Subject: [PATCH 10/11] Update README.md

Co-authored-by: oOraph <13552058+oOraph@users.noreply.github.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f8196973..c95f3619 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ curl --request POST \
 
 ### Vertex AI Support
 
-The Hugging Face Inference Toolkit is also supported on Vertex AI, based on [Custom container requirements for prediction](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements). [Enviornment variables set by Vertex AI](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#aip-variables) are automatically detected and used by the toolkit. 
+The Hugging Face Inference Toolkit is also supported on Vertex AI, based on [Custom container requirements for prediction](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements). [Environment variables set by Vertex AI](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#aip-variables) are automatically detected and used by the toolkit. 
 
 #### Local run with HF_MODEL_ID and HF_TASK
 

From e339f9a8c7236fdafa747564a65706e2cceafe2a Mon Sep 17 00:00:00 2001
From: Philipp Schmid <32632186+philschmid@users.noreply.github.com>
Date: Tue, 28 May 2024 10:24:01 +0200
Subject: [PATCH 11/11] Update README.md

Co-authored-by: oOraph <13552058+oOraph@users.noreply.github.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c95f3619..f3056a89 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ HF_MODEL_ID=hf-internal-testing/tiny-random-distilbert HF_MODEL_DIR=tmp2 HF_TASK
 ### Container
 
 
-1. build the preferred container for either CPU or GPU for PyTorch o.
+1. build the preferred container for either CPU or GPU for PyTorch.
 
 _cpu images_
 ```bash