10 changes: 10 additions & 0 deletions README.md
@@ -40,6 +40,7 @@ docker run -ti -p 5000:5000 -e HF_MODEL_ID=distilbert-base-uncased-distilled-squ
docker run -ti -p 5000:5000 --gpus all -e HF_MODEL_ID=nlpconnect/vit-gpt2-image-captioning -e HF_TASK=image-to-text integration-test-pytorch:gpu
docker run -ti -p 5000:5000 --gpus all -e HF_MODEL_ID=echarlaix/tiny-random-stable-diffusion-xl -e HF_TASK=text-to-image integration-test-pytorch:gpu
docker run -ti -p 5000:5000 --gpus all -e HF_MODEL_ID=stabilityai/stable-diffusion-xl-base-1.0 -e HF_TASK=text-to-image integration-test-pytorch:gpu
docker run -ti -p 8080:5000 --gpus all -e HF_MODEL_ID=cross-encoder/ms-marco-MiniLM-L-6-v2 -e HF_TASK=sentence-ranking integration-test-pytorch:gpu
docker run -ti -p 5000:5000 -e HF_MODEL_DIR=/repository -v $(pwd)/distilbert-base-uncased-emotion:/repository integration-test-pytorch:cpu
```

@@ -66,6 +67,15 @@ The Hugging Face Inference Toolkit is also supported on Vertex AI, based on [Cus

Start Hugging Face Inference Toolkit with the following environment variables.

Inference Endpoints

```bash
mkdir tmp2/
HF_MODEL_DIR=tmp2 HF_MODEL_ID=cross-encoder/ms-marco-MiniLM-L-6-v2 HF_TASK=sentence-ranking uvicorn src.huggingface_inference_toolkit.webservice_starlette:app --port 8080
```
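
Once that server is up, the new ranking task can be exercised with a request like the one below. This is a hedged sketch: port 8080 comes from the command above, the `/predict` route from `webservice_starlette.py` further down, and the `{"query": ..., "texts": [...]}` schema mirrors the `RankingPipeline` changes in this PR; the sentences are purely illustrative.

```bash
# Illustrative request against the sentence-ranking server started above (port 8080).
# The {"query": ..., "texts": [...]} payload goes through the new CrossEncoder.rank() path
# and should return a list of {"index": ..., "score": ...} objects (TEI-compatible).
curl -s http://localhost:8080/predict \
  -H "Content-Type: application/json" \
  -d '{"query": "Lets create an embedding", "texts": ["Lets create an embedding", "I like noodles"]}'
```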

Vertex AI

```bash
mkdir tmp2/
AIP_MODE=PREDICTION AIP_PORT=8080 AIP_PREDICT_ROUTE=/pred AIP_HEALTH_ROUTE=/h HF_MODEL_DIR=tmp2 HF_MODEL_ID=distilbert/distilbert-base-uncased-finetuned-sst-2-english HF_TASK=text-classification uvicorn src.huggingface_inference_toolkit.webservice_starlette:app --port 8080
```
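
With the Vertex AI-style server running, a prediction can be sent to the configured route. Again a hedged sketch: `/pred` and port 8080 come from the environment variables in the command above, and `{"instances": [...]}` is assumed to be the accepted Vertex AI prediction payload; the example text is illustrative.

```bash
# Illustrative request against the Vertex AI-style server started above.
# AIP_PREDICT_ROUTE=/pred and --port 8080 are set in the command; the
# {"instances": [...]} body is the standard Vertex AI prediction schema.
curl -s http://localhost:8080/pred \
  -H "Content-Type: application/json" \
  -d '{"instances": ["I love this movie"]}'
```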
31 changes: 21 additions & 10 deletions src/huggingface_inference_toolkit/sentence_transformers_utils.py
@@ -16,7 +16,9 @@ def __init__(self, model_dir: str, device: str = None): # needs "cuda" for GPU
self.model = SentenceTransformer(model_dir, device=device)

def __call__(self, inputs=None):
embeddings1 = self.model.encode(inputs["source_sentence"], convert_to_tensor=True)
embeddings1 = self.model.encode(
inputs["source_sentence"], convert_to_tensor=True
)
embeddings2 = self.model.encode(inputs["sentences"], convert_to_tensor=True)
similarities = util.pytorch_cos_sim(embeddings1, embeddings2).tolist()[0]
return {"similarities": similarities}
@@ -35,9 +37,23 @@ class RankingPipeline:
def __init__(self, model_dir: str, device: str = None): # needs "cuda" for GPU
self.model = CrossEncoder(model_dir, device=device)

def __call__(self, inputs):
scores = self.model.predict(inputs).tolist()
return {"scores": scores}
def __call__(self, inputs, return_documents=None):
if isinstance(inputs, list):
scores = self.model.predict(inputs).tolist()
return {"scores": scores}
else:
_scores = self.model.rank(
inputs["query"],
inputs["texts"],
return_documents=return_documents,
)
# rename "corpus_id" key to "index" for all scores to match TEI
scores = []
for score in _scores:
score["index"] = score.pop("corpus_id")
scores.append(score)

return scores


SENTENCE_TRANSFORMERS_TASKS = {
@@ -47,12 +63,7 @@ def __call__(self, inputs):
}


def get_sentence_transformers_pipeline(
task=None,
model_dir=None,
device=-1,
**kwargs
):
def get_sentence_transformers_pipeline(task=None, model_dir=None, device=-1, **kwargs):
device = "cuda" if device == 0 else "cpu"
pipeline = SENTENCE_TRANSFORMERS_TASKS[task](model_dir=model_dir, device=device)
return pipeline
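
In short, `RankingPipeline.__call__` now branches on the input type: a plain list of sentence pairs keeps the previous `{"scores": [...]}` response, while a `{"query": ..., "texts": [...]}` dict goes through `CrossEncoder.rank()` and is returned as TEI-style `{"index": ..., "score": ...}` entries. A hedged sketch of the unchanged list form, assuming a sentence-ranking server started as in the README changes above (sentences illustrative):

```bash
# Legacy input shape: a list of [query, passage] pairs -> {"scores": [...]}
# (the dict form shown in the README section above returns the TEI-style list instead).
curl -s http://localhost:8080/predict \
  -H "Content-Type: application/json" \
  -d '{"inputs": [["Lets create an embedding", "Lets create an embedding"], ["Lets create an embedding", "I like noodles"]]}'
```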
62 changes: 47 additions & 15 deletions src/huggingface_inference_toolkit/webservice_starlette.py
@@ -17,15 +17,22 @@
HF_REVISION,
HF_TASK,
)
from huggingface_inference_toolkit.handler import get_inference_handler_either_custom_or_default_handler
from huggingface_inference_toolkit.handler import (
get_inference_handler_either_custom_or_default_handler,
)
from huggingface_inference_toolkit.serialization.base import ContentType
from huggingface_inference_toolkit.serialization.json_utils import Jsoner
from huggingface_inference_toolkit.utils import _load_repository_from_hf, convert_params_to_int_or_bool
from huggingface_inference_toolkit.utils import (
_load_repository_from_hf,
convert_params_to_int_or_bool,
)
from huggingface_inference_toolkit.vertex_ai_utils import _load_repository_from_gcs


def config_logging(level=logging.INFO):
logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", datefmt="", level=level)
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(message)s", datefmt="", level=level
)
# disable uvicorn access logs to hide /health
uvicorn_access = logging.getLogger("uvicorn.access")
uvicorn_access.disabled = True
@@ -52,8 +59,10 @@ async def prepare_model_artifacts():
)
# 3. check if in Vertex AI environment and load from GCS
# If artifactUri not on Model Creation not set returns an empty string
elif len(os.environ.get("AIP_STORAGE_URI", '')) > 0:
_load_repository_from_gcs(os.environ["AIP_STORAGE_URI"], target_dir=HF_MODEL_DIR)
elif len(os.environ.get("AIP_STORAGE_URI", "")) > 0:
_load_repository_from_gcs(
os.environ["AIP_STORAGE_URI"], target_dir=HF_MODEL_DIR
)
# 4. if not available, raise error
else:
raise ValueError(
@@ -65,7 +74,9 @@ async def prepare_model_artifacts():

logger.info(f"Initializing model from directory:{HF_MODEL_DIR}")
# 2. determine correct inference handler
inference_handler = get_inference_handler_either_custom_or_default_handler(HF_MODEL_DIR, task=HF_TASK)
inference_handler = get_inference_handler_either_custom_or_default_handler(
HF_MODEL_DIR, task=HF_TASK
)
logger.info("Model initialized successfully")


@@ -78,32 +89,51 @@ async def predict(request):
# extracts content from request
content_type = request.headers.get("content-Type", None)
# try to deserialize payload
deserialized_body = ContentType.get_deserializer(content_type).deserialize(await request.body())
deserialized_body = ContentType.get_deserializer(content_type).deserialize(
await request.body()
)
# checks if input schema is correct
if "inputs" not in deserialized_body and "instances" not in deserialized_body:
raise ValueError(f"Body needs to provide a inputs key, recieved: {orjson.dumps(deserialized_body)}")
if (
"inputs" not in deserialized_body
and "instances" not in deserialized_body
and "query" not in deserialized_body
):
raise ValueError(
f"Body needs to provide a inputs key, recieved: {orjson.dumps(deserialized_body)}"
)

# check for query parameter and add them to the body
if request.query_params and "parameters" not in deserialized_body:
deserialized_body["parameters"] = convert_params_to_int_or_bool(dict(request.query_params))
deserialized_body["parameters"] = convert_params_to_int_or_bool(
dict(request.query_params)
)

# tracks request time
start_time = perf_counter()
# run async not blocking call
pred = await async_handler_call(inference_handler, deserialized_body)
# log request time
logger.info(f"POST {request.url.path} | Duration: {(perf_counter()-start_time) *1000:.2f} ms")
logger.info(
f"POST {request.url.path} | Duration: {(perf_counter()-start_time) *1000:.2f} ms"
)

# response extracts content from request
accept = request.headers.get("accept", None)
if accept is None or accept == "*/*":
accept = "application/json"
# deserialized and resonds with json
serialized_response_body = ContentType.get_serializer(accept).serialize(pred, accept)
serialized_response_body = ContentType.get_serializer(accept).serialize(
pred, accept
)
return Response(serialized_response_body, media_type=accept)
except Exception as e:
logger.error(e)
return Response(Jsoner.serialize({"error": str(e)}), status_code=400, media_type="application/json")
return Response(
Jsoner.serialize({"error": str(e)}),
status_code=400,
media_type="application/json",
)


# Create app based on which cloud environment is used
if os.getenv("AIP_MODE", None) == "PREDICTION":
@@ -112,7 +142,9 @@ async def predict(request):
_predict_route = os.getenv("AIP_PREDICT_ROUTE", None)
_health_route = os.getenv("AIP_HEALTH_ROUTE", None)
if _predict_route is None or _health_route is None:
raise ValueError("AIP_PREDICT_ROUTE and AIP_HEALTH_ROUTE need to be set in Vertex AI environment")
raise ValueError(
"AIP_PREDICT_ROUTE and AIP_HEALTH_ROUTE need to be set in Vertex AI environment"
)

app = Starlette(
debug=False,
@@ -132,4 +164,4 @@ async def predict(request):
Route("/predict", predict, methods=["POST"]),
],
on_startup=[prepare_model_artifacts],
)
)
48 changes: 42 additions & 6 deletions tests/unit/test_sentence_transformers.py
@@ -5,7 +5,9 @@
from transformers.file_utils import is_torch_available
from transformers.testing_utils import require_tf, require_torch, slow

from huggingface_inference_toolkit.handler import get_inference_handler_either_custom_or_default_handler
from huggingface_inference_toolkit.handler import (
get_inference_handler_either_custom_or_default_handler,
)
from huggingface_inference_toolkit.sentence_transformers_utils import (
SentenceEmbeddingPipeline,
get_sentence_transformers_pipeline,
@@ -32,7 +34,9 @@ def test_sentence_embedding_task():
storage_dir = _load_repository_from_hf(
"sentence-transformers/all-MiniLM-L6-v2", tmpdirname, framework="pytorch"
)
pipe = get_sentence_transformers_pipeline("sentence-embeddings", storage_dir.as_posix())
pipe = get_sentence_transformers_pipeline(
"sentence-embeddings", storage_dir.as_posix()
)
res = pipe("Lets create an embedding")
assert isinstance(res["embeddings"], list)

@@ -43,16 +47,27 @@ def test_sentence_similarity():
storage_dir = _load_repository_from_hf(
"sentence-transformers/all-MiniLM-L6-v2", tmpdirname, framework="pytorch"
)
pipe = get_sentence_transformers_pipeline("sentence-similarity", storage_dir.as_posix())
res = pipe({"source_sentence": "Lets create an embedding", "sentences": ["Lets create an embedding"]})
pipe = get_sentence_transformers_pipeline(
"sentence-similarity", storage_dir.as_posix()
)
res = pipe(
{
"source_sentence": "Lets create an embedding",
"sentences": ["Lets create an embedding"],
}
)
assert isinstance(res["similarities"], list)


@require_torch
def test_sentence_ranking():
with tempfile.TemporaryDirectory() as tmpdirname:
storage_dir = _load_repository_from_hf("cross-encoder/ms-marco-MiniLM-L-6-v2", tmpdirname, framework="pytorch")
pipe = get_sentence_transformers_pipeline("sentence-ranking", storage_dir.as_posix())
storage_dir = _load_repository_from_hf(
"cross-encoder/ms-marco-MiniLM-L-6-v2", tmpdirname, framework="pytorch"
)
pipe = get_sentence_transformers_pipeline(
"sentence-ranking", storage_dir.as_posix()
)
res = pipe(
[
["Lets create an embedding", "Lets create an embedding"],
@@ -64,3 +79,24 @@ def test_sentence_ranking():
["Lets create an embedding", "Lets create an embedding"],
)
assert isinstance(res["scores"], float)


@require_torch
def test_sentence_ranking_tei():
with tempfile.TemporaryDirectory() as tmpdirname:
storage_dir = _load_repository_from_hf(
"cross-encoder/ms-marco-MiniLM-L-6-v2", tmpdirname, framework="pytorch"
)
pipe = get_sentence_transformers_pipeline(
"sentence-ranking", storage_dir.as_posix()
)
res = pipe(
{
"query": "Lets create an embedding",
"texts": ["Lets create an embedding", "I like noodles"],
}
)
assert isinstance(res, list)
for r in res:
assert "index" in r
assert "score" in r