diff --git a/README.md b/README.md
index f3056a89..d143051f 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,7 @@ docker run -ti -p 5000:5000 -e HF_MODEL_ID=distilbert-base-uncased-distilled-squ
 docker run -ti -p 5000:5000 --gpus all -e HF_MODEL_ID=nlpconnect/vit-gpt2-image-captioning -e HF_TASK=image-to-text integration-test-pytorch:gpu
 docker run -ti -p 5000:5000 --gpus all -e HF_MODEL_ID=echarlaix/tiny-random-stable-diffusion-xl -e HF_TASK=text-to-image integration-test-pytorch:gpu
 docker run -ti -p 5000:5000 --gpus all -e HF_MODEL_ID=stabilityai/stable-diffusion-xl-base-1.0 -e HF_TASK=text-to-image integration-test-pytorch:gpu
+docker run -ti -p 8080:5000 --gpus all -e HF_MODEL_ID=cross-encoder/ms-marco-MiniLM-L-6-v2 -e HF_TASK=sentence-ranking integration-test-pytorch:gpu
 docker run -ti -p 5000:5000 -e HF_MODEL_DIR=/repository -v $(pwd)/distilbert-base-uncased-emotion:/repository integration-test-pytorch:cpu
 ```
 
@@ -66,6 +67,15 @@ The Hugging Face Inference Toolkit is also supported on Vertex AI, based on [Cus
 
 Start Hugging Face Inference Toolkit with the following environment variables.
 
+Inference Endpoints
+
+```bash
+mkdir tmp2/
+HF_MODEL_DIR=tmp2 HF_MODEL_ID=cross-encoder/ms-marco-MiniLM-L-6-v2 HF_TASK=sentence-ranking uvicorn src.huggingface_inference_toolkit.webservice_starlette:app --port 8080
+```
+
+Vertex AI
+
 ```bash
 mkdir tmp2/
 AIP_MODE=PREDICTION AIP_PORT=8080 AIP_PREDICT_ROUTE=/pred AIP_HEALTH_ROUTE=/h HF_MODEL_DIR=tmp2 HF_MODEL_ID=distilbert/distilbert-base-uncased-finetuned-sst-2-english HF_TASK=text-classification uvicorn src.huggingface_inference_toolkit.webservice_starlette:app --port 8080
diff --git a/src/huggingface_inference_toolkit/sentence_transformers_utils.py b/src/huggingface_inference_toolkit/sentence_transformers_utils.py
index 72bb2ee2..d4f56064 100644
--- a/src/huggingface_inference_toolkit/sentence_transformers_utils.py
+++ b/src/huggingface_inference_toolkit/sentence_transformers_utils.py
@@ -16,7 +16,9 @@ def __init__(self, model_dir: str, device: str = None):  # needs "cuda" for GPU
         self.model = SentenceTransformer(model_dir, device=device)
 
     def __call__(self, inputs=None):
-        embeddings1 = self.model.encode(inputs["source_sentence"], convert_to_tensor=True)
+        embeddings1 = self.model.encode(
+            inputs["source_sentence"], convert_to_tensor=True
+        )
         embeddings2 = self.model.encode(inputs["sentences"], convert_to_tensor=True)
         similarities = util.pytorch_cos_sim(embeddings1, embeddings2).tolist()[0]
         return {"similarities": similarities}
@@ -35,9 +37,23 @@ class RankingPipeline:
     def __init__(self, model_dir: str, device: str = None):  # needs "cuda" for GPU
         self.model = CrossEncoder(model_dir, device=device)
 
-    def __call__(self, inputs):
-        scores = self.model.predict(inputs).tolist()
-        return {"scores": scores}
+    def __call__(self, inputs, return_documents=None):
+        if isinstance(inputs, list):
+            scores = self.model.predict(inputs).tolist()
+            return {"scores": scores}
+        else:
+            _scores = self.model.rank(
+                inputs["query"],
+                inputs["texts"],
+                return_documents=return_documents,
+            )
+            # rename "corpus_id" key to "index" for all scores to match TEI
+            scores = []
+            for score in _scores:
+                score["index"] = score.pop("corpus_id")
+                scores.append(score)
+
+            return scores
 
 
 SENTENCE_TRANSFORMERS_TASKS = {
@@ -47,12 +63,7 @@ def __call__(self, inputs):
 }
 
 
-def get_sentence_transformers_pipeline(
-    task=None,
-    model_dir=None,
-    device=-1,
-    **kwargs
-):
+def get_sentence_transformers_pipeline(task=None, model_dir=None, device=-1, **kwargs):
     device = "cuda" if device == 0 else "cpu"
     pipeline = SENTENCE_TRANSFORMERS_TASKS[task](model_dir=model_dir, device=device)
     return pipeline
diff --git a/src/huggingface_inference_toolkit/webservice_starlette.py b/src/huggingface_inference_toolkit/webservice_starlette.py
index 862560dc..501ce376 100644
--- a/src/huggingface_inference_toolkit/webservice_starlette.py
+++ b/src/huggingface_inference_toolkit/webservice_starlette.py
@@ -17,15 +17,22 @@
     HF_REVISION,
     HF_TASK,
 )
-from huggingface_inference_toolkit.handler import get_inference_handler_either_custom_or_default_handler
+from huggingface_inference_toolkit.handler import (
+    get_inference_handler_either_custom_or_default_handler,
+)
 from huggingface_inference_toolkit.serialization.base import ContentType
 from huggingface_inference_toolkit.serialization.json_utils import Jsoner
-from huggingface_inference_toolkit.utils import _load_repository_from_hf, convert_params_to_int_or_bool
+from huggingface_inference_toolkit.utils import (
+    _load_repository_from_hf,
+    convert_params_to_int_or_bool,
+)
 from huggingface_inference_toolkit.vertex_ai_utils import _load_repository_from_gcs
 
 
 def config_logging(level=logging.INFO):
-    logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", datefmt="", level=level)
+    logging.basicConfig(
+        format="%(asctime)s | %(levelname)s | %(message)s", datefmt="", level=level
+    )
     # disable uvicorn access logs to hide /health
     uvicorn_access = logging.getLogger("uvicorn.access")
     uvicorn_access.disabled = True
@@ -52,8 +59,10 @@ async def prepare_model_artifacts():
         )
     # 3. check if in Vertex AI environment and load from GCS
     # If artifactUri not on Model Creation not set returns an empty string
-    elif len(os.environ.get("AIP_STORAGE_URI", '')) > 0:
-        _load_repository_from_gcs(os.environ["AIP_STORAGE_URI"], target_dir=HF_MODEL_DIR)
+    elif len(os.environ.get("AIP_STORAGE_URI", "")) > 0:
+        _load_repository_from_gcs(
+            os.environ["AIP_STORAGE_URI"], target_dir=HF_MODEL_DIR
+        )
     # 4. if not available, raise error
     else:
         raise ValueError(
@@ -65,7 +74,9 @@
 
     logger.info(f"Initializing model from directory:{HF_MODEL_DIR}")
     # 2. determine correct inference handler
-    inference_handler = get_inference_handler_either_custom_or_default_handler(HF_MODEL_DIR, task=HF_TASK)
+    inference_handler = get_inference_handler_either_custom_or_default_handler(
+        HF_MODEL_DIR, task=HF_TASK
+    )
     logger.info("Model initialized successfully")
 
 
@@ -78,32 +89,51 @@ async def predict(request):
         # extracts content from request
         content_type = request.headers.get("content-Type", None)
         # try to deserialize payload
-        deserialized_body = ContentType.get_deserializer(content_type).deserialize(await request.body())
+        deserialized_body = ContentType.get_deserializer(content_type).deserialize(
+            await request.body()
+        )
         # checks if input schema is correct
-        if "inputs" not in deserialized_body and "instances" not in deserialized_body:
-            raise ValueError(f"Body needs to provide a inputs key, recieved: {orjson.dumps(deserialized_body)}")
+        if (
+            "inputs" not in deserialized_body
+            and "instances" not in deserialized_body
+            and "query" not in deserialized_body
+        ):
+            raise ValueError(
+                f"Body needs to provide an inputs key, received: {orjson.dumps(deserialized_body)}"
+            )
 
         # check for query parameter and add them to the body
         if request.query_params and "parameters" not in deserialized_body:
-            deserialized_body["parameters"] = convert_params_to_int_or_bool(dict(request.query_params))
+            deserialized_body["parameters"] = convert_params_to_int_or_bool(
+                dict(request.query_params)
+            )
 
         # tracks request time
         start_time = perf_counter()
         # run async not blocking call
         pred = await async_handler_call(inference_handler, deserialized_body)
         # log request time
-        logger.info(f"POST {request.url.path} | Duration: {(perf_counter()-start_time) *1000:.2f} ms")
+        logger.info(
+            f"POST {request.url.path} | Duration: {(perf_counter()-start_time) *1000:.2f} ms"
+        )
 
         # response extracts content from request
         accept = request.headers.get("accept", None)
         if accept is None or accept == "*/*":
             accept = "application/json"
         # deserialized and resonds with json
-        serialized_response_body = ContentType.get_serializer(accept).serialize(pred, accept)
+        serialized_response_body = ContentType.get_serializer(accept).serialize(
+            pred, accept
+        )
         return Response(serialized_response_body, media_type=accept)
     except Exception as e:
         logger.error(e)
-        return Response(Jsoner.serialize({"error": str(e)}), status_code=400, media_type="application/json")
+        return Response(
+            Jsoner.serialize({"error": str(e)}),
+            status_code=400,
+            media_type="application/json",
+        )
+
 
 # Create app based on which cloud environment is used
 if os.getenv("AIP_MODE", None) == "PREDICTION":
@@ -112,7 +142,9 @@ async def predict(request):
     _predict_route = os.getenv("AIP_PREDICT_ROUTE", None)
     _health_route = os.getenv("AIP_HEALTH_ROUTE", None)
     if _predict_route is None or _health_route is None:
-        raise ValueError("AIP_PREDICT_ROUTE and AIP_HEALTH_ROUTE need to be set in Vertex AI environment")
+        raise ValueError(
+            "AIP_PREDICT_ROUTE and AIP_HEALTH_ROUTE need to be set in Vertex AI environment"
+        )
 
     app = Starlette(
         debug=False,
@@ -132,4 +164,4 @@ async def predict(request):
             Route("/predict", predict, methods=["POST"]),
         ],
         on_startup=[prepare_model_artifacts],
-)
+    )
diff --git a/tests/unit/test_sentence_transformers.py b/tests/unit/test_sentence_transformers.py
index 233da490..366ddf53 100644
--- a/tests/unit/test_sentence_transformers.py
+++ b/tests/unit/test_sentence_transformers.py
@@ -5,7 +5,9 @@
 from transformers.file_utils import is_torch_available
 from transformers.testing_utils import require_tf, require_torch, slow
 
-from huggingface_inference_toolkit.handler import get_inference_handler_either_custom_or_default_handler
+from huggingface_inference_toolkit.handler import (
+    get_inference_handler_either_custom_or_default_handler,
+)
 from huggingface_inference_toolkit.sentence_transformers_utils import (
     SentenceEmbeddingPipeline,
     get_sentence_transformers_pipeline,
@@ -32,7 +34,9 @@ def test_sentence_embedding_task():
         storage_dir = _load_repository_from_hf(
             "sentence-transformers/all-MiniLM-L6-v2", tmpdirname, framework="pytorch"
         )
-        pipe = get_sentence_transformers_pipeline("sentence-embeddings", storage_dir.as_posix())
+        pipe = get_sentence_transformers_pipeline(
+            "sentence-embeddings", storage_dir.as_posix()
+        )
         res = pipe("Lets create an embedding")
         assert isinstance(res["embeddings"], list)
@@ -43,16 +47,27 @@ def test_sentence_similarity():
         storage_dir = _load_repository_from_hf(
             "sentence-transformers/all-MiniLM-L6-v2", tmpdirname, framework="pytorch"
         )
-        pipe = get_sentence_transformers_pipeline("sentence-similarity", storage_dir.as_posix())
-        res = pipe({"source_sentence": "Lets create an embedding", "sentences": ["Lets create an embedding"]})
+        pipe = get_sentence_transformers_pipeline(
+            "sentence-similarity", storage_dir.as_posix()
+        )
+        res = pipe(
+            {
+                "source_sentence": "Lets create an embedding",
+                "sentences": ["Lets create an embedding"],
+            }
+        )
         assert isinstance(res["similarities"], list)
 
 
 @require_torch
 def test_sentence_ranking():
     with tempfile.TemporaryDirectory() as tmpdirname:
-        storage_dir = _load_repository_from_hf("cross-encoder/ms-marco-MiniLM-L-6-v2", tmpdirname, framework="pytorch")
-        pipe = get_sentence_transformers_pipeline("sentence-ranking", storage_dir.as_posix())
+        storage_dir = _load_repository_from_hf(
+            "cross-encoder/ms-marco-MiniLM-L-6-v2", tmpdirname, framework="pytorch"
+        )
+        pipe = get_sentence_transformers_pipeline(
+            "sentence-ranking", storage_dir.as_posix()
+        )
         res = pipe(
             [
                 ["Lets create an embedding", "Lets create an embedding"],
@@ -64,3 +79,24 @@ def test_sentence_ranking():
             ["Lets create an embedding", "Lets create an embedding"],
         )
         assert isinstance(res["scores"], float)
+
+
+@require_torch
+def test_sentence_ranking_tei():
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        storage_dir = _load_repository_from_hf(
+            "cross-encoder/ms-marco-MiniLM-L-6-v2", tmpdirname, framework="pytorch"
+        )
+        pipe = get_sentence_transformers_pipeline(
+            "sentence-ranking", storage_dir.as_posix()
+        )
+        res = pipe(
+            {
+                "query": "Lets create an embedding",
+                "texts": ["Lets create an embedding", "I like noodles"],
+            }
+        )
+        assert isinstance(res, list)
+        for r in res:
+            assert "index" in r
+            assert "score" in r
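As a quick manual check of the new TEI-style ranking path, a request like the sketch below can be sent once the service is running locally, e.g. via the `Inference Endpoints` command added to the README above. The `/predict` route, the JSON content type, and the `{"query", "texts"}` payload are taken from this diff; it is an assumption (not shown in this patch) that the default handler forwards the full body to `RankingPipeline` when no `inputs` key is present, and the scores in the comment are purely illustrative.

```bash
# Sketch of a TEI-style ranking request against the locally started
# sentence-ranking service (uvicorn on port 8080, or the Docker example
# above, which maps host port 8080 to the container).
curl -s -X POST http://localhost:8080/predict \
  -H "Content-Type: application/json" \
  -d '{
        "query": "Lets create an embedding",
        "texts": ["Lets create an embedding", "I like noodles"]
      }'
# Per test_sentence_ranking_tei, the response should be a JSON list of
# objects with "index" and "score" keys (values here are illustrative):
# [{"index": 0, "score": 0.99}, {"index": 1, "score": 0.01}]
```

Passing a list of `[query, text]` pairs instead keeps the previous `{"scores": [...]}` behaviour, as covered by the existing `test_sentence_ranking`.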