# Create Inference Endpoint on Huggingface for NousHermes 2 Pro - Mistral 7B Finetuned on the DocILE dataset

Note: you might need to update `huggingface_hub` to the latest version with `pip install -U huggingface_hub` to run this notebook.

Then, log in to your Hugging Face account with: `!huggingface-cli login`

In [8]:
import huggingface_hub
from huggingface_hub import create_inference_endpoint

## Create Inference Endpoint on Huggingface

Note that this costs $4 per hour.

In [6]:
# Model repository to deploy and the name of the inference endpoint serving it.
# The commented-out pairs are alternative repository/endpoint combinations.
# repository = "HuggingFaceM4/idefics2-8b"
# endpoint_name = "idefics2-8b-00"
# repository = "llava-hf/llava-v1.6-vicuna-13b-hf"
# endpoint_name = "llava-v1-6-vicuna-13b-hf-9ah"
# repository = "google/paligemma-3b-mix-448"
# endpoint_name = "paligemma-3b-mix-448-kqj"
repository = "leloy/Nous-Hermes-2-Pro-Docile-RASG-1ShotRetrieval-StructuredPrompt"
# Was misspelled `enpoint_name`; the endpoint-creation call reads `endpoint_name`,
# so the misspelling raised NameError on a fresh kernel.
endpoint_name = "nous-hermes-2-pro-docile-ras-hxh"
namespace = "leloy"

In [None]:
# Environment variables handed to the serving container (all values are strings).
tgi_env = {
    "MAX_BATCH_PREFILL_TOKENS": "4096",
    "MAX_INPUT_LENGTH": "3072",
    "MAX_TOTAL_TOKENS": "8192",
    "MODEL_ID": "/repository",
}

# If you receive an error, try upgrading or downgrading the TGI image
tgi_image = {
    "health_route": "/health",
    "env": tgi_env,
    "url": "ghcr.io/huggingface/text-generation-inference:2.0.3",
}

endpoint = create_inference_endpoint(
    endpoint_name,
    repository=repository,
    # namespace=namespace,
    framework="pytorch",
    task="text-generation",
    accelerator="gpu",
    vendor="aws",
    region="us-east-1",
    type="protected",
    instance_size="x4",  # "medium",
    instance_type="nvidia-t4",  # "g5.2xlarge",
    min_replica=0,
    max_replica=1,
    custom_image=tgi_image,
    token=huggingface_hub.get_token(),
)

# Wait on the endpoint before reporting its status.
endpoint.wait()
print(endpoint.status)

## Test

In [31]:
import os
import requests

import json
import pandas as pd
import huggingface_hub
from transformers import AutoTokenizer

In [22]:
# Locate the annotation file (without answers) for the selected dataset.
dataset_name = "mydoc"
dataset_path = os.path.join(
    "data/raw_datasets",
    dataset_name,
    "annot_wo_answer.json",
)
print(dataset_path)

# Fail fast if the file is missing, then load it into a DataFrame.
assert os.path.exists(dataset_path)
df_data = pd.read_json(dataset_path)

data/raw_datasets/mydoc/annot_wo_answer.json


In [None]:
# Pull the fields of interest from the first dataset row.
first_record = df_data.iloc[0]
id = first_record["id"]
image_path = first_record["image"]
question = first_record["conversations"][0]["value"]
id, image_path, question

In [None]:
# Show the image path on its own (it is also part of the previous cell's tuple output).
image_path

In [29]:
# Field to extract; used as the property name in the JSON schema and in the request grammar.
key = "order_description"

In [38]:
dataset_name = "mydoc"
surya_results_mydoc = json.load(
    open(
        f"inference_results/surya/{dataset_name}/surya/{dataset_name}/images/results.json",
        "r",
    )
)

In [56]:
def get_surya_ocr_text(image_path: str, ocr_results=None) -> str:
    """Return the OCR text for an image as newline-joined lines.

    Args:
        image_path: Image path as stored in the dataset. The lookup key is this
            path with its file extension removed.
        ocr_results: Optional mapping from extension-less image path to OCR
            results. Defaults to the module-level ``surya_results_mydoc``.

    Returns:
        The ``text`` of every entry in the first result's ``text_lines``,
        joined with newlines.

    Raises:
        KeyError: If the mapping has no entry for the image.
    """
    if ocr_results is None:
        ocr_results = surya_results_mydoc
    # splitext drops the extension whatever its length; the previous
    # `image_path[:-4]` slice was only correct for three-letter extensions.
    image_key, _ = os.path.splitext(image_path)
    surya_ocr = ocr_results[image_key]
    return "\n".join(line["text"] for line in surya_ocr[0]["text_lines"])

In [None]:
# Load the tokenizer for the deployed model repository; it is used to render
# the chat template into a prompt string.
# NOTE(review): trust_remote_code=True presumably allows code shipped in the
# repository to run locally — confirm the repository is trusted.
tokenizer = AutoTokenizer.from_pretrained(
    repository, trust_remote_code=True
)

In [58]:
# System prompt template; the literal "<json_schema>" placeholder is replaced
# with the serialized schema when the chat messages are built.
# NOTE(review): the schema block is closed with "<schema>" rather than
# "</schema>" — confirm this matches the prompt used for fine-tuning before
# changing it.
SYSTEM_PROMPT_FORMAT = "You are a helpful assistant that answers in JSON. Here's the json schema you must adhere to:\n<schema>\n<json_schema>\n<schema>"

In [59]:
# NOTE(review): this overwrites the `question` read from the dataset row, and
# the value is not referenced by the later cells visible in this notebook —
# confirm whether it is still needed.
question = "What is the name of candidate in the document?"

In [60]:
# Schema substituted into the system prompt: one string-typed field named by `key`.
json_format = {key: {"type": "string"}}

In [None]:
# Chat turns: the schema-bearing system instruction, then the OCR text as the user turn.
system_content = SYSTEM_PROMPT_FORMAT.replace("<json_schema>", json.dumps(json_format))
user_content = get_surya_ocr_text(image_path)
messages = [
    {"role": "system", "content": system_content},
    {"role": "user", "content": user_content},
]
messages

In [62]:
# Render the chat messages to a single prompt string (not token ids), ending
# with the generation prompt.
chat_template_options = {"tokenize": False, "add_generation_prompt": True}
prompt = tokenizer.apply_chat_template(messages, **chat_template_options)
prompt

'<|im_start|>system\nYou are a helpful assistant that answers in JSON. Here\'s the json schema you must adhere to:\n<schema>\n{"order_description": {"type": "string"}}\n<schema><|im_end|>\n<|im_start|>user\nSUMMARY FOR SYSCODE 3271\nTraffic Order #\n12241669\nCreated On\n5/3/2023 12:42:25 PM\nOrder Status\nContract Confirmed \nOrder #\n3921553\nCreated By\nReitler, Matthew\nSyscode Gross $ \n150.00\nPOL-Candidate-FRIENDS OF ELECTRA\nOrder Descrp\nUpdated On\n5/3/2023 1:10:33 PM\nSyscode Net $\n127.50\nJANIS - WHL\nFRIENDS OF ELECTRA JANIS - WHL\nClient\n| Updated By\neClerx BOT\nSyscode Units\nStart Date\n5/7/2023\nSource\nVIEW\nZone Status\nContract Confirmed\nEnd Date\n5/14/2023\n Zones\nWeirton  WV\nAvg Unit Rate\n75.00\n# of Weeks\n2\nUnique\nTr Ln\nParent\nRev\nPriority\nNetwork\nSys\nProgram\nStart\n End\nDaypart\nSa  Su\nTotal\n Unit\nTotal\nUnit\nM\n Tu W Th F\nLine#\n Code\nDate\nDate\nUnits\nAmount\nLn #\n Code\nRate\nLen\n#\n110 |\nAT&T Sports\n3271\nPittsburgh Pirates vs\n0

In [63]:
# Endpoint URL and the HTTP headers sent with each request.
API_URL = "https://asreb1rtjp81xsv2.us-east-1.aws.endpoints.huggingface.cloud"
hf_token = huggingface_hub.get_token()
headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer {hf_token}",
    "Content-Type": "application/json",
}

def query(payload, timeout=120):
    """POST a JSON payload to the inference endpoint and return the parsed reply.

    Args:
        payload: JSON-serializable request body.
        timeout: Seconds to wait for the endpoint before giving up. Without a
            timeout an unresponsive endpoint would block the cell indefinitely.

    Returns:
        The decoded JSON body of a successful response.

    Raises:
        requests.HTTPError: If the endpoint answers with a status code of 400
            or above. The message includes the response body.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    if not response.ok:
        # Surface failures instead of returning the error payload as if it were
        # a generation result; keep the body since it presumably explains why.
        raise requests.HTTPError(
            f"{response.status_code} from inference endpoint: {response.text}",
            response=response,
        )
    return response.json()


# Grammar sent with the request: a JSON schema with one required string
# property named by `key`.
json_grammar = {
    "type": "json",
    "value": {
        "properties": {key: {"type": "string"}},
        "required": [key],
    },
}
generation_parameters = {
    "return_full_text": False,
    "max_new_tokens": 32,
    "grammar": json_grammar,
}
output = query({"inputs": prompt, "parameters": generation_parameters})

In [64]:
# Display the parsed response returned by the endpoint.
output

[{'generated_text': '{"order_description": "FRIENDS OF ELECTRA JANIS - WHL"}'}]