In [1]:
from typing import Literal, Optional, TypedDict
import httpx

# URL that `generate` POSTs prompts to (loopback address, port 9000).
INFERENCE_ENDPOINT = "http://127.0.0.1:9000/generate"

class _ChatMLMessageRequired(TypedDict):
    """Keys that every chat message must carry."""

    # Who produced the message.
    role: Literal["assistant", "system", "user"]
    # The message text.
    content: str


class ChatMLMessage(_ChatMLMessageRequired, total=False):
    """A single chat message: required ``role``/``content``, optional ``name``.

    TypedDict fields cannot have default values, so "optional" is expressed
    by declaring ``name`` in a ``total=False`` subclass: the key may be left
    out entirely, or be present with a string or ``None``.
    """

    # Optional label for the speaker or section; may be absent or None.
    name: Optional[str]

# A conversation: a list of chat messages.
ChatML = list[ChatMLMessage]

def message_role_to_prefix(message: ChatMLMessage) -> str:
    match message:
        case {"role": "system", "name": name, **rest}:
            return name
        case {"role": "user", "name": name, **rest}:
            return f"person ({name})" if name else "person"
        case {"role": "assistant", "name": name, **rest}:
            return f"me ({name})" if name else "me"

def to_prompt(
    messages: ChatML,
    bos: str = "<|section|>",
    eos: str = "<|endsection|>",
    suffix: str = "\n<|section|>me (Samantha)\n",
) -> str:
    """Render a conversation as one prompt string.

    Each message becomes a section of the form
    ``<bos><prefix>\\n<content><eos>``, where the prefix comes from
    ``message_role_to_prefix``. Sections are separated by a newline and
    ``suffix`` is appended after the last one.

    Args:
        messages: The conversation to render.
        bos: Marker placed at the start of every section.
        eos: Marker placed at the end of every section.
        suffix: Text appended after all sections.

    Returns:
        The assembled prompt.
    """
    sections = []
    for msg in messages:
        header = message_role_to_prefix(msg)
        body = msg["content"]
        sections.append(f"{bos}{header}\n{body}{eos}")

    return "\n".join(sections) + suffix

async def generate(messages: ChatML, **parameters):
    """Send a conversation to the inference server and return its JSON reply.

    The messages are rendered with ``to_prompt`` and POSTed to
    ``INFERENCE_ENDPOINT`` as a JSON object with two keys: ``inputs`` (the
    prompt) and ``parameters`` (the keyword arguments given here).

    Args:
        messages: The conversation to render into the prompt.
        **parameters: Forwarded unchanged as the request's ``parameters``.

    Returns:
        The decoded JSON body of the response, unmodified.
    """
    payload = {"inputs": to_prompt(messages), "parameters": parameters}

    async with httpx.AsyncClient() as client:
        response = await client.post(INFERENCE_ENDPOINT, json=payload)

    # NOTE(review): the HTTP status is not checked and the payload is handed
    # back as-is, so an "error" entry in the reply (if the server sends one)
    # is not turned into an exception here; callers must inspect the result.
    return response.json()

# Example conversation: one named system section followed by a user turn.
chatml = [
    ChatMLMessage(role="system", name="situation", content="I am talking to Diwank"),
    ChatMLMessage(role="user", name="Diwank", content="Hey Samantha!"),
]


# Top-level `await` is only valid where it is explicitly supported, such as
# a notebook cell. Every keyword after `chatml` is forwarded to the server
# as a generation parameter.
# NOTE(review): temperature is set while do_sample=False; presumably the
# backend ignores temperature when not sampling — confirm against its docs.
r = await generate(
    chatml, 
    max_new_tokens=80, 
    return_full_text=False,
    stop=["<|endsection|>", "\n"],
    repetition_penalty=1.05,
    temperature=1.2,
    do_sample=False,
    # Alternative sampling settings, currently disabled:
    # do_sample=True,
    # best_of=2,
)


ReadError: 

In [None]:
# Show the value bound to `r` by the `generate` call as the cell output.
r