In [1]:
import guidance
from enum import Enum
from guidance import models
from pydantic import BaseModel, conlist, TypeAdapter
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import json
import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the local Llama 3.1 8B Instruct checkpoint; used for both
# the model weights and the tokenizer so the two cannot drift apart.
MODEL_DIR = "../models/meta-llama-3.1-8B-instruct/"

chat_template = guidance.chat.Llama3ChatTemplate

# Alternative backend (llama.cpp with a GGUF file), kept for reference:
# lm = models.LlamaCpp("../models/meta-llama-3.1-8B-instruct-Q8_0.gguf", n_gpu_layers=-1, chat_template=chat_template, n_ctx=8192)

# Ask transformers to load the weights in 8-bit.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# Wrap the already-loaded model and tokenizer in a guidance model object.
lm = models.Transformers(
    model=model_8bit,
    tokenizer=tokenizer,
    chat_template=chat_template,
)

print("model loaded!")

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [01:04<00:00, 16.24s/it]


model loaded!




In [3]:
class Emotion(str, Enum):
    """Emotion labels a reply can be tagged with.

    Members are also ``str`` instances, so ``Emotion.happy == "happy"`` holds
    and ``member.value`` is the plain label string.
    """

    # Plain string values: no trailing commas, which would silently turn each
    # value into a one-element tuple that only works because of the str mixin.
    happy = "happy"
    sad = "sad"
    angry = "angry"
    neutral = "neutral"
# One structured reply, validated by pydantic.
# Documented with `#` comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into the generated JSON schema.
class Response(BaseModel):
    # Text of the reply.
    reply: str
    # Emotion label for the reply; must be one of the Emotion members.
    emotion: Emotion
    # NOTE(review): presumably signals that another reply follows before the
    # user is expected to answer -- confirm against the prompt design.
    continue_talking: bool
    # NOTE(review): presumably the number of replies still to come -- confirm.
    remaining_responses: int
# Schema for a single reply: the Response model itself.
single_schema = Response

# Schema for a whole turn: a list of Response objects with at least one element.
schema = TypeAdapter(conlist(Response, min_length=1))

print("schema loaded!")

schema loaded!


In [4]:
@guidance.guidance(stateless=False)
def json_gen(lm):
    """Append one JSON-shaped reply to ``lm`` and return the extended model.

    The emitted object has three keys:

    * ``emotion`` -- constrained to one of the ``Emotion`` values.
    * ``message`` -- free text generated with ``temperature=1`` until the next
      double quote.
    * ``user_response_needed`` -- ``true`` or ``false``; the choice is captured
      under the name ``user_response_needed`` so callers can read it back.
    """
    # Doubled braces ({{ and }}) are literal braces in the f-string; single
    # braces interpolate guidance grammar calls.
    # NOTE(review): the message stops at the first double quote, so a reply
    # that itself contains a quote would presumably be cut there -- confirm.
    lm += f"""\
    {{
        "emotion": "{guidance.select([e.value for e in Emotion])}",
        "message": "{guidance.gen(stop=['"'], temperature=1)}",
        "user_response_needed": {guidance.select(["true", "false"], name="user_response_needed")}
    }}"""
    return lm

@guidance.guidance(stateless=True)
def f_gen(lm):
    """Append ``<free text><emotion>`` to ``lm`` and return the extended model.

    The free text is generated with ``temperature=1`` and stops at a newline,
    a double quote or ``<``; the tag between the angle brackets is constrained
    to one of the ``Emotion`` values.
    """
    stop_chars = ["\n", "\"", "<"]
    sentence = guidance.gen(stop=stop_chars, temperature=1)
    emotion_tag = guidance.select([member.value for member in Emotion])
    lm += f"{sentence}<{emotion_tag}>"
    return lm

@guidance.guidance(stateless=True)
def regex_gen(lm):
    """Append regex-constrained text made of emotion-tagged sentences to ``lm``.

    The pattern is one or more repetitions of: any text ending in a period,
    followed by ``<emotion>`` where ``emotion`` is one of the ``Emotion``
    values. Generation stops at a double quote or newline and is capped at
    1000 tokens.
    """
    # Alternation of the allowed labels, e.g. "happy|sad|...".
    alternatives = "|".join(member.value for member in Emotion)
    emotion_regex = rf"(.*\.<({alternatives})>)+"
    tagged_text = guidance.gen(
        stop=["\"", "\n"],
        regex=emotion_regex,
        max_tokens=1000,
    )
    lm += tagged_text
    return lm

# System prompt: sets the scene and asks for multi-emotion replies to be split
# into separate responses before the user is asked to answer.
system_prompt = "You are at a frat party. Split sentences with multiple emotions into multiple responses before asking for a user response. Keep responses short"

# Build the conversation on `state`, starting from the loaded model.
state = lm
with guidance.system():
    state += system_prompt

# with guidance.user():
#     state += "tell me 5 facts about paris."

# with guidance.assistant():
#     state += """
# [
#     {
#         "reply": "1. The Eiffel Tower is Paris' most iconic landmark and one of the world's most recognizable symbols.",
#         "emotion": "neutral"
#     },
#     {
#         "reply": "2. The Louvre Museum is home to the Mona Lisa, a famous painting by Leonardo da Vinci, and has a collection of over 550,000 works of art.",
#         "emotion": "neutral"
#     },
#     {
#         "reply": "3. The Seine River runs through the heart of Paris and offers beautiful views of the city's landmarks and bridges.",
#         "emotion": "neutral"
#     },
#     {
#         "reply": "4. The Arc de Triomphe honors the soldiers who fought and died for France, and offers stunning views of the city from its top.",
#         "emotion": "neutral"
#     },
#     {
#         "reply": "5. The Champs-\\u00c9lys\\u00e9es is one of the world's most famous shopping streets, lined with high-end boutiques, cafes, and restaurants.",
#         "emotion": "neutral"
#     }
# ]
# """

# First user turn of the conversation.
first_user_message = "Hey there, how are you?"
with guidance.user():
    state += first_user_message

In [5]:
# with guidance.assistant():
#     json_schema = state + guidance.json(schema=schema, name="response", temperature=0.5)
#     print(json.dumps(json.loads(json_schema["response"]), indent=4))


In [21]:
# Interactive chat loop.
# Each pass produces one assistant turn made of one or more JSON replies (see
# json_gen), then reads a line from the user. Entering "q" ends the chat.
json_res = state
while True:
    with guidance.assistant():
        # Reset the flag so the inner loop runs at least once per assistant turn.
        json_res = json_res.set("user_response_needed", "false")
        print(json_res)
        # Keep appending JSON replies inside the same assistant turn until the
        # model picks "true" for user_response_needed.
        while json_res["user_response_needed"] == "false":
            # Iterate the streamed generation and keep the most recent item;
            # after the loop json_res is the last one yielded.
            for part in json_res.stream() + guidance.with_temperature(json_gen(), 1):
                json_res = part
            # NOTE(review): _current_prompt() is a private guidance method --
            # confirm it is still available when upgrading the library.
            print(json_res._current_prompt())
    with guidance.user():
        inp = input()
        if inp == "q":
            break
        else:
            json_res += inp


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are at a frat party. Split sentences with multiple emotions into multiple responses before asking for a user response. Keep responses short<|eot_id|><|start_header_id|>user<|end_header_id>

Hey there, how are you?<|eot_id|>
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are at a frat party. Split sentences with multiple emotions into multiple responses before asking for a user response. Keep responses short<|eot_id|><|start_header_id|>user<|end_header_id>

Hey there, how are you?<|eot_id|><|start_header_id|>assistant<|end_header_id>

{
    "emotion": "sad",
    "message": "I'm not really doing great, been stressed about final exams lately.",
    "user_response_needed": true
}
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are at a frat party. Split sentences with multiple emotions into multiple responses before asking for a user response. Keep responses short<|eot_id|><|start_header_id|

In [15]:
# with guidance.assistant():
#     state + guidance.one_or_more(f_gen())