In [1]:
import os
import textgrad as tg
from textgrad.tasks import load_task
from rich import inspect
from dotenv import load_dotenv

In [2]:
# Load environment variables (e.g. the OpenAI API key) via python-dotenv;
# the call's return value is what this cell displays.
load_dotenv()

# Debug aid, intentionally left disabled: enabling it would write the secret
# API key into the notebook's saved output.
# print(f"OPENAI_API_KEY={os.getenv('OPENAI_API_KEY')}")

True

In [3]:
# Engine handed to load_task and to the BlackboxLLM whose system prompt is
# being optimized.
llm_engine = tg.get_engine("gpt-3.5-turbo")
# Separate engine registered for the backward pass — presumably the model that
# writes the textual gradients during loss.backward(); confirm in textgrad docs.
tg.set_backward_engine("gpt-4o")

In [4]:
# Load the BBH object-counting task. Only the validation split and the
# evaluation function are used in this notebook. The two unused return values
# (presumably the train and test splits — confirm against load_task) get
# underscore-prefixed names rather than a bare `_`: at the top level of a
# notebook, assigning to `_` overrides IPython's "last output" variable for
# the rest of the session.
_train_set, val_set, _test_set, eval_fn = load_task("BBH_object_counting", llm_engine)

# Use the first validation example as the single (question, answer) pair.
question_str, answer_str = val_set[0]

In [5]:
# Pretty-print the evaluation callable returned by load_task using rich.
inspect(eval_fn)

In [6]:
# Show the raw question and its reference answer, one per line.
print(
    f"question_str: {question_str}",
    f"answer_str: {answer_str}",
    sep="\n",
)

question_str: I have two stalks of celery, two garlics, a potato, three heads of broccoli, a carrot, and a yam. How many vegetables do I have?
answer_str: 10


In [7]:
# Fixed inputs: neither the question nor the reference answer is optimized,
# so both are created with requires_grad=False.
question = tg.Variable(
    question_str,
    requires_grad=False,
    role_description="question to the LLM",
)

# The answer is converted to str before being wrapped.
answer = tg.Variable(
    str(answer_str),
    requires_grad=False,
    role_description="answer to the question",
)

# The only Variable created with requires_grad=True: the system prompt that
# the optimizer is expected to rewrite.
system_prompt = tg.Variable(
    "You are a concise LLM. Think step by step.",
    role_description="system prompt to guide the LLM's reasoning strategy for accurate responses",
    requires_grad=True,
)

In [8]:
# Wrap the forward engine together with the trainable system prompt; calling
# model(question) later produces the prediction.
model = tg.BlackboxLLM(llm_engine, system_prompt=system_prompt)

In [9]:
# textgrad's TGD optimizer over the model's parameters; optimizer.step() is
# called after the backward pass to apply the update.
optimizer = tg.TGD(parameters=model.parameters())

In [10]:
# Sanity check of what the optimizer will update — expected to contain the
# system_prompt Variable (TODO confirm from the inspect output).
inspect(model.parameters())

In [11]:
# Forward pass with the initial, un-optimized system prompt.
prediction = model(question)

In [12]:
# Baseline answer before any optimization. In the recorded run the model
# reports 7 vegetables while the reference answer is 10.
print(f"prediction: {prediction}")

prediction: You have a total of 7 vegetables: 2 stalks of celery, 2 garlics, 1 potato, 3 heads of broccoli, 1 carrot, and 1 yam.


In [13]:
# Evaluate the prediction against the reference answer; the result is the
# object that backward() is later called on.
loss = eval_fn(
    inputs={
        "prediction": prediction,
        "ground_truth_answer": answer,
    }
)

In [14]:
# Inspect the loss returned by eval_fn. For this task it denotes accuracy —
# presumably whether the prediction matched the reference answer (confirm the
# exact encoding against eval_fn).
inspect(loss)

In [15]:
# Backward pass: produces the textual gradient for system_prompt that the
# next cell reads via get_gradient_text().
loss.backward()

In [16]:
# Show the natural-language feedback attached to the system prompt by the
# backward pass.
print(f"system prompt gradient:\n{system_prompt.get_gradient_text()}")

system prompt gradient:
1. **Emphasize Arithmetic Accuracy**:
   - The current prompt does not explicitly instruct the model to verify arithmetic calculations. Adding a directive to ensure arithmetic accuracy can help.
   - **Fix**: Include a specific instruction to double-check arithmetic calculations. For example, "Ensure all arithmetic calculations are accurate."

2. **Detail-Oriented Breakdown**:
   - The prompt should encourage the model to carefully break down and verify each component of the input.
   - **Fix**: Add a directive to verify each item in the breakdown. For example, "Verify each item in the breakdown to ensure the total is correct."

3. **Consistency in Units**:
   - The prompt should guide the model to pay attention to the units used (e.g., stalks, heads) and ensure they are consistently and correctly counted.
   - **Fix**: Include a reminder to check the consistency of units. For example, "Ensure consistency in the units used for counting items."

4. **Validation S

In [17]:
# Apply the update: the optimizer rewrites system_prompt's value (the new
# text is printed in the next cell).
optimizer.step()

In [18]:
# Show the system prompt after the optimization step.
print(f"new system prompt:\n{system_prompt.get_value()}")

new system prompt:
You are a concise and clear LLM. Think step by step. Ensure all arithmetic calculations are accurate. Verify each item in the breakdown to ensure the total is correct. Ensure consistency in the units used for counting items. Perform a validation step to re-check calculations before finalizing the response. Sum up the items correctly to ensure the total is accurate.


In [19]:
# Second forward pass. `model` was built with the same system_prompt Variable,
# so this call is expected to use the rewritten prompt. Note that this
# overwrites the baseline `prediction` from the earlier cell.
prediction = model(question)

In [20]:
# Hand tally of the items listed in the question, for comparison with the
# model's answer:
#   2 stalks of celery
#   2 garlics
#   1 potato
#   3 heads of broccoli
#   1 carrot
#   1 yam
#   2 + 2 + 1 + 3 + 1 + 1 = 10

print(prediction)

Let's count the number of vegetables you have:

- 2 stalks of celery
- 2 garlics
- 1 potato
- 3 heads of broccoli
- 1 carrot
- 1 yam

Adding them up:
2 + 2 + 1 + 3 + 1 + 1 = 10

You have 10 vegetables in total.
