# Install Necessary Libraries

In [None]:
! pip install openai groq

Collecting openai
  Downloading openai-1.49.0-py3-none-any.whl.metadata (24 kB)
Collecting groq
  Downloading groq-0.11.0-py3-none-any.whl.metadata (13 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.49.0-py3-none-any.whl (378 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.1/378.1 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading groq-0.11.0-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.5/106.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:

# Set up Groq, OpenRouter, & OpenAI clients

In [None]:
from openai import OpenAI
from google.colab import userdata
import os
import json
from groq import Groq
import json
from typing import List, Dict, Any, Callable
import ast
import io
import sys

# Read each provider key through google.colab's userdata helper and mirror it
# into the process environment, so the SDK clients below can be built from
# environment variables.
groq_api_key = userdata.get("GROQ_API_KEY")
os.environ["GROQ_API_KEY"] = groq_api_key

openrouter_api_key = userdata.get("OPENROUTER_API_KEY")
os.environ["OPENROUTER_API_KEY"] = openrouter_api_key

openai_api_key = userdata.get("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = openai_api_key

# Native Groq SDK client.
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# OpenRouter is reached through the OpenAI SDK by pointing it at a different base URL.
openrouter_client = OpenAI(
    api_key=os.environ.get("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
)

# OpenAI client with the base URL spelled out explicitly.
openai_client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://api.openai.com/v1",
)

### Define functions to easily query and compare responses from OpenAI, Groq, and OpenRouter

In [None]:
def get_llm_response(client, prompt, openai_model="o1-preview", json_mode=False):
    """Send a single-turn prompt to the chosen provider and return the reply text.

    Args:
        client: Provider selector, either "openai" or "groq".
        prompt: Text sent as the only user message.
        openai_model: Model name used when ``client`` is "openai".
        json_mode: When True, ``response_format={"type": "json_object"}`` is
            added to the request.

    Returns:
        The ``content`` of the first choice in the provider's response.

    Raises:
        ValueError: If ``client`` is neither "openai" nor "groq".
        Exception: Whatever the OpenAI client raises, or whatever the
            OpenRouter client raises when it is used as the last-resort
            fallback for "groq".
    """
    # Everything except the model name is shared by all providers.
    request = {"messages": [{"role": "user", "content": prompt}]}
    if json_mode:
        request["response_format"] = {"type": "json_object"}

    if client == "openai":
        response = openai_client.chat.completions.create(model=openai_model, **request)

    elif client == "groq":
        groq_models = [
            "llama-3.1-8b-instant",
            "llama-3.1-70b-versatile",
            "llama3-70b-8192",
            "llama3-8b-8192",
            "gemma2-9b-it",
        ]

        # Try each Groq model in order and keep the first one that answers.
        for model in groq_models:
            try:
                response = groq_client.chat.completions.create(model=model, **request)
                break
            except Exception as e:
                print(f"Error: {e}")
        else:
            # Reached only when every Groq model failed. Previously this
            # fallback sat in an outer `except` that could never fire, leaving
            # `response` unbound at the return statement.
            response = openrouter_client.chat.completions.create(
                model="meta-llama/llama-3.1-8b-instruct:free", **request
            )

    else:
        raise ValueError(f"Invalid client: {client}")

    return response.choices[0].message.content


def evaluate_responses(prompt, reasoning_prompt=False, openai_model="o1-preview"):
    """Print the OpenAI and Groq answers to the same prompt, one after the other.

    Args:
        prompt: The question to ask.
        reasoning_prompt: Optional extra instruction. When truthy it is
            appended to ``prompt`` after a blank line, followed by a period.
        openai_model: Model name forwarded to the OpenAI request; the Groq
            request uses get_llm_response's own model selection.

    Returns:
        None. Both answers are written to stdout.
    """
    full_prompt = f"{prompt}\n\n{reasoning_prompt}." if reasoning_prompt else prompt

    # OpenAI is queried first, then Groq; both finish before anything is printed.
    answer_openai = get_llm_response("openai", full_prompt, openai_model)
    answer_groq = get_llm_response("groq", full_prompt)

    for line in (
        f"OpenAI Response: {answer_openai}",
        f"\n\nGroq Response: {answer_groq}",
    ):
        print(line)

In [None]:
# Baseline comparison: evaluate_responses is called with its default OpenAI
# model ("o1-preview") and no extra reasoning instruction.
prompt1 = "How many r's are in the word 'strawberry'?"
evaluate_responses(prompt1)

OpenAI Response: There are three "r"s in the word "strawberry".


Groq Response: In the word 'strawberry', there are 2 'R's.


In [None]:
# Same question, with gpt-3.5-turbo as the OpenAI model.
evaluate_responses(prompt1, openai_model="gpt-3.5-turbo")

OpenAI Response: There are two r's in the word 'strawberry'.


Groq Response: There are 2 'r's in the word 'strawberry'.


In [None]:
# Same question, with gpt-4o-mini as the OpenAI model.
evaluate_responses(prompt1, openai_model="gpt-4o-mini")

OpenAI Response: The word "strawberry" contains 2 letters 'r'.


Groq Response: There are 2 r's in the word 'strawberry'.


In [None]:
# Second probe: a decimal comparison, again with the default OpenAI model.
prompt2 = "9.9 or 9.11, which number is bigger?"
evaluate_responses(prompt2)

OpenAI Response: To determine which number is larger between 9.9 and 9.11, we'll compare them digit by digit.

First, both numbers have the same whole number part: **9**.

Next, we'll compare the decimal parts:

- **9.9** can be thought of as **9.90** for easier comparison.
- **9.11** remains as **9.11**.

Now, compare the first digit after the decimal:

- **9** in **9.90**
- **1** in **9.11**

Since **9 > 1**, it's clear that **9.9** is larger than **9.11**.

**Answer:** 9.9 is larger than 9.11; thus, 9.9 is the bigger number.


Groq Response: 9.11 is bigger than 9.9. It's quite straightforward in a simple decimal comparison.


In [None]:
# Re-ask prompt1 with a plan-then-solve instruction. evaluate_responses appends
# this text to the question after a blank line and adds a trailing period, so
# the combined prompt ends with "..".
reasoning_prompt = "Let's first understand the problem and devise a plan to solve it. Then, let's carry out the plan and solve the problem step by step."
evaluate_responses(prompt1, reasoning_prompt)

OpenAI Response: Certainly! Let's solve the problem step by step.

---

**1. Understand the Problem**

We are asked to find out how many times the letter **'r'** appears in the word **'strawberry'**.

**2. Devise a Plan**

- Write down the word clearly.
- Examine each letter in the word one by one.
- Count and keep track of every time the letter 'r' appears.

**3. Carry Out the Plan**

Let's write down the word and analyze each letter:

**Word:** **S T R A W B E R R Y**

Now, let's go through each letter:

1. **S** - Not 'r' (Count = 0)
2. **T** - Not 'r' (Count = 0)
3. **R** - This is 'r' (Count = 1)
4. **A** - Not 'r' (Count = 1)
5. **W** - Not 'r' (Count = 1)
6. **B** - Not 'r' (Count = 1)
7. **E** - Not 'r' (Count = 1)
8. **R** - This is 'r' (Count = 2)
9. **R** - This is 'r' (Count = 3)
10. **Y** - Not 'r' (Count = 3)

**4. Answer**

After examining each letter, we have found that the letter **'r'** appears **3 times** in the word **'strawberry'**.

---

**Conclusion:** There are 

In [None]:
# Longer variant of the reasoning instruction: plan, verify the plan, solve,
# verify the answer by working backwards, then reflect and correct.
# This rebinds `reasoning_prompt` from the earlier cell.
reasoning_prompt = """Let's first understand the problem and devise a plan to solve it. Verify the plan step by step. Then, let's carry out the plan and solve the problem step by step. Lastly, let's verify the answer step by step by working backwards. Determine if the answer is correct or not, and if not, reflect on on why it is incorrect. Then, think step by step about how to arrive at the correct answer."""
evaluate_responses(prompt1, reasoning_prompt)

OpenAI Response: **Understanding the Problem:**

We need to determine **how many times the letter 'r' appears** in the word **'strawberry'**.

---

**Devise a Plan to Solve It:**

1. **Write down** the word 'strawberry'.
2. **List each letter** in the word sequentially.
3. **Identify** every occurrence of the letter 'r'.
4. **Count** the total number of times 'r' appears.

---

**Verify the Plan Step by Step:**

- **Step 1 and 2** will help us see all letters clearly.
- **Step 3** ensures we find all 'r's without missing any.
- **Step 4** gives us the total count required.

---

**Carry Out the Plan and Solve the Problem Step by Step:**

1. **Write down the word:**

   ```
   S T R A W B E R R Y
   ```

2. **List each letter with its position:**

   - Position 1: **S**
   - Position 2: **T**
   - Position 3: **R**
   - Position 4: **A**
   - Position 5: **W**
   - Position 6: **B**
   - Position 7: **E**
   - Position 8: **R**
   - Position 9: **R**
   - Position 10: **Y**

3. **Identi

### To create our AI Agent, we will define the following functions:

1. **Planner:** This function takes a user's query and breaks it down into smaller, manageable subtasks. It returns these subtasks as a list, where each one is either a reasoning task or a code generation task.

2. **Reasoner:** This function provides reasoning on how to complete a specific subtask, considering both the overall query and the results of any previous subtasks. It returns a short explanation on how to proceed with the current subtask.

3. **Actioner:** Based on the reasoning provided for a subtask, this function decides whether the next step requires generating code or more reasoning. It then returns the chosen action and any necessary details to perform it.

4. **Evaluator:** This function checks if the result of the current subtask is reasonable and aligns with the overall goal. It returns an evaluation of the result and indicates whether the subtask needs to be retried.

5. **generate_and_execute_code:** This function generates and executes Python code based on a given prompt and memory of previous steps. It returns both the generated code and its execution result.

6. **executor:** Depending on the action decided by the "actioner," this function either generates and executes code or returns reasoning. It handles the execution of tasks based on the action type.

7. **final_answer_extractor:** After all subtasks are completed, this function gathers the results from previous steps to extract and provide the final answer to the user's query.

8. **autonomous_agent:** This is the main function that coordinates the process of answering the user's query. It manages the entire sequence of planning, reasoning, action, evaluation, and final answer extraction to produce a complete response.

In [None]:
def planner(user_query: str) -> List[str]:
    """Ask the LLM (via the "groq" route, JSON mode) to split a query into subtasks.

    Args:
        user_query: The question the agent is trying to answer.

    Returns:
        The value stored under the "subtasks" key of the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        KeyError: If the reply has no "subtasks" key.
    """
    planning_prompt = f"""Given the user's query: '{user_query}', break down the query into as few subtasks as possible in order to anser the question.
    Each subtask is either a calculation or reasoning step. Never duplicate a task.

    Here are the only 2 actions that can be taken for each subtask:
        - generate_code: This action involves generating Python code and executing it in order to make a calculation or verification.
        - reasoning: This action involves providing reasoning for what to do to complete the subtask.

    Each subtask should begin with either "reasoning" or "generate_code".

    Keep in mind the overall goal of answering the user's query throughout the planning process.

    Return the result as a JSON list of strings, where each string is a subtask.

    Here is an example JSON response:

    {{
        "subtasks": ["Subtask 1", "Subtask 2", "Subtask 3"]
    }}
    """
    raw_plan = get_llm_response("groq", planning_prompt, json_mode=True)
    plan = json.loads(raw_plan)
    # Echo the whole parsed reply so the notebook shows what the model returned.
    print(plan)
    return plan["subtasks"]

def reasoner(user_query: str, subtasks: List[str], current_subtask: str, memory: List[Dict[str, Any]]) -> str:
    """Ask the LLM for a short explanation of how to carry out one subtask.

    Args:
        user_query: The overall question being answered.
        subtasks: The full plan; embedded in the prompt as JSON.
        current_subtask: The subtask to reason about.
        memory: Records of earlier steps; embedded in the prompt as JSON, so
            it must be JSON-serialisable.

    Returns:
        The value stored under the "reasoning" key of the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        KeyError: If the reply has no "reasoning" key.
    """
    reasoning_request = f"""Given the user's query (long-term goal): '{user_query}'

    Here are all the subtasks to complete in order to answer the user's query:
    <subtasks>
        {json.dumps(subtasks)}
    </subtasks>

    Here is the short-term memory (result of previous subtasks):
    <memory>
        {json.dumps(memory)}
    </memory>

    The current subtask to complete is:
    <current_subtask>
        {current_subtask}
    </current_subtask>

    - Provide concise reasoning on how to execute the current subtask, considering previous results.
    - Prioritize explicit details over assumed patterns
    - Avoid unnecessary complications in problem-solving

    Return the result as a JSON object with 'reasoning' as a key.

    Example JSON response:
    {{
        "reasoning": "2 sentences max on how to complete the current subtask."
    }}
    """

    raw_reply = get_llm_response("groq", reasoning_request, json_mode=True)
    return json.loads(raw_reply)["reasoning"]

def actioner(user_query: str, subtasks: List[str], current_subtask: str, reasoning: str, memory: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Ask the LLM which action to take for a subtask, given the reasoning for it.

    Args:
        user_query: The overall question being answered.
        subtasks: The full plan; embedded in the prompt as JSON.
        current_subtask: The subtask being worked on.
        reasoning: The explanation previously produced for this subtask.
        memory: Records of earlier steps; embedded in the prompt as JSON.

    Returns:
        The model's JSON reply parsed into a dict. The prompt asks for the
        keys 'action' and 'parameters', but the reply is not validated here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    action_request = f"""Given the user's query (long-term goal): '{user_query}'

    The subtasks are:
    <subtasks>
        {json.dumps(subtasks)}
    </subtasks>

    The current subtask is:
    <current_subtask>
        {current_subtask}
    </current_subtask>

    The reasoning for this subtask is:
    <reasoning>
        {reasoning}
    </reasoning>

    Here is the short-term memory (result of previous subtasks):
    <memory>
        {json.dumps(memory)}
    </memory>

    Determine the most appropriate action to take:
        - If the task requires a calculation or verification through code, use the 'generate_code' action.
        - If the task requires reasoning without code or calculations, use the 'reasoning' action.

    Consider the overall goal and previous results when determining the action.

    Return the result as a JSON object with 'action' and 'parameters' keys.  The 'parameters' key should always be a dictionary with 'prompt' as a key.

    Example JSON responses:

    {{
        "action": "generate_code",
        "parameters": {{"prompt": "Write a function to calculate the area of a circle."}}
    }}

    {{
        "action": "reasoning",
        "parameters": {{"prompt": "Explain how to complete the subtask."}}
    }}
    """

    raw_reply = get_llm_response("groq", action_request, json_mode=True)
    return json.loads(raw_reply)

def evaluator(user_query: str, subtasks: List[str], current_subtask: str, action_info: Dict[str, Any], execution_result: Dict[str, Any], memory: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Ask the LLM whether a subtask's outcome is reasonable and whether to retry.

    Args:
        user_query: The overall question being answered.
        subtasks: The full plan; embedded in the prompt as JSON.
        current_subtask: The subtask that was just executed.
        action_info: The chosen action; interpolated with ``str()``, not JSON.
        execution_result: What executing the action produced; also
            interpolated with ``str()``.
        memory: Records of earlier steps; embedded in the prompt as JSON.

    Returns:
        The model's JSON reply parsed into a dict. The prompt asks for the
        keys 'evaluation' and 'retry', but the reply is not validated here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    evaluation_request = f"""Given the user's query (long-term goal): '{user_query}'

    The subtasks to complete to answer the user's query are:
    <subtasks>
        {json.dumps(subtasks)}
    </subtasks>

    The current subtask to complete is:
    <current_subtask>
        {current_subtask}
    </current_subtask>

    The result of the current subtask is:
    <result>
        {action_info}
    </result>

    The execution result of the current subtask is:
    <execution_result>
        {execution_result}
    </execution_result>

    Here is the short-term memory (result of previous subtasks):
    <memory>
        {json.dumps(memory)}
    </memory>

    Evaluate if the result is a reasonable answer for the current subtask, and makes sense in the context of the overall query.

    Return a JSON object with 'evaluation' (string) and 'retry' (boolean) keys.

    Example JSON response:
    {{
        "evaluation": "The result is a reasonable answer for the current subtask.",
        "retry": false
    }}
    """

    raw_reply = get_llm_response("groq", evaluation_request, json_mode=True)
    return json.loads(raw_reply)

def final_answer_extractor(user_query: str, subtasks: List[str], memory: List[Dict[str, Any]]) -> str:
    """Ask the LLM to distil the final answer from the accumulated step memory.

    Args:
        user_query: The overall question being answered.
        subtasks: The plan that was executed; embedded in the prompt as JSON.
        memory: Records of every step taken; embedded in the prompt as JSON.

    Returns:
        The value stored under the "finalAnswer" key of the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        KeyError: If the reply has no "finalAnswer" key.
    """
    extraction_request = f"""Given the user's query (long-term goal): '{user_query}'

    The subtasks completed to answer the user's query are:
    <subtasks>
        {json.dumps(subtasks)}
    </subtasks>

    The memory of the thought process (short-term memory) is:
    <memory>
        {json.dumps(memory)}
    </memory>

    Extract the final answer that directly addresses the user's query, from the memory.
    Provide only the essential information without unnecessary explanations.

    Return a JSON object with 'finalAnswer' as a key.

    Here is an example JSON response:
    {{
        "finalAnswer": "The final answer to the user's query, addressing all aspects of the question, based on the memory provided",
    }}
    """

    raw_reply = get_llm_response("groq", extraction_request, json_mode=True)
    parsed_reply = json.loads(raw_reply)
    return parsed_reply["finalAnswer"]


def generate_and_execute_code(prompt: str, user_query: str, memory: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Have the LLM write Python for a task, run it, and capture what it prints.

    Args:
        prompt: Description of the code to generate.
        user_query: The overall question, included in the prompt for context.
        memory: Records of earlier steps; embedded in the prompt as JSON.

    Returns:
        A dict with two string entries:
          - "generated_code": the model's reply, unmodified.
          - "execution_result": the stripped stdout of running that code. If
            the code raises, the captured output is followed by a line of the
            form "ExceptionType: message" instead of the error propagating.
    """
    code_generation_prompt = f"""

    Generate Python code to implement the following task: '{prompt}'

    Here is the overall goal of answering the user's query: '{user_query}'

    Keep in mind the results of the previous subtasks, and use them to complete the current subtask.
    <memory>
        {json.dumps(memory)}
    </memory>


    Here are the guidelines for generating the code:
        - Return only the Python code, without any explanations or markdown formatting.
        - The code should always print or return a value
        - Don't include any backticks or code blocks in your response. Do not include ```python or ``` in your response, just give me the code.
        - Do not ever use the input() function in your code, use defined values instead.
        - Do not ever use NLP techniques in your code, such as importing nltk, spacy, or any other NLP library.
        - Don't ever define a function in your code, just generate the code to execute the subtask.
        - Don't ever provide the execution result in your response, just give me the code.
        - If your code needs to import any libraries, do it within the code itself.
        - The code should be self-contained and ready to execute on its own.
        - Prioritize explicit details over assumed patterns
        - Avoid unnecessary complications in problem-solving
    """

    generated_code = get_llm_response("groq", code_generation_prompt)

    print(f"\n\nGenerated Code: start|{generated_code}|END\n\n")

    # SECURITY: the code below is model-written and runs with the full
    # privileges of this process. Only use this in a disposable environment.
    buffer = io.StringIO()
    old_stdout = sys.stdout
    sys.stdout = buffer
    try:
        # A single fresh dict serves as both globals and locals, so names the
        # generated code defines at top level are visible inside its own
        # comprehensions, lambdas and functions. It also keeps the generated
        # code from reading this function's variables.
        exec(generated_code, {"__name__": "__main__"})
    except Exception as exc:
        # Still redirected here, so the error text lands in the captured
        # output and the caller's evaluator can see what went wrong.
        print(f"{type(exc).__name__}: {exc}")
    finally:
        # Always put the real stdout back, otherwise every later print in the
        # notebook would vanish into the buffer.
        sys.stdout = old_stdout

    output = buffer.getvalue()

    print(f"\n\n***** Execution Result: |start|{output.strip()}|end| *****\n\n")

    return {
        "generated_code": generated_code,
        "execution_result": output.strip()
    }

def executor(action: str, parameters: Dict[str, Any], user_query: str, memory: List[Dict[str, Any]]) -> Any:
    """Carry out the action chosen for a subtask.

    Args:
        action: "generate_code" or "reasoning"; anything else is reported as
            not implemented.
        parameters: Must contain "prompt" for the two supported actions.
        user_query: The overall question, forwarded to code generation.
        memory: Records of earlier steps, forwarded to code generation.

    Returns:
        - for "reasoning": ``parameters["prompt"]`` unchanged;
        - for "generate_code": whatever generate_and_execute_code returns;
        - otherwise: the string "Action '<action>' not implemented".
    """
    if action == "reasoning":
        return parameters["prompt"]

    if action != "generate_code":
        return f"Action '{action}' not implemented"

    task_prompt = parameters["prompt"]
    print(f"Generating code for: {task_prompt}")
    return generate_and_execute_code(task_prompt, user_query, memory)



def autonomous_agent(user_query: str, max_retries: int = 1) -> str:
    """Plan, execute and evaluate subtasks for a query, then return the final answer.

    For each subtask produced by planner, the loop runs reasoner -> actioner ->
    executor -> evaluator and appends a record of the step to an in-memory
    list that is passed to every later call.

    Args:
        user_query: The question to answer.
        max_retries: Maximum number of attempts per subtask (values below 1
            are treated as 1). An attempt is repeated only while the evaluator
            replies with a truthy 'retry'.

    Returns:
        The string produced by final_answer_extractor.
    """
    memory = []
    subtasks = planner(user_query)

    print("User Query:", user_query)
    print(f"Subtasks: {subtasks}")

    attempts_allowed = max(1, max_retries)

    for subtask in subtasks:
        for attempt in range(attempts_allowed):

            reasoning = reasoner(user_query, subtasks, subtask, memory)
            action_info = actioner(user_query, subtasks, subtask, reasoning, memory)

            print(f"\n\n ****** Action Info: {action_info} ****** \n\n")

            execution_result = executor(action_info["action"], action_info["parameters"], user_query, memory)

            print(f"\n\n ****** Execution Result: {execution_result} ****** \n\n")
            evaluation = evaluator(user_query, subtasks, subtask, action_info, execution_result, memory)

            step = {
                "subtask": subtask,
                "reasoning": reasoning,
                "action": action_info,
                # Without this entry the output of executed code never reaches
                # later subtasks or the final-answer extraction, which only
                # see `memory`.
                "execution_result": execution_result,
                "evaluation": evaluation
            }
            memory.append(step)

            print(f"\n\nSTEP: {step}\n\n")

            # A reply that omits 'retry' is treated as "do not retry".
            if not evaluation.get("retry", False):
                break

            if attempt == attempts_allowed - 1:
                print(f"Max retries reached for subtask: {subtask}")

    final_answer = final_answer_extractor(user_query, subtasks, memory)
    return final_answer


In [None]:
# Run the full agent loop on the letter-counting question and print the
# string it returns.
query = "How many r's are in strawberry?"
result = autonomous_agent(query)
print("FINAL ANSWER: ", result)

In [None]:
# Display the value currently bound to `result` as the cell's output.
result

"There are 3 r's in 'strawberry'."

# OpenAI o1-preview model getting trivial questions wrong

- [Link to Reddit post](https://www.reddit.com/r/ChatGPT/comments/1ff9w7y/new_o1_still_fails_miserably_at_trivial_questions/)
- Links to ChatGPT threads: [(1)](https://chatgpt.com/share/66f21757-db2c-8012-8b0a-11224aed0c29), [(2)](https://chatgpt.com/share/66e3c1e5-ae00-8007-8820-fee9eb61eae5)
- [Improving reasoning in LLMs through thoughtful prompting](https://www.reddit.com/r/singularity/comments/1fdhs2m/did_i_just_fix_the_data_overfitting_problem_in/?share_id=6DsDLJUu1qEx_bsqFDC8a&utm_content=2&utm_medium=ios_app&utm_name=ioscss&utm_source=share&utm_term=1)


In [None]:
# Send the surgeon question straight to OpenAI; get_llm_response defaults to
# the "o1-preview" model. Note the question itself states that the surgeon is
# the boy's father.
query = "The surgeon, who is the boy's father, says, 'I can't operate on this boy, he's my son!' Who is the surgeon to the boy?"
result = get_llm_response("openai", query)
print(result)

The surgeon is the boy's **mother**.

This classic riddle challenges the assumption that a surgeon must be male. Here's the breakdown:

- **The boy's father** is one person—the man who raised him.
- When the surgeon says, "I can't operate on this boy; he's my son," it implies a parental relationship.
- Since we've already accounted for the father, the other parent is the **mother**.
- Therefore, the surgeon is the boy's mother.

The riddle highlights how stereotypes can lead us to overlook obvious answers.


In [None]:
# Same surgeon question, this time answered by the agent loop.
query = "The surgeon, who is the boy's father, says, 'I can't operate on this boy, he's my son!' Who is the surgeon to the boy?"
result = autonomous_agent(query)
print("FINAL ANSWER: ", result)

{'subtasks': ['reasoning: Determine the role of the surgeon in relation to the boy', 'generate_code: Identify the relation between the surgeon and the boy using family relationships, assuming the surgeon is the father and the boy is his son', 'reasoning: Based on the established relation, conclude who the surgeon is to the boy']}
User Query: The surgeon, who is the boy's father, says, 'I can't operate on this boy, he's my son!' Who is the surgeon to the boy?
Subtasks: ['reasoning: Determine the role of the surgeon in relation to the boy', 'generate_code: Identify the relation between the surgeon and the boy using family relationships, assuming the surgeon is the father and the boy is his son', 'reasoning: Based on the established relation, conclude who the surgeon is to the boy']


 ****** Action Info: {'action': 'reasoning', 'parameters': {'prompt': "Based on the sentence 'The surgeon, who is the boy's father, says, 'I can't operate on this boy, he's my son!' Identify the relation bet

In [None]:
# Bear puzzle sent straight to OpenAI (default "o1-preview" model). The
# question text already says the bear is brown.
prompt = "The Bear Puzzle: A hunter leaves his tent. He travels 5 steps due south, 5 steps due east, and 5 steps due north. He arrives back at his tent, and sees a brown bear inside it. What color was the bear?"

result = get_llm_response("openai", prompt)
print(result)

To solve this puzzle, we need to analyze the hunter's movements and determine where on Earth this scenario could occur.

**Hunter's Movements:**
1. The hunter starts at his tent.
2. He travels 5 steps due south.
3. He travels 5 steps due east.
4. He travels 5 steps due north.
5. He arrives back at his tent.

**Analysis:**

- **Normal Geometry:** In regular Euclidean geometry, moving south, then east, then north of equal distances would not bring you back to your starting point unless special conditions are met.
  
- **Possible Locations:**
  - **Near the North Pole:** At the North Pole, all directions point south. If you move 5 steps south from the North Pole, you are at a latitude where you can move east in a circle around the pole. Since the meridians converge at the poles, moving east at this latitude keeps you at a constant distance from the pole. Then, moving 5 steps north brings you back to the North Pole.

- **Implication for the Bear:**
  - **Polar Bears:** Polar bears are nati

In [None]:
# Same bear puzzle, this time answered by the agent loop.
prompt = "The Bear Puzzle: A hunter leaves his tent. He travels 5 steps due south, 5 steps due east, and 5 steps due north. He arrives back at his tent, and sees a brown bear inside it. What color was the bear?"

result = autonomous_agent(prompt)
print(result)

{'subtasks': ["reasoning: Understand that the hunter's movements have no impact on the bear's initial color, since the puzzle doesn't mention the bear's movements or the color being changed by any physical means", 'reasoning: Recognize that the puzzle is presenting an impossible scenario, where the hunter returns to an initially inaccessible location', "reasoning: Deduce that the bear must have been placed there by an outside agent (e.g., another person) since the hunter cannot physically affect the bear's position in this way", "reasoning: Conclude that since there's no logical connection between the hunter's actions and the bear's initial color, the only solution is that the puzzle provided an outside piece of information that determined the bear's color"]}
User Query: The Bear Puzzle: A hunter leaves his tent. He travels 5 steps due south, 5 steps due east, and 5 steps due north. He arrives back at his tent, and sees a brown bear inside it. What color was the bear?
Subtasks: ["reaso