# Warm up

In [None]:
from lavague.core import  WorldModel, ActionEngine
from lavague.core.agents import WebAgent
from lavague.drivers.selenium import SeleniumDriver

from llama_index.llms.gemini import Gemini
from llama_index.embeddings.fireworks import FireworksEmbedding
from llama_index.multi_modal_llms.gemini import GeminiMultiModal

# Warm-up setup: build the models, the browser driver and the agent, then open
# the start page. Nothing is "run" yet; the following cells drive one step by hand.
# Three models are created: an embedding model and a text LLM (both handed to the
# ActionEngine below) and a multimodal LLM (handed to the WorldModel).
embed_model = FireworksEmbedding()
mm_llm = GeminiMultiModal(model_name="models/gemini-1.5-pro-latest")
llm = Gemini(model_name="models/gemini-1.5-flash-latest")

# NOTE(review): headless=False presumably opens a visible browser window;
# confirm against the SeleniumDriver documentation.
selenium_driver = SeleniumDriver(headless=False)
world_model = WorldModel(mm_llm=mm_llm)
action_engine = ActionEngine(llm=llm, embedding=embed_model, driver=selenium_driver)

agent = WebAgent(world_model, action_engine)

# `objective` is not given to the agent in this cell; a later cell passes it
# directly to world_model.get_instruction(...).
objective = "Go on the quicktour of PEFT"
url = "https://huggingface.co/docs"
agent.get(url)

In [None]:
# Drive a single world-model step by hand instead of calling agent.run():
# start a new logger run, read the agent's short-term memory, take an
# observation from the driver, and ask the world model for its output.
agent.logger.new_run()
current_state, past = agent.st_memory.get_state()
# `obs` is indexed with "screenshots_path" in a later cell, so it behaves like a mapping.
obs = selenium_driver.get_obs()

world_model_output = world_model.get_instruction(
    objective, current_state, past, obs
    )

# Print the raw output; a later cell extracts the engine name and instruction from it.
print(world_model_output)

In [None]:
from lavague.core.logger import load_images_from_folder

# Load the images stored under the observation's "screenshots_path" and show
# the first one (a bare trailing expression is rendered as the cell's output).
# NOTE(review): images[0] assumes at least one image was loaded.
images = load_images_from_folder(obs["screenshots_path"])
images[0]

In [None]:
from lavague.core.utilities.format_utils import (
    extract_next_engine,
    extract_world_model_instruction,
)

# Pull the two pieces the agent loop needs out of the world model's output:
# the name of the engine to use next and the instruction to give it.
next_engine_name = extract_next_engine(world_model_output)
instruction = extract_world_model_instruction(world_model_output)

# Print both so they can be checked against the raw output printed earlier.
print(next_engine_name)
print(instruction)

In [None]:
# Ask the action engine for the nodes associated with the instruction.
# The following cells treat each node as an HTML string: they render it with
# HTML(...) and join all nodes with "\n" to build the generation context.
nodes = action_engine.get_nodes(instruction)

In [None]:
from IPython.display import display, HTML, Code

# For every retrieved node, show it rendered as HTML and then as
# syntax-highlighted HTML source, followed by a dashed separator line.
for node in nodes:
    # display() publishes its arguments one after another, in order.
    display(HTML(node), Code(node, language="html"))
    print("--------------")

In [None]:
# Concatenate the retrieved nodes into a single newline-separated context string.
context = "\n".join(nodes)

# Ask the action engine to produce an action for the instruction given that context.
action = action_engine.get_action_from_context(context, instruction)

# Show the result as highlighted Python source; it is only displayed here, not executed.
# `display` and `Code` come from the IPython.display import in an earlier cell.
display(Code(action, language="python"))

In [None]:
from selenium.webdriver.common.by import By
# Bind the driver object held by SeleniumDriver to the name `driver`, which the
# pasted snippet in the next cell uses (driver.find_element(...)).
driver = selenium_driver.driver

In [None]:
# Snippet executed by hand against `driver`; the step-by-step comments below
# explain how the target element was chosen.

# Let's think step by step.

# First, we notice that the query asks us to click on the 'PEFT' option.

# In the provided HTML, several anchor elements could potentially lead to the 'PEFT' page.

# We need to identify the anchor element that corresponds to 'PEFT'.

# Examining the HTML structure, we find an anchor element with the text 'PEFT' at:
# /html/body/div/main/div[1]/nav/a[11]

# Its href attribute is '/docs/peft', which suggests that it leads to the 'PEFT' page.

# Thus, we believe this is the element to interact with.
# NOTE(review): this is an absolute, position-based XPath (a[11]); it will stop
# matching if the page's navigation entries change.
peft_link = driver.find_element(By.XPATH, "/html/body/div/main/div[1]/nav/a[11]")

# Then we can click on the link.
peft_link.click()

# Hugging Face paper researcher Agent: First try

In [None]:
!rm -rf screenshots/
from selenium.webdriver.common.by import By

from lavague.core import  WorldModel, ActionEngine
from lavague.core.agents import WebAgent
from lavague.drivers.selenium import SeleniumDriver
from lavague.core.world_model import WORLD_MODEL_GENERAL_EXAMPLES

from llama_index.llms.gemini import Gemini
from llama_index.embeddings.fireworks import FireworksEmbedding
from llama_index.multi_modal_llms.gemini import GeminiMultiModal

# First full agent run: same model line-up as the warm-up, but the world model
# is given the library's general examples explicitly.
embed_model = FireworksEmbedding()
mm_llm = GeminiMultiModal(model_name="models/gemini-1.5-pro-latest")
llm = Gemini(model_name="models/gemini-1.5-flash-latest")

# Baseline: only the stock examples, no task-specific additions.
examples = WORLD_MODEL_GENERAL_EXAMPLES

# NOTE(review): this creates a new SeleniumDriver; nothing in this cell closes a
# driver created by an earlier cell.
selenium_driver = SeleniumDriver(headless=False)
world_model = WorldModel(mm_llm=mm_llm, examples=examples)
action_engine = ActionEngine(llm=llm, embedding=embed_model, driver=selenium_driver)

agent = WebAgent(world_model, action_engine)

url = "https://huggingface.co/papers"
objective = """What is the most trendy recent paper on In Context learning (ICL) on Hugging Face papers?
Provide the date and a summary of the paper."""

# Open the start page, then let the agent work on the objective.
agent.get(url)
agent.run(objective)

In [None]:
# Fetch the agent's log via return_pandas() and show it as the cell output.
# The next cell reads its `world_model_output` column.
df = agent.logger.return_pandas()
df

In [None]:
# Print every logged world-model output, each followed by a dashed rule on its
# own line, so consecutive steps are easy to tell apart.
for output in df.world_model_output.values:
    print(output, "--------------------------------------------------", sep="\n")

In [None]:
# Print the stock examples that were handed to the WorldModel in the run above,
# for comparison with the world-model outputs just printed.
print(WORLD_MODEL_GENERAL_EXAMPLES)

In [None]:
!rm -rf screenshots/
from selenium.webdriver.common.by import By

from lavague.core import  WorldModel, ActionEngine
from lavague.core.agents import WebAgent
from lavague.drivers.selenium import SeleniumDriver
from lavague.core.world_model import WORLD_MODEL_GENERAL_EXAMPLES

from llama_index.llms.gemini import Gemini
from llama_index.embeddings.fireworks import FireworksEmbedding
from llama_index.multi_modal_llms.gemini import GeminiMultiModal
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

# One extra example for the world model, written in an
# Objective / Previous instructions / Current state / Thoughts / Next engine /
# Instruction layout. It shows the model choosing the "Navigation Controls"
# engine with the instruction SCAN on the Hugging Face papers page.
# NOTE(review): the identifier is misspelled ("knoweldge"); it is left unchanged
# here because it is a notebook-level name that other cells may reference.
# NOTE(review): the text contains a duplicated word ("papers papers"); it is part
# of the prompt sent to the model, so it is left untouched in this edit.
added_knoweldge = """
Objective: Find the latest papers on Fine tuning
Previous instructions:
- SCAN
- Click on 'Previous'
Last engine: [NONE]
Current state:
external_observations:
  vision: '[SCREENSHOT]'
internal_state:
    agent_outputs: []
    user_inputs: []

Thoughts:
- The current screenshot shows the top of a page showing papers papers published on the 22nd May 2024 on Hugging Face.
- The objective is to find the latest papers on Fine tuning.
- As we need to find the latest papers, the best next step is to gather more information to see if this page contains the information we need.
- The best next step is to use the Navigation Controls to take a screenshot of the whole page to find the latest papers on Fine tuning.
Next engine: Navigation Controls
Instruction: SCAN
"""

# Second agent run: same objective and start URL as the first try, but the
# action-engine LLM is "models/gemini-1.5-pro" and the world model receives the
# stock examples plus the extra example defined above.
embed_model = FireworksEmbedding()
mm_llm = GeminiMultiModal(model_name="models/gemini-1.5-pro-latest")
# Alternative world-model backend, kept for quick switching:
# mm_llm = OpenAIMultiModal(model="gpt-4o")
llm = Gemini(model_name="models/gemini-1.5-pro")

# Plain string concatenation: the extra example is appended after the stock ones.
# NOTE(review): no "-----" delimiter is inserted between the two parts, whereas a
# later variant of this string in the file separates its examples with "-----"
# lines; confirm that WORLD_MODEL_GENERAL_EXAMPLES already ends with a delimiter.
examples = WORLD_MODEL_GENERAL_EXAMPLES + added_knoweldge

selenium_driver = SeleniumDriver(headless=False)
world_model = WorldModel(mm_llm=mm_llm, examples=examples)
action_engine = ActionEngine(llm=llm, embedding=embed_model, driver=selenium_driver)

agent = WebAgent(world_model, action_engine)

url = "https://huggingface.co/papers"
objective = """What is the most trendy recent paper on In Context learning (ICL) on Hugging Face papers?
Provide the date and a summary of the paper."""

# Open the start page, then let the agent work on the objective.
agent.get(url)
agent.run(objective)

In [None]:
!rm -rf screenshots/
from selenium.webdriver.common.by import By

from lavague.core import  WorldModel, ActionEngine
from lavague.core.agents import WebAgent
from lavague.drivers.selenium import SeleniumDriver
from lavague.core.world_model import WORLD_MODEL_GENERAL_EXAMPLES

from llama_index.llms.gemini import Gemini
from llama_index.embeddings.fireworks import FireworksEmbedding
from llama_index.multi_modal_llms.gemini import GeminiMultiModal
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

# Three extra examples for the world model, separated by "-----" lines:
#   1. at the top of the papers page, choose "Navigation Controls" with SCAN;
#   2. after a scan that surfaced a matching paper, choose "Navigation Engine"
#      and click that paper;
#   3. after a scan that found nothing relevant, choose "Navigation Engine" and
#      click the 'Previous' button to look at earlier papers.
# NOTE(review): the identifier is misspelled ("knoweldge"); it is left unchanged
# here because it is a notebook-level name that other cells may reference.
# NOTE(review): the first example contains a duplicated word ("papers papers");
# it is part of the prompt sent to the model, so it is left untouched in this edit.
added_knoweldge = """
Objective: Find the latest papers on Fine tuning
Previous instructions:
- SCAN
- Click on 'Previous'
Last engine: [NONE]
Current state:
external_observations:
  vision: '[SCREENSHOT]'
internal_state:
    agent_outputs: []
    user_inputs: []

Thoughts:
- The current screenshot shows the top of a page showing papers papers published on the 22nd May 2024 on Hugging Face.
- The objective is to find the latest papers on Fine tuning.
- As we need to find the latest papers, the best next step is to gather more information to see if this page contains the information we need.
- The best next step is to use the Navigation Controls to take a screenshot of the whole page to find the latest papers on Fine tuning.
Next engine: Navigation Controls
Instruction: SCAN
-----
Objective: Find the latest papers on Fine tuning
Previous instructions:
- SCAN
Last engine: [NONE]
Current state:
external_observations:
  vision: '[SCREENSHOT]'
internal_state:
    agent_outputs: []
    user_inputs: []

Thoughts:
- The page has been scanned and screenshots show we are on today's papers page of Hugging Face.
- The objective is to find the latest papers on Fine tuning.
- There has been a paper mentioning Fine tuning called 'Face Adapter for Pre-Trained Diffusion Models with Fine-Grained ID and Attribute Control'
- The best next step is to go on the paper page and provide a summary.
Next engine: Navigation Engine
Instruction: Click on the paper 'Face Adapter for Pre-Trained Diffusion Models with Fine-Grained ID and Attribute Control'
-----
Objective: Find the latest papers on Multi Modal Models
Previous instructions:
- SCAN
Last engine: Navigation Controls
Current state:
external_observations:
  vision: '[SCREENSHOTS]'
internal_state:
    agent_outputs: []
    user_inputs: []

Thoughts:
- The current screenshots show a list of papers on Hugging Face.
- The objective is to find the latest paper on Multi Modal Models.
- The whole page has been scanned and no relevant paper has been found.
- One screenshot shows a 'Previous' button. As the current page seems to show the papers of the day, and we haven't found any matching our objective, the best next step is to click on the 'Previous' button to find earlier papers on Multi Modal Models.
Next engine: Navigation Engine
Instruction: Click on the 'Previous' button.
"""

# Third agent run: the world model is backed by OpenAI's multimodal "gpt-4o"
# (the Gemini variant is kept commented out), the action-engine LLM is
# "models/gemini-1.5-flash-latest", and the run starts on a dated papers page.
embed_model = FireworksEmbedding()
# Alternative world-model backend, kept for quick switching:
# mm_llm = GeminiMultiModal(model_name="models/gemini-1.5-pro-latest")
mm_llm = OpenAIMultiModal(model="gpt-4o")
llm = Gemini(model_name="models/gemini-1.5-flash-latest")

# Plain string concatenation: the extra examples are appended after the stock ones.
# NOTE(review): the extra examples are separated from each other by "-----"
# lines, but no delimiter is inserted between the stock examples and the first
# extra one; confirm that WORLD_MODEL_GENERAL_EXAMPLES already ends with one.
examples = WORLD_MODEL_GENERAL_EXAMPLES + added_knoweldge

selenium_driver = SeleniumDriver(headless=False)
world_model = WorldModel(mm_llm=mm_llm, examples=examples)
action_engine = ActionEngine(llm=llm, embedding=embed_model, driver=selenium_driver)

agent = WebAgent(world_model, action_engine)

# Start from the papers listing pinned to a fixed date (2024-05-20) rather than
# the undated listing used by the earlier runs.
url = "https://huggingface.co/papers?date=2024-05-20"
objective = """What is the most trendy recent paper on In Context learning (ICL) on Hugging Face papers?
Provide the date and a summary of the paper."""

# Open the start page, then let the agent work on the objective.
agent.get(url)
agent.run(objective)