## Agent
* LLM을 통해 액션을 선택하여 연속적으로 실행
* 구성 요소
    1. Tools: Descriptions of available tools
    2. User input: high level objective
    3. Intermediate steps: Any (action, tool output) pairs previously executed in order to achieve the user input
* Output은 final response에 도달하기 위한 다음 action(`AgentAction`) 또는 종료 신호(`AgentFinish`)이며, 관련 타입은 다음과 같음
    1. `AgentAction`: This is a dataclass that represents the action an agent should take. It has a tool property (which is the name of the tool that should be invoked) and a tool_input property (the input to that tool)
    2. `AgentFinish`: This is a dataclass that signifies that the agent has finished and should return to the user. It has a return_values parameter, which is a dictionary to return. It often only has one key - output - that is a string, and so often it is just this key that is returned.
    3. `intermediate_steps`: These represent previous agent actions and corresponding outputs that are passed around. These are important to pass to future iteration so the agent knows what work it has already done. This is typed as a List[Tuple[AgentAction, Any]]. Note that observation is currently left as type Any to be maximally flexible. In practice, this is often a string.

### Tools
* functions that an agent can invoke
    1. 어떻게 agent에게 올바른 툴을 줄 수 있을까?
    2. agent에게 tool을 이해 시키기 위한 설명을 해줘야 함

### Toolkits
* task를 위한 3-5개의 유사한 툴을 통해 toolkit 생성


# 실행

In [81]:
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, AgentType
from langchain.tools import StructuredTool
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium import webdriver


# Chat model handed to the agent below. No model is pinned, so the library's
# default model is used; temperature is set to a low value (0.1).
llm = ChatOpenAI(
    temperature=0.1,
    # model="gpt-4",
)


def login_tool(login_url: str, id: str, pwd: str, btn_name: str):
    """Open a Chrome session, fill in the login form and submit it.

    A new Chrome instance is created on every call and stored in the
    module-level ``driver`` so the other tool functions can reuse it.

    Args:
        login_url: Address of the page that contains the login form.
        id: Value typed into the element with ``id="id"``.
        pwd: Value typed into the element with ``id="password"``.
        btn_name: Text contained in the ``<button>`` that submits the form.

    Returns:
        The module-level ``driver`` that was just created.
    """
    # The webdriver is not shared between tools, so it is kept as a global.
    global driver

    browser_opts = Options()
    # "detach" keeps the Chrome window open after the driver process exits.
    browser_opts.add_experimental_option("detach", True)

    driver = webdriver.Chrome(options=browser_opts)
    driver.implicitly_wait(3)
    driver.get(login_url)

    # Locate both fields before typing anything into either of them.
    id_field = driver.find_element(By.XPATH, '//*[@id="id"]')
    pwd_field = driver.find_element(By.XPATH, '//*[@id="password"]')
    id_field.send_keys(id)
    pwd_field.send_keys(pwd)

    submit_locator = (By.XPATH, f"//button[contains(., '{btn_name}')]")
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable(submit_locator)
    ).click()
    return driver


def first_tool(btn_name: str):
    """Click one element whose text contains ``btn_name``.

    The XPath depends on ``btn_name``: an ``<a>`` that wraps a matching
    ``<h3>``, a matching ``<a>``, or (the default) a matching ``<button>``.
    The function waits up to 10 seconds for the element to become clickable,
    clicks it, and returns the module-level ``driver``.

    Requires ``login_tool`` to have run first, since it reads the global
    ``driver`` that function creates.
    """
    # NOTE(review): as written, both conditions below test the same literal,
    # so the ``elif`` branch can never be taken. The two literals were
    # presumably different in the real code - confirm and restore them.
    if "<secret>" in btn_name:
        # Link whose heading (<h3>) carries the label.
        cond = f"//a[.//h3[contains(., '{btn_name}')]]"
    elif "<secret>" in btn_name:
        # Plain link carrying the label.
        cond = f"//a[contains(., '{btn_name}')]"
    else:
        # Anything else is treated as a regular button.
        cond = f"//button[contains(., '{btn_name}')]"

    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, cond))
    )
    element.click()
    return driver


def second_tool(click_btn, move_btn):
    """Select one option, then press the button that moves to the next page.

    Both clicks use an explicit wait (up to 10 seconds each), matching the
    other tool functions. Indexing ``find_elements(...)[0]`` instead would
    raise a bare ``IndexError`` whenever the option had not rendered yet.

    Args:
        click_btn: Text contained in the ``<p>`` element to select.
        move_btn: Text contained in the ``<button>`` that advances the page.

    Returns:
        The module-level ``driver`` (created by ``login_tool``).
    """
    wait = WebDriverWait(driver, 10)

    # First click: the first <p> containing the label, once it is clickable.
    option = wait.until(
        EC.element_to_be_clickable((By.XPATH, f"//p[contains(., '{click_btn}')]"))
    )
    option.click()

    # Second click: the navigation button.
    move_element = wait.until(
        EC.element_to_be_clickable((By.XPATH, f"//button[contains(., '{move_btn}')]"))
    )
    move_element.click()
    return driver


def third_tool(select_box_btn, grade_btn, selected_class_list, move_btn):
    """Open a select box, pick a grade and class option(s), then move on.

    Args:
        select_box_btn: Text contained in the ``<div>`` placeholder that
            opens the select box.
        grade_btn: Exact text of the ``<div>`` grade option to choose.
        selected_class_list: Labels of the ``<div>`` options to click. A
            single label given as a plain string is treated as a one-item
            list.
        move_btn: Text contained in the ``<button>`` that advances the page.

    Returns:
        The module-level ``driver`` (created by ``login_tool``).
    """
    wait = WebDriverWait(driver, 10)

    # The parameters are untyped, so a caller may pass one label as a bare
    # string; iterating that would click once per character.
    if isinstance(selected_class_list, str):
        selected_class_list = [selected_class_list]

    placeholder = wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, f"//div[contains(text(), '{select_box_btn}')]")
        )
    )
    placeholder.click()

    grade_element = wait.until(
        EC.element_to_be_clickable((By.XPATH, f"//div[text()='{grade_btn}']"))
    )
    grade_element.click()

    # TODO: add logic for selecting multiple <secret>
    for sc in selected_class_list:
        # Wait for each option explicitly, like every other click here.
        class_element = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, f"//div[contains(text(), '{sc}')]")
            )
        )
        class_element.click()

    move_element = wait.until(
        EC.element_to_be_clickable((By.XPATH, f"//button[contains(., '{move_btn}')]"))
    )
    move_element.click()
    return driver


# Build the agent: the chat model above plus four Selenium-backed tools.
# Each tool wraps one of the functions defined earlier in this cell; the
# description text is what the model reads when deciding which tool to call.
agent = initialize_agent(
    llm=llm,
    verbose=True,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    tools=[
        # Opens the browser and logs in; it creates the global driver the
        # other three tools read, so it has to be called first.
        StructuredTool.from_function(
            func=login_tool,
            name="login tool",
            description="""
            With this tool, you can access a url where login is possible, enter the id and password(pwd), and by pressing a button, you can log in to the page.
            """,
        ),
        # Single click on a link or button.
        StructuredTool.from_function(
            func=first_tool,
            name="first tool",
            description="""
            This tool is used simply for the action of clicking a single button.
            """,
        ),
        # Two clicks: choose an option, then press the navigation button.
        StructuredTool.from_function(
            func=second_tool,
            name="second tool",
            description="""
            This tool is used to select one condition and then move to the next page through a '다음' or '완료' button. 
            A total of two clicks are involved.
            Use this tool in the final step of submission.
            """,
        ),
        # Select box + grade + class option(s) + navigation button.
        StructuredTool.from_function(
            func=third_tool,
            name="third tool",
            description="""
            This tool allows you to first select a target through a placeholder, and then choose various conditions. Therefore, more than three clicks will occur.
            """,
        ),
    ],
    # callback_manager=[handler]
)

In [82]:
import cv2
import time
from PIL import ImageChops, Image


def screenshot(driver, output_path="test.png"):
    """Save a screenshot of the page after resizing the window to its height.

    The page's scroll height is read, the page is scrolled to the bottom,
    and the window is resized to 1024 wide by that height before the
    screenshot is written to ``output_path``.

    Args:
        driver: Selenium webdriver showing the page to capture.
        output_path: File the screenshot is written to.
    """
    height_script = "return document.body.parentNode.scrollHeight"
    scroll_script = "window.scrollTo(0, document.body.parentNode.scrollHeight);"

    page_height = driver.execute_script(height_script)
    driver.execute_script(scroll_script)
    # Pause after scrolling - presumably to let the page settle.
    time.sleep(1)

    driver.set_window_size("1024", page_height)
    # Pause again after the resize before capturing.
    time.sleep(1)
    driver.save_screenshot(output_path)


def end_test():
    """Screenshot the current page and mark where it differs from the reference.

    Steps:
      1. Print the current URL and save a screenshot to ``test.png``.
      2. Compute the per-pixel difference against ``true-new.png`` and save
         it as ``diff.jpg``.
      3. Find the outer contours of the non-zero regions of the difference
         and draw a green box around every contour whose area exceeds 300.
      4. Write the annotated images to ``bbox_diff.jpg`` and
         ``bbox_true.jpg``.

    Reads the module-level ``driver`` created by ``login_tool``.

    Returns:
        A PIL image opened from ``bbox_true.jpg``.
    """
    print(driver.current_url)
    screenshot(driver, output_path="test.png")

    # Context managers release both input file handles once the difference
    # image has been computed; ``convert`` loads the pixel data first.
    with Image.open("test.png") as test_img, Image.open("true-new.png") as true_img:
        diff = ImageChops.difference(
            true_img.convert("RGB"), test_img.convert("RGB")
        )
    diff.save("diff.jpg")

    true_bgr = cv2.imread("true-new.png")
    diff_bgr = cv2.imread("diff.jpg")

    # cv2.imread returns channels in BGR order, so the matching grayscale
    # conversion is BGR2GRAY (RGB2GRAY would swap the red/blue weights).
    gray = cv2.cvtColor(diff_bgr, cv2.COLOR_BGR2GRAY)
    contours, _ = cv2.findContours(gray, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    color = (0, 200, 0)

    for con in contours:
        # Skip small regions; only differences larger than 300 px^2 are boxed.
        if cv2.contourArea(con) > 300:
            x, y, width, height = cv2.boundingRect(con)
            cv2.rectangle(diff_bgr, (x, y), (x + width, y + height), color, 2)
            cv2.rectangle(true_bgr, (x, y), (x + width, y + height), color, 2)

    cv2.imwrite("bbox_diff.jpg", diff_bgr)
    cv2.imwrite("bbox_true.jpg", true_bgr)

    return Image.open("bbox_true.jpg")

In [None]:
# Natural-language task list handed to the agent. The targets in steps 2-9
# appear to have been blanked out - presumably redacted; fill them in before
# running.
prompt = """ 
우리는 아래 단계대로 차례차례 진행해야돼. 
1. id: aaa, pwd: bbb를 입력 후에 로그인 버튼을 클릭해서 로그인해줘. 
2. 버튼을 클릭해줘. 
3. 를 클릭하고 다음 버튼을 클릭해서 이동해줘. 
4. 을 선택하여 다음으로 이동해줘. 
5. 을 선택하기 위해, 을 먼저 눌러줘. 그리고 을 선택하고,  선택하여 다음으로 이동해줘. 
6. 마지막 단계에선 을 선택하고 완료로 이동해줘.
7. 로 들어가줘. 
8. 을 클릭해줘. 
9. 을 클릭해줘.
""" 

# Close the browser even when the agent run or the image comparison fails:
# Chrome is started with the "detach" option, so a skipped quit() would leave
# the window open.
try:
    agent.invoke(prompt)
    end_test()
finally:
    driver.quit()