In [83]:
import os

# LLM settings handed to every swarm agent defined later in this notebook.
#
# The API key is looked up under OPENAI_KEY first (the name this cell has always
# used) and then under OPENAI_API_KEY, which is the variable the direct OpenAI
# client in this notebook reads.  With the fallback a single exported key is
# enough for both code paths.
llm_config = {
    "config_list": [{
        "model": "gpt-4o-mini",
        "api_type": "openai",
        "api_key": os.environ.get("OPENAI_KEY") or os.environ.get("OPENAI_API_KEY"),
    }]
}

In [113]:
# Shared state passed between the swarm agents; every slot starts out empty.
context_variables = dict(
    target_file_location="",   # screenshot of the page to replicate
    html_file_location="",     # file holding the latest generated HTML
    html_code="",
    num_tries=0,               # number of HTML generations so far
    attempt_code=None,         # most recent model reply with the HTML
    feedback=None,             # most recent reviewer feedback
)

In [137]:
import os
from openai import OpenAI
import base64

# OpenAI client used for the direct (non-agent) vision requests in this notebook.
# The key comes from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file contents, decoded to a ``str``.
    """
    with open(image_path, mode="rb") as image_handle:
        raw_bytes = image_handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

def diff_image(target_file_name: str, try_file_name: str) -> str:
    """Ask the vision model how the attempt screenshot differs from the target.

    Both files are base64-encoded with ``encode_image`` and sent as PNG data
    URLs in a single chat-completion request to ``gpt-4o-mini``.

    Args:
        target_file_name: Path of the screenshot of the website to replicate.
        try_file_name: Path of the screenshot of the current attempt.

    Returns:
        The ``content`` of the first choice in the model's reply.
    """
    # The prompt text is reproduced verbatim (spelling and whitespace included):
    # every line after the first is indented by 16 spaces and the text ends with
    # a newline followed by that same indent.
    line_break = "\n" + " " * 16
    system_prompt = line_break.join([
        "You are a helpful AI teacher that will help a student replicate a target website.",
        "You will be given images of two websites. ",
        "The first image is the target website, the second image is the student's attempt at replicating the target website.",
        "You will understand the top 3 key differences in the two images. Using this, only suggest a list of improvemnts to the student.",
        "Do not explain your thought process to the student. Be specific about what improvements to make.",
        "Refer to the student in the first person.",
        "Only focus on the content of the web page and not on differences in the browser or operating system.",
        "",
    ])

    # Order matters: the prompts call the target the "first" image.
    image_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{encode_image(path)}"},
        }
        for path in (target_file_name, try_file_name)
    ]
    intro_part = {
        "type": "text",
        "text": "The target website is shown in the first image. The student's attempt is shown in the second image.",
    }

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": [intro_part, *image_parts]},
        ],
    )
    first_choice = response.choices[0]
    return first_choice.message.content


In [None]:
def generate_html_code(image_path: str, attempt_code: str = None, feedback: str = None) -> str:
    """Ask ``gpt-4o`` to write HTML that reproduces the website in a screenshot.

    The screenshot is base64-encoded with ``encode_image`` and sent as a PNG
    data URL.  When a previous attempt is supplied, a second user message with
    that attempt (and the feedback on it, if any) is appended so the model can
    refine its earlier answer.

    Args:
        image_path: Path of the screenshot of the website to replicate.
        attempt_code: Previously generated code; ``None`` or ``""`` means this
            is the first attempt.
        feedback: Feedback on ``attempt_code``.  May be ``None`` or ``""``, in
            which case a generic improvement request is sent instead, so that
            every text part of the request carries a string.

    Returns:
        The ``content`` of the first choice in the model's reply.
    """
    encoded_image = encode_image(image_path)

    # Prompt text is kept verbatim: continuation lines carry a 16-space indent
    # and the text ends with a newline followed by that indent.
    line_break = "\n" + " " * 16
    system_prompt = line_break.join([
        "You are a helpful HTML coder. You will be given an image of a website in a web browser.",
        "You will use the image to generate the HTML code for the website.",
        "Focus on replicating the design of the website, and not the content.",
        "Only return the HTML code, in a block quote. ",
        "Any styling should be included as inline CSS within the HTML file.",
        "",
    ])

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "You are given the image of a website, generate the HTML code for the website."},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
                },
            ],
        },
    ]

    # Has there been a past attempt?
    if attempt_code is not None and attempt_code != "":
        print("There has been a past attempt at writing code")
        follow_up = [
            {"type": "text", "text": "Here is the code that I have generated so far."},
            {"type": "text", "text": attempt_code},
        ]
        if feedback:
            follow_up.append({"type": "text", "text": "Here is the feedback that I received on this code, can you improve it by accounting for the feedback?"})
            follow_up.append({"type": "text", "text": feedback})
        else:
            # Previously a missing feedback value was forwarded as
            # {"type": "text", "text": None}; send a plain request instead.
            follow_up.append({"type": "text", "text": "Can you improve it so that it matches the image more closely?"})
        messages.append({"role": "user", "content": follow_up})

    chat_completion = client.chat.completions.create(messages=messages,
        model="gpt-4o",
    )

    return chat_completion.choices[0].message.content

In [152]:
import webbrowser, pyautogui
from autogen import SwarmAgent, SwarmResult, UserProxyAgent, initiate_swarm_chat, AFTER_WORK, AfterWorkOption, ON_CONDITION

# Fresh, empty shared state for a new swarm run.  The tool functions below
# read and update these slots through the `context_variables` argument.
context_variables = dict(
    target_file_location="",
    html_file_location="",
    html_code="",
    num_tries=0,
    attempt_code=None,
    feedback=None,
)

def take_screenshot(url: str, context_variables: dict) -> str:
    """Open a URL in Firefox, capture the screen and save it as target.png.

    Args:
        url: Address of the web page to open.
        context_variables: Shared swarm state; its ``target_file_location``
            entry is set to the saved screenshot's file name.

    Returns:
        A ``SwarmResult`` (despite the ``str`` annotation) carrying a status
        message, the updated context and ``agent=coder_agent``.
    """
    target_file_location = "target.png"

    browser = webbrowser.get('firefox')
    browser.open(url, new=0)
    # Pause for two seconds before grabbing the screen.
    pyautogui.sleep(2)
    pyautogui.screenshot().save(target_file_location)

    context_variables["target_file_location"] = target_file_location
    return SwarmResult(
        values="Done taking a screenshot. Use this to create a website.",
        context_variables=context_variables,
        agent=coder_agent,
    )

def _extract_html(model_reply: str) -> str:
    """Return the contents of the first Markdown code fence in ``model_reply``.

    Falls back to the whole reply when it contains no fence, and tolerates a
    fence that is never closed (for example a truncated reply).
    """
    import re

    fenced = re.search(
        r"```[A-Za-z0-9_-]*[ \t]*\r?\n(.*?)(?:```|\Z)",
        model_reply,
        flags=re.DOTALL,
    )
    if fenced is None:
        return model_reply
    return fenced.group(1)


def write_html_code(context_variables: dict) -> str:
    """Generate HTML for the target screenshot and write it to website.html.

    Calls ``generate_html_code`` with the target screenshot plus the previous
    attempt and feedback stored in the context, then saves the HTML found in
    the reply.

    Args:
        context_variables: Shared swarm state.  Reads ``target_file_location``,
            ``attempt_code`` and ``feedback``; writes ``attempt_code`` (the raw
            model reply), ``html_file_location`` and increments ``num_tries``.

    Returns:
        A ``SwarmResult`` (despite the ``str`` annotation) carrying a status
        message, the updated context and ``agent=tester_agent``.
    """
    print("Writing HTML Code")
    html_code = generate_html_code(context_variables["target_file_location"],
                                   context_variables["attempt_code"],
                                   context_variables["feedback"])

    # The unmodified reply is what gets fed back into the next generation.
    context_variables["attempt_code"] = html_code

    # Pull the HTML out of the Markdown fence.  Slicing off a fixed number of
    # characters only worked when the reply was exactly "```html\n...```" and
    # mangled the file whenever the model added any text around the fence.
    html_code = _extract_html(html_code)

    # Explicit encoding so non-ASCII page text does not depend on the platform default.
    with open("website.html", "w", encoding="utf-8") as f:
        f.write(html_code)

    context_variables["html_file_location"] = "website.html"
    context_variables["num_tries"] = context_variables.get("num_tries", 0) + 1

    return SwarmResult(values="Wrote HTML code. This should be tested by tester.", context_variables=context_variables, agent=tester_agent)
    
def test_html_code(context_variables: dict) -> str:
    print("Testing HTML Code")
    html_file_name = context_variables.get("html_file_location")
    if not html_file_name:
        return SwarmResult(values="No HTML file location found.", context_variables=context_variables)
    
    # open the file in firefox
    # i could reuse a function here.
    webbrowser.get('firefox').open(f'file:///Users/kartik/Code/ai-agents/website-clone-assistant/{html_file_name}',
                                   new=0)
    
    pyautogui.sleep(2)
    screenshot = pyautogui.screenshot()
    try_file_location = "try.png"
    screenshot.save(try_file_location)
    
    context_variables["try_file_location"] = try_file_location
    
    feedback = diff_image(context_variables["target_file_location"], try_file_location)
    
    context_variables["feedback"] = feedback
    
    return SwarmResult(values="Tested HTML code, website should be re-written by coder.", context_variables=context_variables, agent=coder_agent)

# --- Agents -----------------------------------------------------------------
# TODO: decide whether a dedicated screenshot agent is needed at all.

# Captures the target page through the take_screenshot tool.
screenshot_agent = SwarmAgent(
    name="screenshotter",
    llm_config=llm_config,
    functions=[take_screenshot],
    system_message="""You are a helpful assistant.
    You can use the take_screenshot function to take a screenshot of a webpage.
    Pass in the name of the webpage as an argument to the function.
    After taking a screenshot, pass it to the coder agent.
    """,
)

# Produces the HTML through the write_html_code tool.
coder_agent = SwarmAgent(
    name="coder",
    llm_config=llm_config,
    functions=[write_html_code],
    system_message="""You are a coding assistant.
    You have the ability to write HTML code to a file.
    Always use the function write_html_code to write HTML code to a file.""",
)

# Checks the generated page through the test_html_code tool.
tester_agent = SwarmAgent(
    name="tester",
    llm_config=llm_config,
    functions=[test_html_code],
    system_message="""You are the tester.
    You have access to a function test_html_code to test HTML code.""",
)

# Stand-in for the human user: never prompts for input, never executes code.
user_agent = UserProxyAgent(
    name="user",
    system_message="You are the user. You can interact with the agents to get the task done.",
    human_input_mode="NEVER",
    code_execution_config=False,
)

# Hand-off rules between agents: screenshotter -> coder -> tester -> coder.
screenshot_agent.register_hand_off(
    hand_to=[ON_CONDITION(coder_agent, "After taking a screenshot, transfer to coder_agent.")]
)

coder_agent.register_hand_off(
    hand_to=[ON_CONDITION(tester_agent, "After writing HTML code, transfer to tester_agent.")]
)

tester_agent.register_hand_off(
    hand_to=[ON_CONDITION(coder_agent, "After testing HTML code, transfer back to coder_agent.")]
)

In [158]:
# Start the swarm with the screenshot agent and a single user request.
# `context_variables` is rebound to the value returned by the call, and the
# run is configured with max_rounds=25 and a TERMINATE after-work option.
chat_result, context_variables, last_speaker = initiate_swarm_chat(
    initial_agent=screenshot_agent,
    agents=[screenshot_agent, coder_agent, tester_agent],
    user_agent=user_agent,
    messages=[
        "Take a screenshot of https://ag2ai.github.io/ag2/docs/tutorial/introduction/ and save it as target.png",
    ],
    context_variables=context_variables,
    after_work=AFTER_WORK(AfterWorkOption.TERMINATE),
    max_rounds=25,
)

print(context_variables)

[33muser[0m (to chat_manager):

Take a screenshot of https://ag2ai.github.io/ag2/docs/tutorial/introduction/ and save it as target.png

--------------------------------------------------------------------------------
[32m
Next speaker: screenshotter
[0m
[33mscreenshotter[0m (to chat_manager):

[32m***** Suggested tool call (call_cC6z75r1tFapqEXXewxY7AML): take_screenshot *****[0m
Arguments: 
{"url":"https://ag2ai.github.io/ag2/docs/tutorial/introduction/"}
[32m********************************************************************************[0m

--------------------------------------------------------------------------------
[32m
Next speaker: Tool_Execution
[0m
[35m
>>>>>>>> EXECUTING FUNCTION take_screenshot...[0m
[33mTool_Execution[0m (to chat_manager):

[32m***** Response from calling tool (call_cC6z75r1tFapqEXXewxY7AML) *****[0m
Done taking a screenshot. Use this to create a website.
[32m**********************************************************************[0m

-

KeyboardInterrupt: 

In [27]:
# Debug cell: dump a raw ChatCompletion object for inspection.
# NOTE(review): no visible cell defines `chat_completion` at notebook scope (it
# only appears as a local inside diff_image / generate_html_code), so this
# presumably relies on leftover kernel state and would fail on a fresh
# Restart & Run All -- confirm, then remove or define the variable first.
print(chat_completion)

ChatCompletion(id='chatcmpl-AdPYto7FJ1Zpv0rfDK8RvFhWr3HIR', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The image shows a screenshot of the Google homepage displayed in dark mode. At the center, the colorful "Google" logo is surrounded by various small, colorful icons resembling stars and planets. Below the logo is a prominent search bar, with buttons for "Google Search" and "I\'m Feeling Lucky" beneath it. There\'s also a notification about the new "Gemini 2.0" AI model and a message regarding climate action at the bottom. The browser interface shows tabs and options typical for a web browser, like bookmarks and settings.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1733956415, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_818c284075', usage=CompletionUsage(completion_tokens=107, prompt_tokens=36845, total_tokens=36952, complet