In [6]:
from autogen import ConversableAgent, AssistantAgent, GroupChatManager, GroupChat, UserProxyAgent

### Function tools

##### Setup env

In [7]:
import os
import sys
from dotenv import load_dotenv

# Load environment variables via python-dotenv before reading PROJECT_PATH.
load_dotenv()
PROJECT_PATH = os.environ.get('PROJECT_PATH')

# Fail fast with a clear message: a missing PROJECT_PATH would otherwise put
# None on sys.path and only surface later as an obscure import/path error.
if not PROJECT_PATH:
    raise EnvironmentError(
        "PROJECT_PATH is not set; define it in the environment or in a .env file."
    )

# Add the project root path to sys.path so that `src.*` modules can be imported
if PROJECT_PATH not in sys.path:
    sys.path.insert(0, PROJECT_PATH)

In [8]:
from src.image_composition import compose_ad_frame
from src.feature_extraction import extract_text_with_positions, has_transparency, get_image_dimensions


#### Test compose ad frame

In [9]:

game_id = "0a22f881b77f00220f2034c21a18b854"
assets_path = os.path.join(PROJECT_PATH, 'data', 'Assets', game_id)

# Resolve every asset path once so each element below stays short.
header_path = f'{assets_path}/header.jpg'
instruction_path = f'{assets_path}/engagement_instruction_1.png'
thumbnail_path = f'{assets_path}/thumbnail.jpg'

# One dict per asset, listed in the order they are handed to compose_ad_frame.
header_element = {
    'image_path': header_path,
    'position': (0, 0),
    'has_background': True,
}
instruction_element = {
    'image_path': instruction_path,
    'position': (40, 100),
    'has_background': False,
}
thumbnail_element = {
    'image_path': thumbnail_path,
    'position': (0, 200),
    'size': get_image_dimensions(thumbnail_path),
    'has_background': True,
}
elements = [header_element, instruction_element, thumbnail_element]

# Compose a 600x500 frame from the three elements and print whatever
# compose_ad_frame returns.
composed_frame = compose_ad_frame(600, 500, elements)
print(composed_frame)


composed_image_frame.jpg


#### Define image composition agent

In [99]:
import os

from autogen import ConversableAgent

# Model entries shared by the agent configurations defined in this notebook.
config_list = [
    {
        "model": "gpt-4o",
        "temperature": 0.7,
    }
]


def _int_pair_schema(description):
    """Build a fresh JSON-schema fragment for an array of exactly two integers."""
    return {
        "type": "array",
        "items": {"type": "integer"},
        "minItems": 2,
        "maxItems": 2,
        "description": description,
    }


# Schema of a single entry of the `elements` argument of compose_ad_frame.
_ad_element_schema = {
    "type": "object",
    "properties": {
        "image_path": {
            "type": "string",
            "description": "Path to the image file.",
        },
        "position": _int_pair_schema("(x, y) coordinates of the top-left corner."),
        "size": _int_pair_schema(
            "(width, height) to resize to (maintaining aspect ratio), optional."
        ),
        "has_background": {
            "type": "boolean",
            "description": "Whether the image has a background (True) or is transparent (False), optional.",
        },
    },
    "required": ["image_path", "position"],
}

# Function description advertised to the model for the compose_ad_frame tool.
_compose_ad_frame_schema = {
    "name": "compose_ad_frame",
    "description": "Composes an advertisement frame using multiple image elements.",
    "parameters": {
        "type": "object",
        "properties": {
            "frame_width": {
                "type": "integer",
                "description": "Width of the desired frame.",
            },
            "frame_height": {
                "type": "integer",
                "description": "Height of the desired frame.",
            },
            "elements": {
                "type": "array",
                "items": _ad_element_schema,
                "description": "List of dictionaries, each containing details about the image to be composed.",
            },
            "output_path": {
                "type": "string",
                "description": "Path to save the composed ad frame, optional.",
            },
        },
        "required": ["frame_width", "frame_height", "elements"],
    },
}

# LLM configuration for the image composition agent: model settings plus the
# single callable function described above.
llm_config_img_composition = {
    "model": "gpt-4o",
    "temperature": 0.7,
    "config_list": config_list,
    "functions": [_compose_ad_frame_schema],
}
# System prompt for the agent that proposes compose_ad_frame tool calls.
image_composition_system_message = """You are a helpful AI assistant.
    You task is to compose an AD Frame for StoryBoard using given assets to create an engaging advertisement.
    You are to use the execute 'compose_ad_frame' to achieve this given the concept and assets details.
    Return 'TERMINATE' when the task is done.
    """

# Assistant agent that suggests the tool calls; it is configured with the
# compose_ad_frame function schema from llm_config_img_composition.
image_composition_agent = AssistantAgent(
    llm_config=llm_config_img_composition,
    system_message=image_composition_system_message,
    name="image_composition_agent",
)


#### Get frame features function result example (placeholder)

In [100]:
# def get_frame_features(frame_path: str, assets_path: str) -> dict:
#   frame_features = {
#     "frame_path": frame_path,
#     "ad_frame_dimensions": (600, 500),
#     'text_bounded_box_format': ("left", "top", "right", "bottom"),
#     'text_bounding_boxes': {'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90), 'TAP': (1, 101, 65, 128), 'THE': (76, 101, 141, 128), 'SCREEN': (153, 100, 294, 129), 'to': (1, 140, 22, 158), 'find': (28, 136, 66, 158), 'the': (74, 136, 107, 158), 'nearest': (115, 140, 193, 158), 'Lexus': (201, 136, 262, 158), 'dealership': (269, 136, 381, 163)},
#     "elements": {
#         "header": {
#           'image_path': f'{assets_path}/header.jpg', 
#           'position': (0, 0),
#           'size': (600, 200),
#           'text_in_image': ['Ovexus', 'L/CERTIFIED', 'BY', 'LEXUS'],
#           'has_background': True
#         },
#         "engagement_instruction": {
#           'image_path': f'{assets_path}/engagement_instruction_1.png', 
#           'position': (0, 100),
#           'size': (380, 63),
#           'text_in_image': ['TAP', 'THE', 'SCREEN', 'to', 'find', 'the', 'nearest', 'Lexus', 'dealership'],
#           'has_background': False
#         },
#         "thubmnail": {
#           'image_path': f'{assets_path}/thumbnail.jpg', 
#           'position': (0, 200), 
#           'size': (600, 300),
#           'text_in_image': {},
#           'has_background': True
#         }
#     }
#   }
  
#   return {
#     "frame_features": frame_features
#   }

In [101]:
### Placeholder frame-feature lookup (values below are fixed, not computed)
def get_frame_features(frame_path: str, assets_path: str) -> dict:
    """Return a fixed feature description of a composed ad frame.

    Only ``frame_path`` and ``assets_path`` are interpolated into the result;
    dimensions, bounding boxes and per-element details are literal values.

    Parameters:
    - frame_path (str): Path of the composed frame, echoed back in the result.
    - assets_path (str): Folder prefix used to build each element's image path.

    Returns:
    - dict: ``{"frame_features": {...}}`` holding the frame path, frame
      dimensions, text bounding boxes and the three element descriptions.
    """
    # NOTE(review): the "thubmnail" key is misspelled and its 'text_in_image'
    # is a dict while the other elements use lists; both are kept as-is here
    # because changing them alters the returned payload.

    def describe_asset(file_name, position, size, words, has_background):
        # Assemble one element entry; key order mirrors the returned payload.
        return {
            'image_path': f'{assets_path}/{file_name}',
            'position': position,
            'size': size,
            'text_in_image': words,
            'has_background': has_background,
        }

    word_boxes = {
        'Ovexus': (50, 27, 245, 60),
        'L/CERTIFIED': (50, 77, 161, 90),
        'BY': (174, 77, 198, 90),
        'LEXUS': (211, 77, 283, 90),
        'TAP': (1, 101, 65, 128),
        'THE': (76, 101, 141, 128),
        'SCREEN': (153, 100, 294, 129),
        'to': (1, 140, 22, 158),
        'find': (28, 136, 66, 158),
        'the': (74, 136, 107, 158),
        'nearest': (115, 140, 193, 158),
        'Lexus': (201, 136, 262, 158),
        'dealership': (269, 136, 381, 163),
    }

    asset_entries = {
        "header": describe_asset(
            'header.jpg', (0, 0), (600, 200),
            ['Ovexus', 'L/CERTIFIED', 'BY', 'LEXUS'], True,
        ),
        "engagement_instruction": describe_asset(
            'engagement_instruction_1.png', (0, 100), (380, 63),
            ['TAP', 'THE', 'SCREEN', 'to', 'find', 'the', 'nearest', 'Lexus', 'dealership'], False,
        ),
        "thubmnail": describe_asset(
            'thumbnail.jpg', (0, 200), (600, 300), {}, True,
        ),
    }

    return {
        "frame_features": {
            "frame_path": frame_path,
            "ad_frame_dimensions": (600, 500),
            'text_bounded_box_format': ("left", "top", "right", "bottom"),
            'text_bounding_boxes': word_boxes,
            "elements": asset_entries,
        }
    }

## Define the critic agent using CV tools

In [102]:
# Function description advertised to the critic for the get_frame_features tool.
get_frame_features_schema = {
    "name": "get_frame_features",
    "description": "Gets detailed features of the frame using computer vision.",
    "parameters": {
        "type": "object",
        "properties": {
            "frame_path": {
                "type": "string",
                "description": "Path to the image file.",
            },
            "assets_path": {
                "type": "string",
                "description": "Path to the assets folder containing the image elements to compose.",
            },
        },
        "required": ["frame_path", "assets_path"],
    },
}

# LLM configuration for the CV-based critic; it reuses the shared config_list
# and exposes the single function described above.
llm_config_critic = {
    "model": "gpt-4o",
    "temperature": 0.5,
    "config_list": config_list,
    "functions": [get_frame_features_schema],
}

# System prompt for the critic that relies on the get_frame_features tool.
cv_critic_system_message = """
    You are a critic AI agent. 
    Your task is to evaluate the quality of the composed ad frames. 
    You will execute function to get frame features that uses Computer vision tools to get the features of the frame.
    If the frame is not good, suggest the necessary changes in positioning the elements to be made.
    If the frame is good, just say 'All good' and return 'TERMINATE' when the evaluation is complete.
    
    These are the design principles to consider:
    - Check for balance and organization within the frame.
    - Ensure no elements overlap or create unnecessary clutter.
    - Consider using white space effectively to guide the viewer's eye.
    """

# NOTE(review): `critic_agent` is assigned again further down in this notebook
# (GPT-4o vision variant); on a top-to-bottom run that later value replaces this one.
critic_agent = AssistantAgent(
    llm_config=llm_config_critic,
    system_message=cv_critic_system_message,
    name="image_critic_agent",
)


### Define the critic agent using GPT-4o Vision model

In [127]:
import base64
import os
from dotenv import load_dotenv
import requests

# Load environment variables via python-dotenv before the key is read.
load_dotenv()

# Key sent as the bearer token by describe_image; None when the variable is unset.
openai_api_key = os.environ.get("OPENAI_API_KEY")

def encode_image(image_path: str) -> str:
    """
    Read a file in binary mode and return its contents as base64 text.

    Parameters:
    - image_path (str): Location of the image file on disk.

    Returns:
    - str: The file's bytes, base64-encoded and decoded to a str.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def describe_image(image_path: str, timeout: float = 60.0) -> str:
    """
    Describes an image by sending it to the GPT-4o API and getting the description.

    The image is base64-encoded, embedded as a data URL next to a fixed critic
    prompt (which includes hard-coded asset features), and posted to the
    OpenAI chat completions endpoint.

    Parameters:
    - image_path (str): Path to the image file.
    - timeout (float): Seconds to wait for the HTTP request before giving up
      (default 60). Without a timeout the request could block indefinitely.

    Returns:
    - str: Description of the image, or an empty string when the response
      carries no message content (for example an API error payload).

    Raises:
    - requests.exceptions.RequestException: on connection failure or timeout.
    """
    import mimetypes

    base64_image = encode_image(image_path)

    # Label the data URL with the file's real media type; fall back to JPEG
    # (the previously hard-coded value) when it cannot be guessed.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

    assets_features = """
    header dims: 600x200. header_text: Ovexus: 50,27,245,60; L/CERTIFIED: 50,77,161,90; BY: 174,77,198,90; LEXUS: 211,77,283,90. engagement_instruction dims: 380x63. engagement_instruction_text: TAP: 0,1,64,28; THE: 75,1,140,28; SCREEN: 152,0,293,29; to: 0,40,21,58; find: 27,36,65,58; the: 73,36,106,58; nearest: 114,40,192,58; Lexus: 200,36,261,58; dealership: 268,36,380,63. thumbnail dims: 600x300
    """

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                      "type": "text", 
                      "text": f"""
                        You are a critic AI agent.
                        Your task is to evaluate the quality of the composed ad frames.
                        You will analyze the image check all the features in the image and provide a suggested position for alignment of the elements.
                        When describing the image and providing suggest positioning, make the description concise and clear.
                        You can avoid use of stop words.
                        Use the title 'image description' to start the description.
                        These are the design principles to consider:
                        - Check for balance and organization within the frame.
                        - Ensure no elements overlap or create unnecessary clutter.
                        - Consider using white space effectively to guide the viewer's eye.
                        These are the features for each assets in the image before they were composed:
                        {assets_features}
                        """
                    },
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}
                ]
            }
        ],
        "max_tokens": 400,
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    response_data = response.json()
    # Kept from the original: echo the raw response so it shows in the cell output.
    print(response_data)

    # Tolerate a missing/empty "choices" list and a null message or content
    # instead of raising IndexError or returning None from a `-> str` function.
    choices = response_data.get("choices") or [{}]
    message = choices[0].get("message") or {}
    return message.get("content") or ""

# description = describe_image("/home/hillary_kipkemoi/Automated-Storyboard-Synthesis-Digital-Advertising/data/composed_image_frame.jpg")


In [152]:
# Function description advertised to the critic for the describe_image tool.
_describe_image_schema = {
    "name": "describe_image",
    "description": "Gets detailed features of the frame using GPT-4o vision model.",
    "parameters": {
        "type": "object",
        "properties": {
            "image_path": {
                "type": "string",
                "description": "Path to the image file.",
            },
            # A second "summary" string property (summary of the asset
            # features) is intentionally left disabled:
            # "summary": {
            #     "type": "string",
            #     "description": "Summary of the features of the assets composing the ad frame."
            # }
        },
        "required": ["image_path"],
    },
}

# LLM configuration for the vision-based critic: model settings plus the
# single describe_image function.
llm_config_critic_4 = {
    "model": "gpt-4o",
    "temperature": 0.5,
    "functions": [_describe_image_schema],
}

# System prompt for the critic that relies on the describe_image tool.
vision_critic_system_message = """
    You are a critic AI agent. 
    Your task is to evaluate the quality of the composed ad frames.
    Given the compose frame path, you will always first execute the function describing the image with suggestions that uses GPT-4o vision model to get the features of the frame.
    Once the results are obtained, you will evaluate analyze the response suggest necessary changes in positioning the elements to be made.
    If the frame is not good, suggest the necessary changes in positioning the elements to be made.
    If the frame is good, just say 'All good' and return 'TERMINATE' when the evaluation is complete.
    Do not say all goood before you have run the function to describe the image.
    Only mention the suggest positioning of elements like this thumbnail: 200x300 in the response.
    Be concise and Do not include anything else other than the positioning of elements.
    """

# Critic agent configured with the describe_image function schema.
critic_agent = AssistantAgent(
    llm_config=llm_config_critic_4,
    system_message=vision_critic_system_message,
    name="image_critic_agent",
)

# Example usage:
# image_path = "/home/hillary_kipkemoi/Automated-Storyboard-Synthesis-Digital-Advertising/data/composed_image_frame.jpg"
# summary = "Summary of the features of the assets composing the ad frame."
# suggestions = critic_agent.evaluate_image(image_path, summary)
# print(suggestions)


## User proxy agent setup

#### Set up configs for groupchat manager and user agent

In [153]:
def _single_model_config(model_name):
    """Build an LLM config whose config_list holds one entry for `model_name`."""
    return {"config_list": [{"model": model_name}]}


# Configuration passed to the group chat manager.
llm_config = _single_model_config("gpt-4o")

# Configuration passed to the user proxy agent.
llm_config_user = _single_model_config("gpt-3.5-turbo")

In [158]:
system_message_user = """
"You are a the Human admin in the groupchat. 
You can interact with the image composition and the critic agents.
Execute their recommended functions and return the output as it is to the agents (Do not interpret the results).
"""


def _is_termination_message(msg):
    """Return True when the message has non-None content containing 'TERMINATE'."""
    content = msg.get("content")
    if content is None:
        return False
    return "TERMINATE" in msg["content"]


# Callables the proxy may run on behalf of the agents, keyed by function name.
# get_frame_features is deliberately not registered here.
user_function_map = {
    "compose_ad_frame": compose_ad_frame,
    "describe_image": describe_image,
}

# Proxy that never asks for human input, has code execution disabled and
# executes the mapped functions.
user_proxy = UserProxyAgent(
    name="User",
    system_message=system_message_user,
    llm_config=llm_config_user,
    human_input_mode="NEVER",
    code_execution_config=False,
    is_termination_msg=_is_termination_message,
    function_map=user_function_map,
)

## Setup groupchat manager

In [162]:
from autogen.agentchat.groupchat import GroupChatManager

class CustomGroupChatManager(GroupChatManager):
    """Group chat manager that hands the turn to the critic after a "Suggested" reply.

    NOTE(review): the visible cells build the chat with the stock
    GroupChatManager, so this class appears unused. Also confirm that the
    installed autogen GroupChatManager actually defines `_select_next_speaker`;
    otherwise the `super()` fallback below raises AttributeError.
    """

    def _select_next_speaker(self, last_speaker, last_message, groupchat):
        """Pick the next speaker for the group chat.

        Returns the agent named "image_critic_agent" when the last speaker is
        "User", the last message content contains "Suggested" and that agent is
        part of the group chat; otherwise defers to the parent implementation.
        """
        # The content may be absent or None; treat both as empty text so the
        # membership test below cannot raise TypeError.
        content = last_message.get("content") or ""

        # Prioritize image_critic_agent when frame_features are ready
        critic_should_speak = (
            last_speaker.name == "User"
            and "Suggested" in content
            and "image_critic_agent" in groupchat.agent_names
        )
        if critic_should_speak:
            return groupchat.agent_by_name("image_critic_agent")

        # For all other cases, use default behavior
        return super()._select_next_speaker(last_speaker, last_message, groupchat)


In [180]:
from typing import Union, Optional

def custom_speaker_selection_func(last_speaker: AssistantAgent, groupchat: GroupChat):
  """Choose the next speaker for the group chat.

  Parameters:
  - last_speaker: Agent that spoke last (not used by the selection logic).
  - groupchat: Group chat whose `messages` history and agents are consulted.

  Returns:
  - The agent named "image_critic_agent" when the latest message content
    contains "Description"; otherwise the string "auto" so the group chat
    falls back to its automatic speaker selection.
  """
  # No history yet: nothing to inspect, defer to automatic selection.
  if not groupchat.messages:
    return "auto"

  # The content can be missing or None; treat both as empty text so the
  # membership test cannot raise KeyError/TypeError.
  content = groupchat.messages[-1].get("content") or ""

  # NOTE(review): the vision prompt asks for the title 'image description'
  # (lower case) while this check is case-sensitive on "Description" -- confirm
  # the marker matches what the model actually returns.
  if "Description" in content:
    return groupchat.agent_by_name("image_critic_agent")

  return "auto"


In [181]:


# Agents taking part in the conversation.
chat_participants = [user_proxy, image_composition_agent, critic_agent]

# Group chat that starts with an empty history and delegates speaker choice
# to custom_speaker_selection_func.
groupchat = GroupChat(
    agents=chat_participants,
    messages=[],
    func_call_filter=True,
    speaker_selection_method=custom_speaker_selection_func,
)

# The stock GroupChatManager drives the chat here (CustomGroupChatManager is not used).
manager = GroupChatManager(llm_config=llm_config, groupchat=groupchat)


## Initiate the conversation

In [182]:
# Derive the assets folder from PROJECT_PATH (as the compose test cell does)
# instead of hard-coding one user's absolute home directory. The trailing ''
# keeps the trailing path separator that the prompt text previously had.
assets_path = os.path.join(PROJECT_PATH, 'data', 'Assets', '0a22f881b77f00220f2034c21a18b854', '')

# Instructions appended to the task: where the assets live, where to write the
# composed frame, and how to signal completion.
system_instruction = f"""
The path to the assets folder is {assets_path}

Use the assets path and the image names to get the image path

The output path is the data folder can be derived from assets path with the name composed_image_frame.jpg

Return 'TERMINATE' when the task is done.
"""

# NOTE(review): `get_all_assets_features` is not defined in any visible cell;
# the recorded output shows a dict of per-asset features was interpolated here.
# Make sure it is defined before this cell on a fresh kernel.
user_message = f"""
Compose an AD Frame with the dimensions 600x500 for StoryBoard
This is the concept. Place the header image at the top-left corner.
Place the engagement instruction just below the logo in the header image but on the header image.
Place the thumbnail below the header image.

These are the features of the different assets separately. Use them to compose the AD Frame.
{get_all_assets_features}
"""

final_message = user_message + system_instruction

# Start the group chat from the user proxy, capped at 30 turns.
chat_result = user_proxy.initiate_chat(manager, message=final_message, max_turns=30)

[33mUser[0m (to chat_manager):


Compose an AD Frame with the dimensions 600x500 for StoryBoard
This is the concept. Place the header image at the top-left corner.
Place the engagement instruction just below the logo in the header image but on the header image.
Place the thumbnail below the header image.

These are the features of the different assets separately. Use them to compose the AD Frame.
{'header': {'image_role': 'header', 'image_path': '/home/hillary_kipkemoi/Automated-Storyboard-Synthesis-Digital-Advertising/data/Assets/0a22f881b77f00220f2034c21a18b854/header.jpg', 'dimensions': (600, 200), 'text_in_image_with_location': {'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90)}, 'has_transparency': False}, 'engagement_instruction': {'image_role': 'engagement_instruction', 'image_path': '/home/hillary_kipkemoi/Automated-Storyboard-Synthesis-Digital-Advertising/data/Assets/0a22f881b77f00220f2034c21a18b854/engagemen