In [1]:
from autogen import ConversableAgent, AssistantAgent, GroupChatManager, GroupChat, UserProxyAgent

### Function tools

##### Setup env

In [2]:
import os
import sys
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
PROJECT_PATH = os.environ.get('PROJECT_PATH')

# Fail fast with an actionable message: later cells build asset paths from
# PROJECT_PATH, and a missing value would otherwise surface as an opaque
# TypeError from os.path.join (after None had been pushed onto sys.path).
if not PROJECT_PATH:
    raise EnvironmentError(
        "PROJECT_PATH is not set. Define it in the environment or in a .env file "
        "so the project's 'src' package and data folders can be located."
    )

# Add the project root path to sys.path so the `src.*` modules can be imported
if PROJECT_PATH not in sys.path:
    sys.path.insert(0, PROJECT_PATH)

In [9]:
from src.image_composition import compose_ad_frame
from src.feature_extraction import extract_text_with_positions, has_transparency, get_image_dimensions


#### Test compose ad frame

In [10]:

# Assets for one game live under <PROJECT_PATH>/data/Assets/<game_id>.
game_id = "0a22f881b77f00220f2034c21a18b854"
assets_path = os.path.join(PROJECT_PATH, 'data', 'Assets', game_id)

# Resolve each asset file once so the element specs below stay short.
header_path = f'{assets_path}/header.jpg'
instruction_path = f'{assets_path}/engagement_instruction_1.png'
thumbnail_path = f'{assets_path}/thumbnail.jpg'

# One spec per layer; only the thumbnail passes an explicit size,
# which is simply the image's own dimensions.
elements = [
    dict(image_path=header_path, position=(0, 0), has_background=True),
    dict(image_path=instruction_path, position=(40, 100), has_background=False),
    dict(
        image_path=thumbnail_path,
        position=(0, 200),
        size=get_image_dimensions(thumbnail_path),
        has_background=True,
    ),
]

# Compose a 600x500 frame and print whatever compose_ad_frame returns.
# (To preview instead: composed_frame.show(), or composed_frame.save('composed_frame.jpg').)
composed_frame = compose_ad_frame(600, 500, elements)
print(composed_frame)


composed_image_frame.jpg


#### Define image composition agent

In [6]:
import os

from autogen import ConversableAgent

# Model endpoint settings referenced by llm_config_img_composition below.
config_list = [
    {"model": "gpt-4o", "temperature": 0.7},
]


def _int_pair_schema(description):
    """Return the JSON-schema fragment for an array of exactly two integers."""
    return {
        "type": "array",
        "items": {"type": "integer"},
        "minItems": 2,
        "maxItems": 2,
        "description": description,
    }


# Schema of a single entry in the `elements` argument of compose_ad_frame.
_element_schema = {
    "type": "object",
    "properties": {
        "image_path": {
            "type": "string",
            "description": "Path to the image file.",
        },
        "position": _int_pair_schema("(x, y) coordinates of the top-left corner."),
        "size": _int_pair_schema("(width, height) to resize to (maintaining aspect ratio), optional."),
        "has_background": {
            "type": "boolean",
            "description": "Whether the image has a background (True) or is transparent (False), optional.",
        },
    },
    "required": ["image_path", "position"],
}

# Function description advertised to the model so it can request compose_ad_frame calls.
_compose_ad_frame_schema = {
    "name": "compose_ad_frame",
    "description": "Composes an advertisement frame using multiple image elements.",
    "parameters": {
        "type": "object",
        "properties": {
            "frame_width": {
                "type": "integer",
                "description": "Width of the desired frame.",
            },
            "frame_height": {
                "type": "integer",
                "description": "Height of the desired frame.",
            },
            "elements": {
                "type": "array",
                "items": _element_schema,
                "description": "List of dictionaries, each containing details about the image to be composed.",
            },
            "output_path": {
                "type": "string",
                "description": "Path to save the composed ad frame, optional.",
            },
        },
        "required": ["frame_width", "frame_height", "elements"],
    },
}

llm_config_img_composition = {
    "model": "gpt-4o",
    "temperature": 0.7,
    "config_list": config_list,
    "functions": [_compose_ad_frame_schema],
}

# The assistant agent that proposes compose_ad_frame calls.
image_composition_agent = AssistantAgent(
    name="image_composition_agent",
    system_message="""You are a helpful AI assistant.
    You task is to compose an AD Frame for StoryBoard using given assets to create an engaging advertisement.
    You are to use the 'compose_ad_frame' function to achieve this given the concept and assets details.
    Return 'TERMINATE' when the task is done.
    """,
    llm_config=llm_config_img_composition,
)


#### Get frame features function result example (placeholder)

In [7]:
# def get_frame_features(frame_path: str, assets_path: str) -> dict:
#   frame_features = {
#     "frame_path": frame_path,
#     "ad_frame_dimensions": (600, 500),
#     'text_bounded_box_format': ("left", "top", "right", "bottom"),
#     'text_bounding_boxes': {'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90), 'TAP': (1, 101, 65, 128), 'THE': (76, 101, 141, 128), 'SCREEN': (153, 100, 294, 129), 'to': (1, 140, 22, 158), 'find': (28, 136, 66, 158), 'the': (74, 136, 107, 158), 'nearest': (115, 140, 193, 158), 'Lexus': (201, 136, 262, 158), 'dealership': (269, 136, 381, 163)},
#     "elements": {
#         "header": {
#           'image_path': f'{assets_path}/header.jpg', 
#           'position': (0, 0),
#           'size': (600, 200),
#           'text_in_image': ['Ovexus', 'L/CERTIFIED', 'BY', 'LEXUS'],
#           'has_background': True
#         },
#         "engagement_instruction": {
#           'image_path': f'{assets_path}/engagement_instruction_1.png', 
#           'position': (0, 100),
#           'size': (380, 63),
#           'text_in_image': ['TAP', 'THE', 'SCREEN', 'to', 'find', 'the', 'nearest', 'Lexus', 'dealership'],
#           'has_background': False
#         },
#         "thubmnail": {
#           'image_path': f'{assets_path}/thumbnail.jpg', 
#           'position': (0, 200), 
#           'size': (600, 300),
#           'text_in_image': {},
#           'has_background': True
#         }
#     }
#   }
  
#   return {
#     "frame_features": frame_features
#   }

In [None]:
### Placeholder: returns hard-coded frame features (TODO: extract them dynamically from the image)
def get_frame_features(frame_path: str, assets_path: str) -> dict:
    """Return a description of a composed ad frame for the critic agent.

    The values are hard-coded for one 600x500 example frame; only
    ``frame_path`` and ``assets_path`` are taken from the arguments.

    Args:
        frame_path: Path of the composed frame; echoed back in the result.
        assets_path: Folder holding the source assets; used as the prefix of
            every element's ``image_path``.

    Returns:
        ``{"frame_features": {...}}`` where the inner dict holds the frame
        path, the frame dimensions, the text bounding boxes and a
        per-element description keyed by element role.
    """
    # Bounding boxes follow the order given by 'text_bounded_box_format'.
    text_bounding_boxes = {
        'Ovexus': (50, 27, 245, 60),
        'L/CERTIFIED': (50, 77, 161, 90),
        'BY': (174, 77, 198, 90),
        'LEXUS': (211, 77, 283, 90),
        'TAP': (1, 101, 65, 128),
        'THE': (76, 101, 141, 128),
        'SCREEN': (153, 100, 294, 129),
        'to': (1, 140, 22, 158),
        'find': (28, 136, 66, 158),
        'the': (74, 136, 107, 158),
        'nearest': (115, 140, 193, 158),
        'Lexus': (201, 136, 262, 158),
        'dealership': (269, 136, 381, 163),
    }

    frame_features = {
        "frame_path": frame_path,
        "ad_frame_dimensions": (600, 500),
        'text_bounded_box_format': ("left", "top", "right", "bottom"),
        'text_bounding_boxes': text_bounding_boxes,
        "elements": {
            "header": {
                'image_path': f'{assets_path}/header.jpg',
                'position': (0, 0),
                'size': (600, 200),
                'text_in_image': ['Ovexus', 'L/CERTIFIED', 'BY', 'LEXUS'],
                'has_background': True,
            },
            "engagement_instruction": {
                'image_path': f'{assets_path}/engagement_instruction_1.png',
                'position': (0, 100),
                'size': (380, 63),
                'text_in_image': ['TAP', 'THE', 'SCREEN', 'to', 'find', 'the', 'nearest', 'Lexus', 'dealership'],
                'has_background': False,
            },
            # Key was misspelled "thubmnail"; the role name is "thumbnail".
            "thumbnail": {
                'image_path': f'{assets_path}/thumbnail.jpg',
                'position': (0, 200),
                'size': (600, 300),
                # Empty list (not dict) so every element uses the same type here.
                'text_in_image': [],
                'has_background': True,
            },
        },
    }

    return {"frame_features": frame_features}

## Define the critic agent using CV tools

In [8]:
def _string_param(description):
    """Return the JSON-schema fragment for a string parameter."""
    return {"type": "string", "description": description}


# Function description advertised to the critic so it can request get_frame_features calls.
_get_frame_features_schema = {
    "name": "get_frame_features",
    "description": "Gets detailed features of the frame using computer vision.",
    "parameters": {
        "type": "object",
        "properties": {
            "frame_path": _string_param("Path to the image file."),
            "assets_path": _string_param(
                "Path to the assets folder containing the image elements to compose."
            ),
        },
        "required": ["frame_path", "assets_path"],
    },
}

# Critic settings: same config_list as the composer, with temperature 0.5.
llm_config_critic = {
    "model": "gpt-4o",
    "temperature": 0.5,
    "config_list": config_list,
    "functions": [_get_frame_features_schema],
}

# The agent that reviews composed frames and either suggests changes or terminates.
critic_agent = AssistantAgent(
    name="image_critic_agent",
    system_message="""
    You are a critic AI agent. 
    Your task is to evaluate the quality of the composed ad frames. 
    You will execute function to get frame features that uses Computer vision tools to get the features of the frame.
    Check on the design of the frame and the text in the frame, ensure not overlapping, e.t.c and use the best design principles in the critique.
    You can check for the alignment of the image and text in the frame to make it more appealing. i.e the logo, slogan and the engagement instruction can be made to start on the same vertical line.
    If the frame is not good, suggest the necessary changes in positioning the elements to be made.
    If the frame is good, just say 'All good' and return 'TERMINATE' when the evaluation is complete.
    """,
    llm_config=llm_config_critic,
)


#### Define critic agent using GPT-4o vision

#### Set up configs for groupchat manager and user agent

In [9]:
# Settings for the group chat manager: a single gpt-4o entry.
llm_config = {"config_list": [{"model": "gpt-4o"}]}

# Settings for the user proxy agent: a single gpt-3.5-turbo entry.
llm_config_user = {"config_list": [{"model": "gpt-3.5-turbo"}]}

## User proxy agent setup

In [10]:
# Prompt for the user proxy. The original began with a stray, unbalanced
# double quote and read "a the Human admin"; both are fixed here.
system_message_user = """
You are the Human admin in the groupchat.
You can interact with the image composition and the critic agents.
Execute their recommended functions and return the output as it is to the agents (Do not interpret the results).
"""


def _is_termination_msg(msg):
    """Return True when the message has content that contains 'TERMINATE'."""
    content = msg.get("content")
    return content is not None and "TERMINATE" in content


# Proxy that executes the functions requested by the other agents. It never asks
# for human input and has code execution disabled; only the two mapped functions
# can be called through it.
user_proxy = UserProxyAgent(
    name="User",
    llm_config=llm_config_user,
    system_message=system_message_user,
    is_termination_msg=_is_termination_msg,
    human_input_mode="NEVER",
    code_execution_config=False,
    # max_consecutive_auto_reply=3,
    function_map={
        "compose_ad_frame": compose_ad_frame,
        "get_frame_features": get_frame_features,
    },
)

## Setup groupchat manager

In [11]:
from autogen.agentchat.groupchat import GroupChatManager

class CustomGroupChatManager(GroupChatManager):
    """Group chat manager that gives the critic the floor once frame features arrive.

    NOTE(review): this assumes the base class exposes and calls a
    `_select_next_speaker` hook — confirm against the installed autogen
    version. If speaker selection is implemented elsewhere (e.g. on
    `GroupChat`), this override is never invoked.
    """

    def _select_next_speaker(self, last_speaker, last_message, groupchat):
        """Pick the next speaker, preferring the critic after a frame-features result.

        Args:
            last_speaker: Agent that produced `last_message`; only its `name` is read.
            last_message: Message mapping; only its "content" entry is read.
            groupchat: Chat object providing `agent_names` and `agent_by_name`.

        Returns:
            The "image_critic_agent" agent when the User just returned content
            mentioning "frame_features" and that agent is in the chat; otherwise
            whatever the parent implementation selects.
        """
        # `.get("content", {})` only falls back when the key is missing; a message
        # whose content is explicitly None would make the `in` test raise TypeError.
        content = last_message.get("content") or ""

        # Prioritize image_critic_agent when frame_features are ready
        if (last_speaker.name == "User"
                and "frame_features" in content
                and "image_critic_agent" in groupchat.agent_names):
            return groupchat.agent_by_name("image_critic_agent")

        # For all other cases, defer to the parent implementation.
        return super()._select_next_speaker(last_speaker, last_message, groupchat)


In [12]:

# Participants of the chat, starting with an empty message history.
chat_agents = [user_proxy, image_composition_agent, critic_agent]
groupchat = GroupChat(agents=chat_agents, messages=[])

# Use the custom manager, configured with the manager-level LLM settings.
manager = CustomGroupChatManager(groupchat=groupchat, llm_config=llm_config)


## Initiate the conversation

In [14]:
# Run each feature helper against the header asset and print its result,
# in the order: dimensions, transparency flag, extracted text positions.
image_path = f'{assets_path}/header.jpg'
for probe in (get_image_dimensions, has_transparency, extract_text_with_positions):
    print(probe(image_path))

(600, 200)
False
{'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90)}


In [19]:
def get_assets_features(image_path: str, image_role: str) -> dict:
    """Collect the features of one asset image.

    This definition was commented out while the cell still called it, which
    raises NameError on a fresh kernel.

    Args:
        image_path: Path of the image to inspect.
        image_role: Role label for the image; echoed back in the result.

    Returns:
        A dict with the role, the result of ``get_image_dimensions``, the
        result of ``extract_text_with_positions`` and the result of
        ``has_transparency`` for ``image_path``.
    """
    return {
        'image_role': image_role,
        "dimensions": get_image_dimensions(image_path),
        'text_in_image_with_location': extract_text_with_positions(image_path),
        'has_transparency': has_transparency(image_path)
    }


# Role -> image path for every asset used in the frame.
assets_inputs = {
    "header": f'{assets_path}/header.jpg',
    "engagement_instruction": f'{assets_path}/engagement_instruction_1.png',
    "thumbnail": f'{assets_path}/thumbnail.jpg'
}

# Features of every asset, keyed by role.
get_all_assets_features = {
    image_role: get_assets_features(image_path, image_role)
    for image_role, image_path in assets_inputs.items()
}
print(get_all_assets_features)



{'image_role': 'header', 'dimensions': (600, 200), 'text_in_image_with_location': {'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90)}, 'has_transparency': False}
{'header': {'image_role': 'header', 'dimensions': (600, 200), 'text_in_image_with_location': {'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90)}, 'has_transparency': False}, 'engagement_instruction': {'image_role': 'engagement_instruction', 'dimensions': (380, 63), 'text_in_image_with_location': {'TAP': (0, 1, 64, 28), 'THE': (75, 1, 140, 28), 'SCREEN': (152, 0, 293, 29), 'to': (0, 40, 21, 58), 'find': (27, 36, 65, 58), 'the': (73, 36, 106, 58), 'nearest': (114, 40, 192, 58), 'Lexus': (200, 36, 261, 58), 'dealership': (268, 36, 380, 63)}, 'has_transparency': True}, 'thumbnail': {'image_role': 'thumbnail', 'dimensions': (600, 300), 'text_in_image_with_location': {}, 'has_transparency': False}}


In [15]:
# Build the assets folder from PROJECT_PATH and game_id instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
assets_path = os.path.join(PROJECT_PATH, 'data', 'Assets', game_id)

# File names the agent needs in order to build image paths from assets_path.
asset_file_names = ['header.jpg', 'engagement_instruction_1.png', 'thumbnail.jpg']

system_instruction = f"""
The path to the assets folder is {assets_path}

Use the assets path and the image names to get the image path

The output path is the data folder can be derived from assets path with the name composed_image_frame.jpg

Return 'TERMINATE' when the task is done.
"""

# The original message stopped at "These are the" and never listed the image
# names that the instructions above tell the agent to use.
user_message = f"""
Compose an AD Frame with the dimensions 600x500 for StoryBoard
This is the concept. Place the header image at the top-left corner.
Place the engagement instruction just below the logo in the header image
Place the thumbnail below the header image.

These are the image names: {', '.join(asset_file_names)}
"""

final_message = user_message + system_instruction

# Start the group chat through the manager, capped at 30 turns.
chat_result = user_proxy.initiate_chat(manager, message=final_message, max_turns=30)

NameError: name 'user_proxy' is not defined