In [6]:
from autogen import ConversableAgent, AssistantAgent, GroupChatManager, GroupChat, UserProxyAgent

### Function tools

##### Setup env

In [7]:
import os
import sys
from dotenv import load_dotenv

load_dotenv()
PROJECT_PATH = os.environ.get('PROJECT_PATH')

# Add the project root path to sys.path
if PROJECT_PATH not in sys.path:
    sys.path.insert(0, PROJECT_PATH)

In [8]:
from src.image_composition import compose_ad_frame
from src.feature_extraction import extract_text_with_positions, has_transparency, get_image_dimensions


#### Test compose ad frame

In [9]:

game_id = "0a22f881b77f00220f2034c21a18b854"
assets_path = os.path.join(PROJECT_PATH, 'data', 'Assets', game_id)

# Example usage
# assets_path = '/path/to/assets'
elements = [
    {'image_path': f'{assets_path}/header.jpg', 'position': (0, 0), 'has_background': True},
    {'image_path': f'{assets_path}/engagement_instruction_1.png', 'position': (40, 100), 'has_background': False},
    {'image_path': f'{assets_path}/thumbnail.jpg', 'position': (0, 200), 'size': get_image_dimensions(f'{assets_path}/thumbnail.jpg'), 'has_background': True}
]

composed_frame = compose_ad_frame(600, 500, elements)
# composed_frame.show()  # Or save using composed_frame.save('composed_frame.jpg')
print(composed_frame)


composed_image_frame.jpg


#### Define image composition agent

In [10]:
import os

from autogen import ConversableAgent

config_list = [
    {
        "model": "gpt-4o",  # Specifies the model version to be used
        "temperature": 0.7,  # Keeps the creativity level
    }
]


llm_config_img_composition = {
    "model": "gpt-4o", # Updated to the latest model version
    "temperature": 0.7,  # Keeps the creativity level
     "config_list": config_list,  # References the LLM configuration defined above
    "functions": [
        {
            "name": "compose_ad_frame",
            "description": "Composes an advertisement frame using multiple image elements.",
            "parameters": {
                "type": "object",
                "properties": {
                    "frame_width": {
                        "type": "integer",
                        "description": "Width of the desired frame."
                    },
                    "frame_height": {
                        "type": "integer",
                        "description": "Height of the desired frame."
                    },
                    "elements": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "image_path": {
                                    "type": "string",
                                    "description": "Path to the image file."
                                },
                                "position": {
                                    "type": "array",
                                    "items": {
                                        "type": "integer"
                                    },
                                    "minItems": 2,
                                    "maxItems": 2,
                                    "description": "(x, y) coordinates of the top-left corner."
                                },
                                "size": {
                                    "type": "array",
                                    "items": {
                                        "type": "integer"
                                    },
                                    "minItems": 2,
                                    "maxItems": 2,
                                    "description": "(width, height) to resize to (maintaining aspect ratio), optional."
                                },
                                "has_background": {
                                    "type": "boolean",
                                    "description": "Whether the image has a background (True) or is transparent (False), optional."
                                }
                            },
                            "required": ["image_path", "position"]
                        },
                        "description": "List of dictionaries, each containing details about the image to be composed."
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save the composed ad frame, optional."
                    },
                },
                "required": ["frame_width", "frame_height", "elements"]
            }
        }
    ]
}
# Let's first define the assistant agent that suggests tool calls.
image_composition_agent = AssistantAgent(
    name="image_composition_agent",
    system_message="""You are a helpful AI assistant.
    You task is to compose an AD Frame for StoryBoard using given assets to create an engaging advertisement.
    You are to use the 'compose_ad_frame' function to achieve this given the concept and assets details.
    Return 'TERMINATE' when the task is done.
    """,
    llm_config=llm_config_img_composition,
)


#### Get frame features function result example (placeholder)

In [11]:
# def get_frame_features(frame_path: str, assets_path: str) -> dict:
#   frame_features = {
#     "frame_path": frame_path,
#     "ad_frame_dimensions": (600, 500),
#     'text_bounded_box_format': ("left", "top", "right", "bottom"),
#     'text_bounding_boxes': {'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90), 'TAP': (1, 101, 65, 128), 'THE': (76, 101, 141, 128), 'SCREEN': (153, 100, 294, 129), 'to': (1, 140, 22, 158), 'find': (28, 136, 66, 158), 'the': (74, 136, 107, 158), 'nearest': (115, 140, 193, 158), 'Lexus': (201, 136, 262, 158), 'dealership': (269, 136, 381, 163)},
#     "elements": {
#         "header": {
#           'image_path': f'{assets_path}/header.jpg', 
#           'position': (0, 0),
#           'size': (600, 200),
#           'text_in_image': ['Ovexus', 'L/CERTIFIED', 'BY', 'LEXUS'],
#           'has_background': True
#         },
#         "engagement_instruction": {
#           'image_path': f'{assets_path}/engagement_instruction_1.png', 
#           'position': (0, 100),
#           'size': (380, 63),
#           'text_in_image': ['TAP', 'THE', 'SCREEN', 'to', 'find', 'the', 'nearest', 'Lexus', 'dealership'],
#           'has_background': False
#         },
#         "thubmnail": {
#           'image_path': f'{assets_path}/thumbnail.jpg', 
#           'position': (0, 200), 
#           'size': (600, 300),
#           'text_in_image': {},
#           'has_background': True
#         }
#     }
#   }
  
#   return {
#     "frame_features": frame_features
#   }

In [12]:
### More dynamic way to extract text from image
def get_frame_features(frame_path: str, assets_path: str) -> dict:
  frame_features = {
    "frame_path": frame_path,
    "ad_frame_dimensions": (600, 500),
    'text_bounded_box_format': ("left", "top", "right", "bottom"),
    'text_bounding_boxes': {'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90), 'TAP': (1, 101, 65, 128), 'THE': (76, 101, 141, 128), 'SCREEN': (153, 100, 294, 129), 'to': (1, 140, 22, 158), 'find': (28, 136, 66, 158), 'the': (74, 136, 107, 158), 'nearest': (115, 140, 193, 158), 'Lexus': (201, 136, 262, 158), 'dealership': (269, 136, 381, 163)},
    "elements": {
        "header": {
          'image_path': f'{assets_path}/header.jpg', 
          'position': (0, 0),
          'size': (600, 200),
          'text_in_image': ['Ovexus', 'L/CERTIFIED', 'BY', 'LEXUS'],
          'has_background': True
        },
        "engagement_instruction": {
          'image_path': f'{assets_path}/engagement_instruction_1.png', 
          'position': (0, 100),
          'size': (380, 63),
          'text_in_image': ['TAP', 'THE', 'SCREEN', 'to', 'find', 'the', 'nearest', 'Lexus', 'dealership'],
          'has_background': False
        },
        "thubmnail": {
          'image_path': f'{assets_path}/thumbnail.jpg', 
          'position': (0, 200), 
          'size': (600, 300),
          'text_in_image': {},
          'has_background': True
        }
    }
  }
  
  return {
    "frame_features": frame_features
  }

## Define the critic agent using CV tools

In [13]:
llm_config_critic = {
    "model": "gpt-4o",  # Specify the model version for the critic
    "temperature": 0.5,  # Adjust the temperature for evaluation
    "config_list": config_list,  # Use the same LLM configuration list
    "functions": [
        {
            "name": "get_frame_features",
            "description": "Gets detailed features of the frame using computer vision.",
            "parameters": {
                "type": "object",
                "properties": {
                    "frame_path": {
                        "type": "string",
                        "description": "Path to the image file."
                    },
                    "assets_path": {
                        "type": "string",
                        "description": "Path to the assets folder containing the image elements to compose."
                    }
                },
                "required": ["frame_path", "assets_path"]
            }
        }
    ]
}

critic_agent = AssistantAgent(
    name="image_critic_agent",
    system_message="""
    You are a critic AI agent. 
    Your task is to evaluate the quality of the composed ad frames. 
    You will execute function to get frame features that uses Computer vision tools to get the features of the frame.
    If the frame is not good, suggest the necessary changes in positioning the elements to be made.
    If the frame is good, just say 'All good' and return 'TERMINATE' when the evaluation is complete.
    
    These are the design principles to consider:
    - Check for balance and organization within the frame.
    - Ensure no elements overlap or create unnecessary clutter.
    - Consider using white space effectively to guide the viewer's eye.
    """,
    llm_config=llm_config_critic,
)


#### Define critic agent using GPT-4o vision

#### Set up configs for groupchat manager and user agent

In [9]:
llm_config = {
  "config_list": 
    [
      {"model": "gpt-4o"}
    ]
  }

llm_config_user = {
  "config_list": 
    [
      {"model": "gpt-3.5-turbo"}
    ]
}

## User proxy agent setup

In [10]:
system_message_user = """
"You are a the Human admin in the groupchat. 
You can interact with the image composition and the critic agents.
Execute their recommended functions and return the output as it is to the agents (Do not interpret the results).
"""

user_proxy = UserProxyAgent(
    name="User",
    llm_config=llm_config_user,
    system_message=system_message_user,
    is_termination_msg=lambda msg: msg.get("content") is not None and "TERMINATE" in msg["content"],
    human_input_mode="NEVER",
    code_execution_config=False,
    # max_consecutive_auto_reply=3,
    function_map={
        "compose_ad_frame": compose_ad_frame,
        "get_frame_features": get_frame_features
    }
)

## Setup groupchat manager

In [11]:
from autogen.agentchat.groupchat import GroupChatManager

class CustomGroupChatManager(GroupChatManager):
    def _select_next_speaker(self, last_speaker, last_message, groupchat):
        # Prioritize image_critic_agent when frame_features are ready
        if (last_speaker.name == "User" and 
            "frame_features" in last_message.get("content", {}) and 
            "image_critic_agent" in groupchat.agent_names):
            return groupchat.agent_by_name("image_critic_agent")

        # For all other cases, use default behavior (or your custom logic)
        return super()._select_next_speaker(last_speaker, last_message, groupchat)


In [12]:

groupchat = GroupChat(agents=[user_proxy, image_composition_agent, critic_agent], messages=[])
manager = CustomGroupChatManager(groupchat=groupchat, llm_config=llm_config)  # Use the custom manager


## Initiate the conversation

In [13]:
image_path = f'{assets_path}/header.jpg'
print(get_image_dimensions(image_path))
print(has_transparency(image_path))
print(extract_text_with_positions(image_path))

(600, 200)
False
{'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90)}


In [40]:
def extract_and_combine_text(data):
  """
  Extracts text from the 'text_in_image_with_location' key within a dictionary
  and attempts to combine them into a sentence, handling potential missing data.

  Args:
      data: A dictionary containing text information.

  Returns:
      A string representing the combined text or an empty string if no text is found.
  """
  text_list = []
  for key, value in data.items():
    # Check if 'text_in_image_with_location' exists and has a value
    if 'text_in_image_with_location' in value and value['text_in_image_with_location']:
      text_list.extend(value['text_in_image_with_location'].keys())
    # Alternatively, use get method with a default empty dictionary
    # text_list.extend(value.get('text_in_image_with_location', {}).keys())
  return " ".join(text_list)


In [38]:
def get_assets_features(image_path: str, image_role: str) -> dict:
    return {
        'image': image_role,
        # 'image_path': image_path,
        "dimensions": get_image_dimensions(image_path),
        'text_in_image_with_location': extract_text_with_positions(image_path),
        'has_transparency': has_transparency(image_path)
    }
    
# print(get_assets_features(image_path, 'header'))
assets_inputs = {
    "header": f'{assets_path}/header.jpg',
    "engagement_instruction": f'{assets_path}/engagement_instruction_1.png',
    "thumbnail": f'{assets_path}/thumbnail.jpg'
}

all_assets_features = {image_role: get_assets_features(image_path, image_role) for image_role, image_path in assets_inputs.items()}
print(all_assets_features)



{'header': {'image': 'header', 'dimensions': (600, 200), 'text_in_image_with_location': {'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90)}, 'has_transparency': False}, 'engagement_instruction': {'image': 'engagement_instruction', 'dimensions': (380, 63), 'text_in_image_with_location': {'TAP': (0, 1, 64, 28), 'THE': (75, 1, 140, 28), 'SCREEN': (152, 0, 293, 29), 'to': (0, 40, 21, 58), 'find': (27, 36, 65, 58), 'the': (73, 36, 106, 58), 'nearest': (114, 40, 192, 58), 'Lexus': (200, 36, 261, 58), 'dealership': (268, 36, 380, 63)}, 'has_transparency': True}, 'thumbnail': {'image': 'thumbnail', 'dimensions': (600, 300), 'text_in_image_with_location': {}, 'has_transparency': False}}


In [52]:
def summarize_output_data(output_data):
    """
    Summarizes the output data by extracting key information and formatting it into a concise string.

    Args:
        output_data (dict): The output data containing text information and dimensions.

    Returns:
        str: A summarized string of the output data.
    """
    summary_parts = []

    for section, content in output_data.items():
        if 'dimensions' in content:
            dims = 'x'.join(map(str, content['dimensions']))
            summary_parts.append(f"{section} dims: {dims}")

        if 'text_in_image_with_location' in content and content['text_in_image_with_location']:
            text_summary = '; '.join([f"{text}: {','.join(map(str, loc))}" for text, loc in content['text_in_image_with_location'].items()])
            summary_parts.append(f"{section}_text: {text_summary}")

    return '. '.join(summary_parts)

# Example usage
output_data = {
    'header': {'image': 'header', 'dimensions': (600, 200), 'text_in_image_with_location': {'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90)}, 'has_transparency': False}, 
    'engagement_instruction': {'image': 'engagement_instruction', 'dimensions': (380, 63), 'text_in_image_with_location': {'TAP': (0, 1, 64, 28), 'THE': (75, 1, 140, 28), 'SCREEN': (152, 0, 293, 29), 'to': (0, 40, 21, 58), 'find': (27, 36, 65, 58), 'the': (73, 36, 106, 58), 'nearest': (114, 40, 192, 58), 'Lexus': (200, 36, 261, 58), 'dealership': (268, 36, 380, 63)}, 'has_transparency': True}, 
    'thumbnail': {'image': 'thumbnail', 'dimensions': (600, 300), 'text_in_image_with_location': {}, 'has_transparency': False}
}

summary = summarize_output_data(output_data)
print(summary)

header dims: 600x200. header_text: Ovexus: 50,27,245,60; L/CERTIFIED: 50,77,161,90; BY: 174,77,198,90; LEXUS: 211,77,283,90. engagement_instruction dims: 380x63. engagement_instruction_text: TAP: 0,1,64,28; THE: 75,1,140,28; SCREEN: 152,0,293,29; to: 0,40,21,58; find: 27,36,65,58; the: 73,36,106,58; nearest: 114,40,192,58; Lexus: 200,36,261,58; dealership: 268,36,380,63. thumbnail dims: 600x300


In [56]:
import base64
import os
from dotenv import load_dotenv
import requests

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")

def encode_image(image_path: str) -> str:
    """
    Encodes an image file to a base64 string.

    Parameters:
    - image_path (str): Path to the image file.

    Returns:
    - str: Base64 encoded string of the image.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def describe_image(image_path: str) -> str:
    """
    Describes an image by sending it to the GPT-4o API and getting the description.

    Parameters:
    - image_path (str): Path to the image file.

    Returns:
    - str: Description of the image.
    """
    base64_image = encode_image(image_path)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                      "type": "text", 
                      "text": f"""
                        What’s in this image? You are a design critic. 
                        Your task is to evaluate the composition of an ad frame and suggest improvements based on design principles. 
                        Consider balance, organization, use of white space, and visual appeal.
                        You shall provide your response with concise feedback on critic and suggest improvements and must provide better positioning of elements like this thumbnail: 200x300
                        Be concise, do not exceed 300 tokens.
                        These below are the features of the assets composing the adframe, use this to make informed suggestions. 
                        {summary}
                        """
                    },
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }
        ],
        "max_tokens": 300
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    response_data = response.json()
    print(response_data)
    return response_data.get("choices", [{}])[0].get("message", {}).get("content", "")

description = describe_image("/home/hillary_kipkemoi/Automated-Storyboard-Synthesis-Digital-Advertising/data/composed_image_frame.jpg")


{'id': 'chatcmpl-9nAFmCrmSnpsF5K5YlHSvv1sX2Odr', 'object': 'chat.completion', 'created': 1721504514, 'model': 'gpt-4o-2024-05-13', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '### Critique and Suggestions\n\n**Critique:**\n1. **Balance:**\n   - The ad frame lacks vertical balance; the top part feels a bit cluttered because the header and engagement instruction are closely packed together.\n\n2. **Organization:**\n   - The engagement instruction text below the header is well-organized but occupies too much vertical space, interrupting the flow to the central image.\n   \n3. **White Space:**\n   - There’s too much white space above the header and too little below the engagement instruction, creating an imbalance.\n\n4. **Visual Appeal:**\n   - The central image of the camera lens is compelling but somewhat disconnected from the instructional text above it.\n\n**Suggestions:**\n1. **Header Positioning:**\n   - Increase the white space above the header to give it m

In [57]:
print(description)

### Critique and Suggestions

**Critique:**
1. **Balance:**
   - The ad frame lacks vertical balance; the top part feels a bit cluttered because the header and engagement instruction are closely packed together.

2. **Organization:**
   - The engagement instruction text below the header is well-organized but occupies too much vertical space, interrupting the flow to the central image.
   
3. **White Space:**
   - There’s too much white space above the header and too little below the engagement instruction, creating an imbalance.

4. **Visual Appeal:**
   - The central image of the camera lens is compelling but somewhat disconnected from the instructional text above it.

**Suggestions:**
1. **Header Positioning:**
   - Increase the white space above the header to give it more breathing room.
   - Example - Thumbnail: Header: 200 x 50

2. **Engagement Instruction:**
   - Move the engagement instruction text closer to the camera lens image to create a more cohesive visual connection.
   -

In [16]:
assets_path = '/home/hillary_kipkemoi/Automated-Storyboard-Synthesis-Digital-Advertising/data/Assets/0a22f881b77f00220f2034c21a18b854/'
system_instruction = f"""
The path to the assets folder is {assets_path}

Use the assets path and the image names to get the image path

The output path is the data folder can be derived from assets path with the name composed_image_frame.jpg

Return 'TERMINATE' when the task is done.
"""

user_message = f"""
Compose an AD Frame with the dimensions 600x500 for StoryBoard
This is the concept. Place the header image at the top-left corner.
Place the engagement instruction just below the logo in the header image but on the header image.
Place the thumbnail below the header image.

These are the features of the different assets separately. Use them to compose the AD Frame.
{get_all_assets_features}
"""

final_message = user_message + system_instruction

chat_result = user_proxy.initiate_chat(manager, message=final_message, max_turns=30)

[33mUser[0m (to chat_manager):


Compose an AD Frame with the dimensions 600x500 for StoryBoard
This is the concept. Place the header image at the top-left corner.
Place the engagement instruction just below the logo in the header image but on the header image.
Place the thumbnail below the header image.

These are the features of the different assets separately. Use them to compose the AD Frame.
{'header': {'image_role': 'header', 'image_path': '/home/hillary_kipkemoi/Automated-Storyboard-Synthesis-Digital-Advertising/data/Assets/0a22f881b77f00220f2034c21a18b854/header.jpg', 'dimensions': (600, 200), 'text_in_image_with_location': {'Ovexus': (50, 27, 245, 60), 'L/CERTIFIED': (50, 77, 161, 90), 'BY': (174, 77, 198, 90), 'LEXUS': (211, 77, 283, 90)}, 'has_transparency': False}, 'engagement_instruction': {'image_role': 'engagement_instruction', 'image_path': '/home/hillary_kipkemoi/Automated-Storyboard-Synthesis-Digital-Advertising/data/Assets/0a22f881b77f00220f2034c21a18b854/engagemen