In [35]:
from dotenv import load_dotenv
import pandas as pd
import os
import base64
import requests

In [36]:
load_dotenv()

IMAGE_INPUT_FOLDER = "../data/s2w_sample1/"

S2W = "../data/s2w_summarized.csv"


s2w = pd.read_csv(S2W)
s2w.drop(columns=["Unnamed: 0"], inplace=True)

In [37]:
def encode_image(id):
    image_path = IMAGE_INPUT_FOLDER + f"{id}.jpg"
    with open(image_path,"rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')    
   

In [38]:
def describe_image(id):
    api_key = os.getenv("OPENAI_API_KEY")
    base64_image = encode_image(id)

    summary = s2w[s2w["screenId"]==id]["s2w_summary"].iloc[0]

    print(base64_image)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "Given a screenshot of a mobile page and a short description, append missing functionalities, that are present in the screenshot, to the provided description."
                }
            ] 
        },
        {
            "role": "user",
            "content": [
            {
                "type": "text",
                "text": summary
            },
            {
                "type": "image_url",
                "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}"
                }
            }
            ]
        }
        ],
        "max_tokens": 300
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    return response

In [39]:
screens = s2w["screenId"]
for screen_id in screens:
    result = describe_image(screen_id)
    s2w.loc[s2w['screenId'] == screen_id, 'llm_summary'] = result.json().get("choices")[0].get("message").get("content")
    print(f"{screen_id}: DONE")

../data/s2w_sample1/300.jpg
/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAeABDgDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDLooor3T87CiiigBcZ7UcjtVuz0q91EsLK1mn2/eKISB9TVz/hFdd76Vdf98VDnFaNmqo1JK6i2vRmR+FH4Vr/APCK67/0Crr/AL4o/wCEV13/AKBV1/3xT54d0P6vV/lf3Mx+RRVy90y/

  s2w.loc[s2w['screenId'] == screen_id, 'llm_summary'] = result.json().get("choices")[0].get("message").get("content")


300: DONE
../data/s2w_sample1/33.jpg
/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAeABDgDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDzaiiimZhRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFF

In [40]:
s2w.to_csv("../data/s2w_summarized.csv")

In [43]:
s2w["llm_summary"].values

array(['The pop-up screen you\'re describing is showing feedback to the user after they have submitted an answer to a quiz question. It indicates the answer was incorrect by showing a red "X" and the word "Wrong," and provides two options at the bottom: "BACK" and "TRY AGAIN." It seems the user has attempted the first question out of five, as indicated by "1/5" at the top. Additionally, there\'s a comment section available, denoted by "26 COMMENTS" indicating that users can interact or discuss the question. The header of the pop-up also offers options to unlock further content or share presumably the current content, as indicated by the "UNLOCK" and "SHARE" buttons. Moreover, there are social media sharing buttons right at the top for platforms such as Facebook, Twitter, and others. There\'s also a notification icon with a red dot indicating probably a new message.',
       'Based on the screenshot provided, in addition to displaying different buttons for selecting numbers, the mobile 

In [44]:
s2w

Unnamed: 0,screenId,s2w_summary,llm_summary
0,300,The mobile screen is a pop-up in a learning ap...,The pop-up screen you're describing is showing...
1,33,The mobile screen displays different buttons f...,"Based on the screenshot provided, in addition ..."
2,486,The Screen displays a pop-up with multiple sha...,The screenshot shows a mobile interface with a...
3,494,The mobile screen is part of a music app and a...,The mobile screen shows a music app interface ...
4,498,The screen is a pop-up in a podcast applicatio...,The pop-up window in the podcast application i...
5,549,The mobile screen is a settings page for a rel...,"Based on the screenshot provided, the Settings..."
6,70,The screen is part of a scientific app and dis...,The screenshot shows a mobile application with...
7,761,The mobile screen is part of a settings app an...,The mobile screen shown is indeed from a setti...
8,596,The mobile screen is part of a printer app and...,The mobile screen shows an app's advanced sett...
9,495,The screen is part of a podcast application th...,The screen shows an episode from a podcast wit...
