In [113]:
from dotenv import load_dotenv
import pandas as pd
import os
import base64
import requests

In [114]:
# Load variables from a local .env file into the process environment;
# describe_image later reads OPENAI_API_KEY via os.getenv.
load_dotenv()

# Folder holding the screenshots, one "<screenId>.jpg" per screen.
IMAGE_INPUT_FOLDER = "../data/s2w_sample1/"

# CSV with one row per screen; also the file the results are written back to.
S2W = "../data/s2w_summarized.csv"


s2w = pd.read_csv(S2W)
# The notebook saves this frame with its index, which comes back as an
# "Unnamed: 0" column on the next load. errors="ignore" keeps the cell working
# when the file has no such column (e.g. a CSV written without an index).
s2w = s2w.drop(columns=["Unnamed: 0"], errors="ignore")

In [115]:
def encode_image(id):
    """Return the screenshot of screen ``id`` as a base64-encoded string.

    The image is read in binary mode from ``IMAGE_INPUT_FOLDER`` using the
    file name ``<id>.jpg``; the encoded bytes are decoded as UTF-8 text so
    they can be embedded in a JSON payload.
    """
    jpg_location = IMAGE_INPUT_FOLDER + f"{id}.jpg"
    with open(jpg_location, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
   

In [116]:
def describe_image(id, timeout=120):
    """Ask the OpenAI chat-completions endpoint to describe a screenshot.

    The request carries two user messages: a few-shot text prompt (two worked
    question/answer examples followed by an open question) and the screenshot
    of screen ``id`` embedded as a base64 JPEG data URL.

    Args:
        id: Screen identifier; forwarded to ``encode_image`` to load the image.
        timeout: Seconds to wait for the HTTP request before giving up, passed
            to ``requests.post``. Without it a stalled connection would block
            the notebook indefinitely.

    Returns:
        The raw ``requests.Response``. The HTTP status is NOT checked here;
        the caller is responsible for inspecting it and parsing the JSON body.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset
            or empty.
        Exceptions raised by ``encode_image`` (e.g. a missing image file) or by
        ``requests.post`` (connection errors, timeouts) propagate unchanged.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        # Fail fast instead of sending "Bearer None" and getting an auth error back.
        raise RuntimeError("OPENAI_API_KEY is not set; add it to the environment or the .env file")
    base64_image = encode_image(id)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # NOTE: the prompt below is sent verbatim, including the leading
    # indentation of every line inside the triple-quoted string.
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": 
                    """
                    Q: Given a screenshot of a mobile page. Provide a text in the form of a list, that describes the functional requirements of the given mobile page. Focus solely on the required functionality. Ignore Layout and design characteristics. Include only information, that is visible in the screenshot, by Ignoring the android status bar at the top of the screenshot and the android navigation bar at the bottom. If a popup is visible, ignore everything else but the the popup.

                    A:
                    The mobile page is a language-learning application with the following functionalities:

                    1. Display a sentence in a foreign language that the user is expected to translate.
                    2. Provide audio playback of the sentence for the user to hear the pronunciation.
                    3. Allow the user to input their translation into a text field.
                    4. Submit the translation for verification by clicking a 'CHECK' button.
                    5. Offer an option to skip the current sentence and move to the next one with a 'SKIP' button.
                    6. Display a progress bar or indicator to show the user's progress through the lesson or activity.

                    Q: Given a screenshot of a mobile page. Provide a text in the form of a list, that describes the functional requirements of the given mobile page. Focus solely on the required functionality. Ignore Layout and design characteristics. Include only information, that is visible in the screenshot, by Ignoring the android status bar at the top of the screenshot and the android navigation bar at the bottom. If a popup is visible, ignore everything else but the the popup.

                    A:
                    The mobile page is a fitness app with the following functionalities:

                    1. Start a workout session by interacting with a 'START WORKOUT' button.
                    2. Select different types of workouts, such as "Full Body" workouts, as an option for the user.
                    3. Choose the number of circuits the user wishes to perform during the workout.
                    4. Select an instructor or possibly a type of guidance for the workout.
                    5. Provide access to a navigation menu at the bottom, with tabs for: 'Workout' (where the user is currently) to presumably start and configure workouts. 'Learn' to offer educational content or instructions. 'Achievements' to show the user's progress or rewards. 'Track' to presumably monitor the user's workout activity or progress over time. 

                    Q: Given a screenshot of a mobile page. Provide a text in the form of a list, that describes the functional requirements of the given mobile page. Focus solely on the required functionality. Ignore Layout and design characteristics. Include only information, that is visible in the screenshot, by Ignoring the android status bar at the top of the screenshot and the android navigation bar at the bottom. If a popup is visible, ignore everything else but the the popup.

                    A:                    
                    """
                }
            ]
        },
        {
            "role": "user",
            "content": [
            {
                "type": "image_url",
                "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}"
                }
            }
            ]
        }
        ],
        # Upper bound on the length of the generated description.
        "max_tokens": 300
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    return response

In [117]:
# Describe every screen and store the model's answer in the 'llm_summary' column.
# A failure for one screen is reported and skipped so that the summaries already
# collected for the other screens are not lost.
screens = s2w["screenId"]
for screen_id in screens:
    try:
        result = describe_image(screen_id)
    except (OSError, requests.RequestException) as exc:
        # Missing/unreadable image file, connection error or timeout.
        print(f"{screen_id}: FAILED ({exc})")
        continue

    if not result.ok:
        # Error responses carry no "choices"; show the status and the start of the body.
        print(f"{screen_id}: FAILED (HTTP {result.status_code}): {result.text[:200]}")
        continue

    choices = result.json().get("choices") or []
    if not choices:
        print(f"{screen_id}: FAILED (response contains no choices)")
        continue

    # Only the first choice is used.
    s2w.loc[s2w['screenId'] == screen_id, 'llm_summary'] = choices[0].get("message", {}).get("content")
    print(f"{screen_id}: DONE")

300: DONE
33: DONE
486: DONE
494: DONE
498: DONE
549: DONE
70: DONE
761: DONE
596: DONE
495: DONE


In [118]:
# Write the summaries back to the file the frame was loaded from. Using the S2W
# constant keeps the load and save paths from drifting apart. The index is
# written on purpose: the load cell drops the resulting "Unnamed: 0" column.
s2w.to_csv(S2W)

In [119]:
# Show the generated summaries as a plain array for a quick visual check.
s2w.loc[:, "llm_summary"].values

array(['Based on the screenshot provided, the mobile page is an educational app focused on teaching JavaScript with the following functionalities:\n\n1. Present a multiple-choice question related to JavaScript to the user.\n2. Offer several answer options in radio button format for the user to select.\n3. Allow the user to submit their selected answer for validation.\n4. Display feedback indicating whether the submitted answer is correct or incorrect, in this case by displaying "Wrong" with an "X" symbol.\n5. Provide navigation options for the user to either return to the previous screen or question with a \'BACK\' button or to retry the question with a \'TRY AGAIN\' button.\n6. Show a hint that there are comments related to the question by displaying a comment count ("26 COMMENTS"), suggesting user interaction or discussion is possible.\n7. Include additional action options on the page such as \'UNLOCK\' and \'SHARE\', indicating possible features such as sharing content with others o

In [120]:
# Second save of the same frame (nothing modifies s2w after the previous save
# cell). Uses the S2W constant so the path matches the one used for loading; the
# index is written on purpose because the load cell drops "Unnamed: 0".
s2w.to_csv(S2W)