In [3]:
!pip install "numpy<2"

Collecting numpy<2
  Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl (13.7 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.3.0
    Uninstalling numpy-2.3.0:
      Successfully uninstalled numpy-2.3.0
Successfully installed numpy-1.26.4


In [1]:
import pandas as pd
import ast

# vqa 2000

In [10]:
file = pd.read_csv("vqa_dataset.csv")
file.head(1)

Unnamed: 0,Full Prompt
0,Prompt: A mystical dragonfly perched on a will...


In [4]:
import re
from typing import List, Dict

In [5]:
def extract_prompt_data(text: str) -> List[Dict[str, str]]:
    # Step 1: Normalize text
    text = text.strip()

    # Step 2: Try to find the "Prompt:" and "Questions:" section
    # (.*?): . matches any char except for new line, * matches 0/more of the preceeding char, ? means non-greedy
    match = re.search(r'Prompt:\s*(.*?)\s*Questions:', text, re.DOTALL | re.IGNORECASE)
    if not match:
        return []  # Return empty if not matched

    description = match.group(1).strip()

    # Step 3: Extract individual question blocks
    # text.split("Questions:")[-1]: extracts the text blocks after "Questions:"
    # re.split(r'\n?\s*\d+[).]', ...): splits the text into a list of question blocks by detecting question numbers (1., 2), etc.)
    # [).]: matches either a closing parenthesis ) or a dot . (to handle both 1. and 2) formats)
    question_blocks = re.split(r'\n?\s*\d+[).]', text.split("Questions:")[-1])

    results = []

    for block in question_blocks:
        block = block.strip()
        # if not empty string, continue
        if not block:
            continue

        # Extract question (first line before - or choice pattern)
        lines = block.splitlines()
        question_line = lines[0].strip() if lines else ""
        
        # Try to extract choices using several common patterns
        # inside (): \[: matches one [, [^\]]+: matches 1+ non-] char, \]: matches one ]
        # still inside () after |: [(]: matches one (, [^)]*: matches 0+ non-) char, [)]: natches one )
        choices_match = re.search(r'choices\s*[:=]?\s*(\[[^\]]+\]|[(][^)]*[)])', block, re.IGNORECASE)
        if not choices_match:
            choices_match = re.search(r'\(([^)]*?)\?\)', block)  # Pattern like (A) this, (B) that?

        if choices_match:
            raw_choices = choices_match.group(1)
            choices = raw_choices.strip()
        else:
            choices = ""

        # Extract answer using flexible regex
        # (?:[\n\r]|$): non-capturing group that matches either line break or end of string
        answer_match = re.search(r'answer[s]?\s*[:=]?\s*["]?(.*?)["]?(?:[\n\r]|$)', block, re.IGNORECASE)
        if answer_match:
            answer = answer_match.group(1).strip().rstrip(".")
        else:
            answer = ""

        results.append({
            "description": description,
            "question": question_line,
            "choices": choices,
            "answer": answer
        })

    return results


In [7]:
first = extract_prompt_data(file.loc[0, "Full Prompt"])
first

[{'description': 'A mystical dragonfly perched on a willow branch, emitting an ethereal glow.',
  'question': 'Is the creature in this image a reptile?',
  'choices': '["no", "yes"]',
  'answer': 'yes'},
 {'description': 'A mystical dragonfly perched on a willow branch, emitting an ethereal glow.',
  'question': 'What type of light is the dragonfly radiating?',
  'choices': '["red", "green", "blue", "ethereal"]',
  'answer': 'ethereal'},
 {'description': 'A mystical dragonfly perched on a willow branch, emitting an ethereal glow.',
  'question': 'Is there a pond or other body of water in this scene?',
  'choices': '["no", "yes"]',
  'answer': '[none]'},
 {'description': 'A mystical dragonfly perched on a willow branch, emitting an ethereal glow.',
  'question': 'What color is the willow branch?',
  'choices': '["blue", "green", "purple", "red"]',
  'answer': 'green'}]

In [13]:
type(file["Full Prompt"])

pandas.core.series.Series

In [14]:
columns = ["description", "question", "choices", "answer"]
final_df = pd.DataFrame(columns=columns)

rows = []
for raw_text in file["Full Prompt"].tolist():
    prompt_data = extract_prompt_data(raw_text)
    for row in prompt_data:
        if isinstance(row["answer"], str) and "none" in row["answer"].lower():
            row["answer"] = None
        rows.append(row)

final_df = pd.DataFrame(rows)

In [16]:
final_df.head(12)

Unnamed: 0,description,question,choices,answer
0,A mystical dragonfly perched on a willow branc...,Is the creature in this image a reptile?,"[""no"", ""yes""]",yes
1,A mystical dragonfly perched on a willow branc...,What type of light is the dragonfly radiating?,"[""red"", ""green"", ""blue"", ""ethereal""]",ethereal
2,A mystical dragonfly perched on a willow branc...,Is there a pond or other body of water in this...,"[""no"", ""yes""]",
3,A mystical dragonfly perched on a willow branc...,What color is the willow branch?,"[""blue"", ""green"", ""purple"", ""red""]",green
4,A mystical dragonfly perched on a willow branc...,Is the creature in this image a reptile?,"[""no"", ""yes""]",yes
5,A mystical dragonfly perched on a willow branc...,What type of light is the dragonfly radiating?,"[""red"", ""green"", ""blue"", ""ethereal""]",ethereal
6,A mystical dragonfly perched on a willow branc...,Is there a pond or other body of water in this...,"[""no"", ""yes""]",
7,A mystical dragonfly perched on a willow branc...,What color is the willow branch?,"[""blue"", ""green"", ""purple"", ""red""]",green
8,"A majestic unicorn in a forest clearing, surro...",Is there an animal present in the image?,"[""no"", ""yes""]",yes
9,"A majestic unicorn in a forest clearing, surro...",What type of animal is it?,"[""lion"", ""unicorn"", ""elephant""]",unicorn


In [20]:
final_df.to_csv("vqa2000.csv", index=False)

# vqa 8000

In [2]:
import pandas as pd
import re
from typing import List, Dict

Different versions of generated prompts:
version 1:
""XXX

Prompt: XXX
Questions:
1. Is the night market located in a major city?
    - choices: [yes/no]
    - answer: yes
2. What type of fruit is the vendor selling?
    - choices: [apples, durians, bananas, oranges]
    - answer: durians
3. What is the material of the Buddha statue?
    - choices: [wood, stone, bronze, gold]
    - answer: gold
4. Is the market situated indoors or outdoors?
    - choices: [indoors, outdoors, on a rooftop, underwater]
    - answer: outdoors"

version 2:
"--- Sample 3 (a fairy circle deep within a mossy glade) ---
XXX

\*\*Description:\*\* XXX

\*\*Questions:\*\*

1. Is the faerie queen surrounded by a group of other faeries?
    - choices: Yes, No
    - answer: No
2. Is the fairy circle made of natural materials?
    - choices: Yes, No, Man-made
    - answer: No
3. Is the faerie queen holding a magical staff or object?
    - choices: Yes, No, Can't see
    - answer: Can't see
4. Is the mossy glade located in a dense forest?
    - choices: Yes, No, Undisclosed
    - answer: Undisclosed"

version 3:
"XXX

\*\*Description:\*\* XXX
\*\*Questions:\*\*

1. Is Lyra a human?
    - a) Yes
    - b) No
    - answer: a) Yes

2. What is Lyra doing on the bridge?
    - a) Gazing at the clouds
    - b) Playing a harp
    - c) Flying with the birds
    - answer: a) Gazing at the clouds

3. Is the bridge made of a natural material?
    - a) Wood
    - b) Clouds
    - c) Metal
    - answer: b) Clouds

4. What is the atmosphere above the island like?
    - a) Sunny
    - b) Misty
    - c) Stormy
    - answer: b) Misty"
and sometimes the "\*\*Description:\*\*" can also be "\*\*Image Description:\*\*"

version 4:
"XXX
Prompt: XXX

Questions:
1. Is the astronaut in the image a male or a female?
    - Male
    - Female
    - Neither (it's a robot or an alien)
    - Answer: Female

2. What is the astronaut looking at?
    - The horizon
    - A distant building
    - A strange rock formation
    - A bio-dome
    - Answer: The horizon

3. What is the purpose of the bio-domes in the image?
    - To grow crops
    - To house Martian creatures
    - To provide a safe environment for humans
    - To filter the air
    - Answer: To provide a safe environment for humans

4. Is the terraformed Mars colony self-sustaining?
    - Yes
    - No
    - Partially
    - Answer: Partially

XXX""

In [None]:
def parse_prompt_to_dicts(text):
    """Parse prompt text into list of question dictionaries"""
    # Extract description (handles all variants)
    # ?: matches any of the following texts
    # (.*?): group 1
    desc_match = re.search(
        r'(?:Description|Image Description|Prompt):\s*(.*?)\s*(?:\*\*Questions:|Questions:)', 
        text, 
        re.DOTALL | re.IGNORECASE
    )
    description = desc_match.group(1).strip() if desc_match else None
    
    # Extract all questions and answers
    questions = []
    # add \n to make sure that we split the number that starts a new line
    # [1:] discards the empty first element after split
    question_blocks = re.split(r'\n\s*\d+\.', text)[1:]  # split by question numbers
    
    for block in question_blocks:
        if not block.strip(): # if the block is empty
            continue # skip the current loop and continue to the next loop
            
        # Extract question text
        # matches until sees end of string ($) or sees -
        question_match = re.match(r'\s*(.*?)\s*(?:-|$)', block)
        if not question_match:
            continue
        question = question_match.group(1).strip()
        
        # Extract choices (handles all formats)
        choices = []
        choice_lines = re.findall(
            r'(?:-\s*choices?:\s*([^\n]+))|'  # Version 1/2, ([^\n]+): () captures this as group 1, [^\n] matches any character except a newline, + matches 1 or more of these characters
            r'(?:-\s*([^\n]+)\n(?!\s*- Answer))|'  # Version 4
            r'(?:[a-z]\)\s*([^\n]+)\n)',  # Version 3
            block,
            re.IGNORECASE
        )
        for choice_group in choice_lines:
            choice = next((c for c in choice_group if c), None)
            if choice and 'answer:' not in choice.lower():
                choices.append(choice.strip())
        
        # Extract answer (handles all formats)
        answer_match = re.search(
            r'(?:answer|Answer):?\s*([^\n]+)', 
            block, 
            re.IGNORECASE
        )
        answer = answer_match.group(1).strip() if answer_match else None
        
        if answer:
            # Clean answer (remove a)/b) prefixes if present)
            answer = re.sub(r'^[a-z]\)\s*', '', answer, flags=re.IGNORECASE)
            
            # Remove "Answer:" prefix if it exists in the choices
            choices = [c for c in choices if not re.match(r'^Answer:', c, re.IGNORECASE)]
            
            questions.append({
                'description': description,
                'question': question,
                'choices': choices,
                'answer': answer
            })
    
    return questions


In [4]:
file = pd.read_csv("generated_prompts_8000.csv")
file.head(1)

Unnamed: 0,generated_prompt
0,--- Sample 1 (a terraformed Mars colony with r...


In [16]:
first = parse_prompt_to_dicts(file.loc[0, "generated_prompt"])
first

[{'description': 'A terraformed Mars colony with red soil and bio-domes, where a lone astronaut gazes out at the rust-hued landscape.',
  'question': 'Is the astronaut in the image a male or a female?',
  'choices': ['Male', 'Female'],
  'answer': 'Female'},
 {'description': 'A terraformed Mars colony with red soil and bio-domes, where a lone astronaut gazes out at the rust-hued landscape.',
  'question': 'What is the astronaut looking at?',
  'choices': ['The horizon', 'A distant building', 'A strange rock formation'],
  'answer': 'The horizon'},
 {'description': 'A terraformed Mars colony with red soil and bio-domes, where a lone astronaut gazes out at the rust-hued landscape.',
  'question': 'What is the purpose of the bio',
  'choices': ['domes in the image?',
   'To grow crops',
   'To house Martian creatures',
   'To provide a safe environment for humans'],
  'answer': 'To provide a safe environment for humans'},
 {'description': 'A terraformed Mars colony with red soil and bio-d

In [34]:
columns = ["description", "question", "choices", "answer"]
final_df = pd.DataFrame(columns=columns)

rows = []
for raw_text in file["generated_prompt"].tolist():
    prompt_data = parse_prompt_to_dicts(raw_text)
    for row in prompt_data:
        if isinstance(row["answer"], str) and "none" in row["answer"].lower():
            row["answer"] = None
        rows.append(row)

final_df = pd.DataFrame(rows)

In [35]:
final_df.head(12)

Unnamed: 0,description,question,choices,answer
0,A terraformed Mars colony with red soil and bi...,Is the astronaut in the image a male or a female?,"[Male, Female]",Female
1,A terraformed Mars colony with red soil and bi...,What is the astronaut looking at?,"[The horizon, A distant building, A strange ro...",The horizon
2,A terraformed Mars colony with red soil and bi...,What is the purpose of the bio,"[domes in the image?, To grow crops, To house ...",To provide a safe environment for humans
3,A terraformed Mars colony with red soil and bi...,Is the terraformed Mars colony self,"[sustaining?, Yes, No]",Partially
4,Vibrant street lamps illuminate a bustling nig...,Is the night market located in a major city?,[[yes/no]],yes
5,Vibrant street lamps illuminate a bustling nig...,What type of fruit is the vendor selling?,"[[apples, durians, bananas, oranges]]",durians
6,Vibrant street lamps illuminate a bustling nig...,What is the material of the Buddha statue?,"[[wood, stone, bronze, gold]]",gold
7,Vibrant street lamps illuminate a bustling nig...,Is the market situated indoors or outdoors?,"[[indoors, outdoors, on a rooftop, underwater]]",outdoors
8,** A delicate faerie queen stands amidst a mos...,Is the faerie queen surrounded by a group of o...,"[Yes, No]",No
9,** A delicate faerie queen stands amidst a mos...,Is the fairy circle made of natural materials?,"[Yes, No, Man-made]",No


In [36]:
final_df.loc[8, 'description']

'** A delicate faerie queen stands amidst a mossy glade, surrounded by a shimmering fairy circle.'

process `description` column

In [37]:
final_df = final_df.dropna(subset=['description'])  # removes None/NaN
final_df = final_df[final_df['description'] != '']  # removes empty strings

In [38]:
for i in range(len(final_df)):
    description = final_df.iloc[i]['description']
    if "*" in description:
        description = description.strip("*") # removes any leading **
        description = description.strip() # removes any leading spaces
        final_df.loc[i, 'description'] = description

final_df.head(12)

Unnamed: 0,description,question,choices,answer
0,A terraformed Mars colony with red soil and bi...,Is the astronaut in the image a male or a female?,"[Male, Female]",Female
1,A terraformed Mars colony with red soil and bi...,What is the astronaut looking at?,"[The horizon, A distant building, A strange ro...",The horizon
2,A terraformed Mars colony with red soil and bi...,What is the purpose of the bio,"[domes in the image?, To grow crops, To house ...",To provide a safe environment for humans
3,A terraformed Mars colony with red soil and bi...,Is the terraformed Mars colony self,"[sustaining?, Yes, No]",Partially
4,Vibrant street lamps illuminate a bustling nig...,Is the night market located in a major city?,[[yes/no]],yes
5,Vibrant street lamps illuminate a bustling nig...,What type of fruit is the vendor selling?,"[[apples, durians, bananas, oranges]]",durians
6,Vibrant street lamps illuminate a bustling nig...,What is the material of the Buddha statue?,"[[wood, stone, bronze, gold]]",gold
7,Vibrant street lamps illuminate a bustling nig...,Is the market situated indoors or outdoors?,"[[indoors, outdoors, on a rooftop, underwater]]",outdoors
8,A delicate faerie queen stands amidst a mossy ...,Is the faerie queen surrounded by a group of o...,"[Yes, No]",No
9,A delicate faerie queen stands amidst a mossy ...,Is the fairy circle made of natural materials?,"[Yes, No, Man-made]",No


process `choices` column

In [39]:
final_df.dtypes

description    object
question       object
choices        object
answer         object
dtype: object

In [40]:
type(final_df.iloc[0]['choices'])

list

In [44]:
len(final_df.iloc[0]['choices'])

2

In [41]:
type(final_df.iloc[8]['choices'])

list

In [42]:
len(final_df.iloc[8]['choices'])

1

In [43]:
final_df.iloc[8]['choices'][0].split(',')

['Yes', ' No']

In [45]:
final_df = final_df.dropna(subset=['choices'])  # removes None/NaN

In [46]:
def clean_choices(choice_list):
    # Handle cases where input is already properly formatted
    if isinstance(choice_list, list) and all(isinstance(x, str) for x in choice_list):
        return choice_list
    
    # Convert string representation of list to actual list if needed
    if isinstance(choice_list, str):
        try:
            choice_list = ast.literal_eval(choice_list)
        except:
            choice_list = [choice_list]
    
    # Flatten nested lists (like [['yes/no']])
    flattened = []
    for item in choice_list:
        if isinstance(item, list):
            flattened.extend(item)
        else:
            flattened.append(item)
    
    # Split comma-separated strings (like ['Yes, No'])
    result = []
    for item in flattened:
        if isinstance(item, str):
            # Handle cases like "yes/no" or "Yes, No"
            if '/' in item:
                result.extend(x.strip() for x in item.split('/'))
            elif ',' in item:
                result.extend(x.strip() for x in item.split(','))
            else:
                result.append(item.strip())
        else:
            result.append(str(item))
    
    # Remove empty strings and duplicates
    result = [x for x in result if x]
    return list(dict.fromkeys(result))  # Preserves order while removing duplicates

In [47]:
# Apply to your DataFrame
final_df['choices'] = final_df['choices'].apply(clean_choices)

In [48]:
final_df.head(12)

Unnamed: 0,description,question,choices,answer
0,A terraformed Mars colony with red soil and bi...,Is the astronaut in the image a male or a female?,"[Male, Female]",Female
1,A terraformed Mars colony with red soil and bi...,What is the astronaut looking at?,"[The horizon, A distant building, A strange ro...",The horizon
2,A terraformed Mars colony with red soil and bi...,What is the purpose of the bio,"[domes in the image?, To grow crops, To house ...",To provide a safe environment for humans
3,A terraformed Mars colony with red soil and bi...,Is the terraformed Mars colony self,"[sustaining?, Yes, No]",Partially
4,Vibrant street lamps illuminate a bustling nig...,Is the night market located in a major city?,[[yes/no]],yes
5,Vibrant street lamps illuminate a bustling nig...,What type of fruit is the vendor selling?,"[[apples, durians, bananas, oranges]]",durians
6,Vibrant street lamps illuminate a bustling nig...,What is the material of the Buddha statue?,"[[wood, stone, bronze, gold]]",gold
7,Vibrant street lamps illuminate a bustling nig...,Is the market situated indoors or outdoors?,"[[indoors, outdoors, on a rooftop, underwater]]",outdoors
8,A delicate faerie queen stands amidst a mossy ...,Is the faerie queen surrounded by a group of o...,"[Yes, No]",No
9,A delicate faerie queen stands amidst a mossy ...,Is the fairy circle made of natural materials?,"[Yes, No, Man-made]",No


In [51]:
type(final_df.iloc[4]['choices'][0])

str

In [54]:
final_df.iloc[4]['choices']

['[yes/no]']

In [58]:
for i, row in final_df.iterrows():
    choices = row['choices']
    if choices == ['[yes/no]']:
        new_choices = choices[0][1:-1]
        new_choices = new_choices.split('/')
        final_df.at[i, 'choices'] = new_choices


final_df.head(12)

Unnamed: 0,description,question,choices,answer
0,A terraformed Mars colony with red soil and bi...,Is the astronaut in the image a male or a female?,"[Male, Female]",Female
1,A terraformed Mars colony with red soil and bi...,What is the astronaut looking at?,"[The horizon, A distant building, A strange ro...",The horizon
2,A terraformed Mars colony with red soil and bi...,What is the purpose of the bio,"[domes in the image?, To grow crops, To house ...",To provide a safe environment for humans
3,A terraformed Mars colony with red soil and bi...,Is the terraformed Mars colony self,"[sustaining?, Yes, No]",Partially
4,Vibrant street lamps illuminate a bustling nig...,Is the night market located in a major city?,"[yes, no]",yes
5,Vibrant street lamps illuminate a bustling nig...,What type of fruit is the vendor selling?,"[[apples, durians, bananas, oranges]]",durians
6,Vibrant street lamps illuminate a bustling nig...,What is the material of the Buddha statue?,"[[wood, stone, bronze, gold]]",gold
7,Vibrant street lamps illuminate a bustling nig...,Is the market situated indoors or outdoors?,"[[indoors, outdoors, on a rooftop, underwater]]",outdoors
8,A delicate faerie queen stands amidst a mossy ...,Is the faerie queen surrounded by a group of o...,"[Yes, No]",No
9,A delicate faerie queen stands amidst a mossy ...,Is the fairy circle made of natural materials?,"[Yes, No, Man-made]",No


In [59]:
final_df.to_csv("vqa8000.csv")

In [3]:
file = pd.read_csv('new_prompts.csv')

In [6]:
prompts = file['prompt'].to_list()

In [7]:
import re

In [22]:
pattern = r"Example 2:\s*(.*?)(?=Example\s*\d+:|$)"

match = re.search(pattern, prompts[0], re.DOTALL)
if match:
    example_2 = match.group(1).strip()
    print("Example 2 Content:\n", example_2)
else:
    print("Example 2 not found.")

Example 2 Content:
 Prompt: a forest at dusk
Questions:
1. What is the main setting of the image?
- choices: ["beach", "desert", "forest", "mountain"]
- answer: "forest"
2. Is there anything purple in the image?
- choices: ["no", "yes"]
- answer: "yes"
3. What time of day is suggested in the image?
- choices: ["dawn", "dusk", "midday", "midnight"]
- answer: "dusk"
4. What color is prominently featured in the image?
- choices: ["green", "orange", "purple", "white"]
- answer: "purple"
purple pyramids spiraling up from a blue sky

A:

JavaScript (ES6), Â 79 bytes
Saved 1 byte thanks to @Konrad.
(d=d.replace(/\s+/g, " ");d=d.replace(/\s+/g, " ");d=d.replace(/\s+/g, " ");


In [23]:
example_2

'Prompt: a forest at dusk\nQuestions:\n1. What is the main setting of the image?\n- choices: ["beach", "desert", "forest", "mountain"]\n- answer: "forest"\n2. Is there anything purple in the image?\n- choices: ["no", "yes"]\n- answer: "yes"\n3. What time of day is suggested in the image?\n- choices: ["dawn", "dusk", "midday", "midnight"]\n- answer: "dusk"\n4. What color is prominently featured in the image?\n- choices: ["green", "orange", "purple", "white"]\n- answer: "purple"\npurple pyramids spiraling up from a blue sky\n\nA:\n\nJavaScript (ES6), \xa079 bytes\nSaved 1 byte thanks to @Konrad.\n(d=d.replace(/\\s+/g, " ");d=d.replace(/\\s+/g, " ");d=d.replace(/\\s+/g, " ");'

In [24]:
row = []
example_2 = example_2.split('\n')
# append description
idx = example_2[0].index(':')
row.append(example_2[0][idx+2:])
# append question
questions = []
idx = example_2[2].index('.')
questions.append(example_2[2][idx+2:])
idx = example_2[5].index('.')
questions.append(example_2[5][idx+2:])
idx = example_2[8].index('.')
questions.append(example_2[8][idx+2:])
idx = example_2[11].index('.')
questions.append(example_2[11][idx+2:])
row.append(questions)
# append choices
choices = []
idx = example_2[3].index(':')
choices.append(ast.literal_eval(example_2[3][idx+2:]))
idx = example_2[6].index(':')
choices.append(ast.literal_eval(example_2[6][idx+2:]))
idx = example_2[9].index(':')
choices.append(ast.literal_eval(example_2[9][idx+2:]))
idx = example_2[12].index(':')
choices.append(ast.literal_eval(example_2[12][idx+2:]))
row.append(choices)
# append answers
answers = []
idx = example_2[4].index(':')
answers.append(example_2[4][idx+3:-1])
idx = example_2[7].index(':')
answers.append(example_2[7][idx+3:-1])
idx = example_2[10].index(':')
answers.append(example_2[10][idx+3:-1])
idx = example_2[13].index(':')
answers.append(example_2[13][idx+3:-1])
row.append(answers)
row

['a forest at dusk',
 ['What is the main setting of the image?',
  'Is there anything purple in the image?',
  'What time of day is suggested in the image?',
  'What color is prominently featured in the image?'],
 [['beach', 'desert', 'forest', 'mountain'],
  ['no', 'yes'],
  ['dawn', 'dusk', 'midday', 'midnight'],
  ['green', 'orange', 'purple', 'white']],
 ['forest', 'yes', 'dusk', 'purple']]

In [9]:
def cleaning(prompts):
    pattern = r"Example 2:\s*(.*?)(?=Example\s*\d+:|$)"
    outputs = []
    for text in prompts:
        match = re.search(pattern, text, re.DOTALL)
        if match:
            # .group(1) retrieves the first capturing group from the regex pattern (.*?) w/o including Example\s*\d+:|$
            example_2 = match.group(1).strip()
            outputs.append(example_2)
    return outputs

In [27]:
def processing(outputs):
    processed = []
    descriptions = []
    for example_2 in outputs:
        row = []
        example_2 = example_2.split('\n')
        # append description
        idx = example_2[0].index(':')
        description = example_2[0][idx+2:]
        if description not in descriptions:
            row.append(description)
            descriptions.append(description)
            # append question
            questions = []
            idx = example_2[2].index('.')
            questions.append(example_2[2][idx+2:])
            idx = example_2[5].index('.')
            questions.append(example_2[5][idx+2:])
            idx = example_2[8].index('.')
            questions.append(example_2[8][idx+2:])
            idx = example_2[11].index('.')
            questions.append(example_2[11][idx+2:])
            row.append(questions)
            # append choices
            choices = []
            idx = example_2[3].index(':')
            choices.append(ast.literal_eval(example_2[3][idx+2:]))
            idx = example_2[6].index(':')
            choices.append(ast.literal_eval(example_2[6][idx+2:]))
            idx = example_2[9].index(':')
            choices.append(ast.literal_eval(example_2[9][idx+2:]))
            idx = example_2[12].index(':')
            choices.append(ast.literal_eval(example_2[12][idx+2:]))
            row.append(choices)
            # append answers
            answers = []
            idx = example_2[4].index(':')
            answers.append(example_2[4][idx+3:-1])
            idx = example_2[7].index(':')
            answers.append(example_2[7][idx+3:-1])
            idx = example_2[10].index(':')
            answers.append(example_2[10][idx+3:-1])
            idx = example_2[13].index(':')
            answers.append(example_2[13][idx+3:-1])
            row.append(answers)
            processed.append(row)
    return pd.DataFrame(data=processed, columns=['description', 'question', 'choices', 'answer'])

In [28]:
extracted = cleaning(prompts)
clean_df = processing(extracted)
clean_df.head()

Unnamed: 0,description,question,choices,answer
0,a forest at dusk,"[What is the main setting of the image?, Is th...","[[beach, desert, forest, mountain], [no, yes],...","[forest, yes, dusk, purple]"
1,a blue sky,"[What is the main setting of the image?, Is th...","[[sky, sun, sunset, clouds], [no, yes], [morni...","[sky, yes, afternoon, purple]"
2,a purple forest at dusk,"[What is the main setting of the image?, Is th...","[[beach, desert, forest, mountain], [no, yes],...","[forest, yes, dusk, purple]"
3,a blue sky with a purple cloud,"[What is the main setting of the image?, Is th...","[[beach, desert, forest, mountain], [no, yes],...","[forest, yes, dusk, purple]"
4,a forest in the woods,"[What is the main setting of the image?, Is th...","[[desert, forest, mountain], [no, yes], [dawn,...","[desert, yes, dusk, purple]"


In [29]:
clean_df.shape

(9, 4)

In [30]:
# Step 1: zip question, choices, answer into a single column
clean_df["qa_group"] = clean_df.apply(lambda row: list(zip(row["question"], row["choices"], row["answer"])), axis=1)

# Step 2: explode that column
clean_df_expanded = clean_df.explode("qa_group")

# Step 3: split the tuple into separate columns
clean_df_expanded[["question", "choices", "answer"]] = pd.DataFrame(clean_df_expanded["qa_group"].tolist(), index=clean_df_expanded.index)

# Step 4: drop the temporary 'qa_group' column
clean_df_expanded = clean_df_expanded.drop(columns=["qa_group"])

# (Optional) reset index
clean_df_expanded = clean_df_expanded.reset_index(drop=True)

clean_df_expanded

Unnamed: 0,description,question,choices,answer
0,a forest at dusk,What is the main setting of the image?,"[beach, desert, forest, mountain]",forest
1,a forest at dusk,Is there anything purple in the image?,"[no, yes]",yes
2,a forest at dusk,What time of day is suggested in the image?,"[dawn, dusk, midday, midnight]",dusk
3,a forest at dusk,What color is prominently featured in the image?,"[green, orange, purple, white]",purple
4,a blue sky,What is the main setting of the image?,"[sky, sun, sunset, clouds]",sky
5,a blue sky,Is there anything blue in the image?,"[no, yes]",yes
6,a blue sky,What time of day is suggested in the image?,"[morning, afternoon, evening, night]",afternoon
7,a blue sky,What color is prominently featured in the image?,"[blue, green, orange, purple, white]",purple
8,a purple forest at dusk,What is the main setting of the image?,"[beach, desert, forest, mountain]",forest
9,a purple forest at dusk,Is there anything purple in the image?,"[no, yes]",yes
