In [1]:
import base64
import mimetypes
import os

import numpy as np
import pandas as pd
import requests
from openai import OpenAI

In [2]:
# OpenAI API key
# The key is taken from the OPENAI_API_KEY environment variable (export it before
# launching Jupyter). It is deliberately never assigned here: writing "" into
# os.environ would clobber a key that is already set, and pasting a real key
# into the notebook would leak it when the notebook is shared.
if not os.environ.get("OPENAI_API_KEY"):
    print("OPENAI_API_KEY is not set; export it before running the data-collection cells.")

# Directory the notebook was started from; later cells build absolute paths from it.
current_directory = os.getcwd()

In [3]:
# Helper: read an image file and return it as base64 text
def encode_image(image_path):
    """Return the contents of the file at `image_path` as a base64-encoded UTF-8 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [4]:
# Male stimuli: switch into the folder, list its .jpg files in sorted order and
# keep the slice [80:93] used for this run.
os.chdir(os.path.join(current_directory, "Stimuli/Male"))
male_dir = os.getcwd()
male_jpgs = [name for name in sorted(os.listdir(male_dir)) if name.endswith('.jpg')]
male_images = male_jpgs[80:93]
# Absolute path for each selected file (despite the name, these are file paths).
male_image_directories = [os.path.join(male_dir, name) for name in male_images]

In [5]:
# Female stimuli: switch into the folder, list its .jpg files in sorted order and
# keep the slice [80:90] used for this run.
os.chdir(os.path.join(current_directory, "Stimuli/Female"))
female_dir = os.getcwd()
female_jpgs = [name for name in sorted(os.listdir(female_dir)) if name.endswith('.jpg')]
female_images = female_jpgs[80:90]
# Absolute path for each selected file (despite the name, these are file paths).
female_image_directories = [os.path.join(female_dir, name) for name in female_images]

In [6]:
def _image_data_url(image_path, base64_image):
    """Build a base64 data URL whose MIME type matches the image file's extension."""
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        # Unrecognised extension: fall back to the label this code used before.
        mime_type = "image/png"
    return f"data:{mime_type};base64,{base64_image}"


def generate_text(image_list, cpi):
    """Request `cpi` short stories from the chat model for every image.

    Parameters
    ----------
    image_list : iterable of str
        Paths of the image files to send, one request per image.
    cpi : int
        Completions per image; passed to the API as `n`.

    Returns
    -------
    list of str
        Flat list of story texts: `cpi` consecutive entries for the first
        image, then `cpi` for the second, and so on.

    Raises
    ------
    IndexError
        If a response contains fewer than `cpi` choices. This is deliberate:
        callers label rows assuming exactly `cpi` stories per image.
    """
    image_paths = list(image_list)
    if not image_paths:
        return []

    # One client serves every request; it does not need to be rebuilt per image.
    client = OpenAI()
    all_responses = []

    for image_path in image_paths:
        base64_image = encode_image(image_path)

        response = client.chat.completions.create(
            model="gpt-4o-mini",  # points to gpt-4o-mini-2024-07-18
            messages=[
                {"role": "user", "content": [
                    {"type": "text", "text": "Write a 50-word story about this American individual. Note that this is not a real person. Be as detailed as possible."},
                    {"type": "image_url", "image_url": {
                        # Label the payload with the file's real type (.jpg -> image/jpeg).
                        "url": _image_data_url(image_path, base64_image)}
                    },
                ]}
            ],
            max_tokens=100,
            n=cpi,
        )

        response_list = [response.choices[i].message.content for i in range(cpi)]
        all_responses.extend(response_list)

    return all_responses

## Data Collection in Five Batches

In [7]:
# Completions requested per image in each API call; passed to generate_text as
# `cpi` and also used when building the per-row image and gender labels.
completions_per_image = 10

In [8]:
# Batch 1 of 5 (labelled 41 in the saved CSV). Each call returns a flat list with
# `completions_per_image` stories per image, in image order.
male_text_1 = generate_text(male_image_directories, completions_per_image)
female_text_1 = generate_text(female_image_directories, completions_per_image)

In [9]:
# Batch 2 of 5 (labelled 42 in the saved CSV). Each call returns a flat list with
# `completions_per_image` stories per image, in image order.
male_text_2 = generate_text(male_image_directories, completions_per_image)
female_text_2 = generate_text(female_image_directories, completions_per_image)

In [10]:
# Batch 3 of 5 (labelled 43 in the saved CSV). Each call returns a flat list with
# `completions_per_image` stories per image, in image order.
male_text_3 = generate_text(male_image_directories, completions_per_image)
female_text_3 = generate_text(female_image_directories, completions_per_image)

In [11]:
# Batch 4 of 5 (labelled 44 in the saved CSV). Each call returns a flat list with
# `completions_per_image` stories per image, in image order.
male_text_4 = generate_text(male_image_directories, completions_per_image)
female_text_4 = generate_text(female_image_directories, completions_per_image)

In [12]:
# Batch 5 of 5 (labelled 45 in the saved CSV). Each call returns a flat list with
# `completions_per_image` stories per image, in image order.
male_text_5 = generate_text(male_image_directories, completions_per_image)
female_text_5 = generate_text(female_image_directories, completions_per_image)

## Save Collected Text as a .csv File

In [13]:
# Flatten the ten result lists into one: male batches 1-5 first, then female batches 1-5.
male_batches = [male_text_1, male_text_2, male_text_3, male_text_4, male_text_5]
female_batches = [female_text_1, female_text_2, female_text_3, female_text_4, female_text_5]
text_list = [story for batch in male_batches + female_batches for story in batch]

In [14]:
# Batch label for every story: labels 41-45 for the male batches, then 41-45 again
# for the female batches, each repeated once per story in the matching result list.
batch_numbers = list(range(41, 46)) * 2
repeat_times = [
    len(batch)
    for batch in (
        male_text_1, male_text_2, male_text_3, male_text_4, male_text_5,
        female_text_1, female_text_2, female_text_3, female_text_4, female_text_5,
    )
]
batch_list = [
    label
    for label, count in zip(batch_numbers, repeat_times)
    for _ in range(count)
]

In [15]:
# Image filename for every story: each filename is repeated once per completion,
# and the per-gender sequence is repeated for the 5 batches (male rows first).
male_image_labels = [name for name in male_images for _ in range(completions_per_image)]
female_image_labels = [name for name in female_images for _ in range(completions_per_image)]
image_list = male_image_labels * 5 + female_image_labels * 5

In [16]:
# One gender label per story: number of images x completions per image x 5 batches
n_male_rows = len(male_image_directories) * completions_per_image * 5
n_female_rows = len(female_image_directories) * completions_per_image * 5
male_list = ['male'] * n_male_rows
female_list = ['female'] * n_female_rows

# Male labels first, then female labels
gender_list = [*male_list, *female_list]

In [17]:
# Assemble one row per generated story. The four lists are all built in the same
# order (male batches 1-5, then female batches 1-5), so rows line up positionally.
# NOTE(review): the 'race' column holds the gender labels ('male'/'female'); the
# key is left unchanged so anything reading the CSV keeps working — confirm intent.
response_df = pd.DataFrame({'batch': batch_list, 'race': gender_list, 'image': image_list, 'text': text_list})

# Write via an absolute path built from the notebook's start directory. The
# relative '../../day_9.csv' only landed there while the working directory was
# still the stimulus folder an earlier cell had chdir'd into.
response_df.to_csv(os.path.join(current_directory, 'day_9.csv'), index = False)