In [1]:
import pandas as pd
import numpy as np
from openai import OpenAI
import base64, os, requests, random

In [2]:
# Load the image lookup table and map every image ID to its condition label.
ganfd_df = pd.read_csv('../GANFD/image_lookup.csv')
ganfd_dict = {
    full_id: condition
    for full_id, condition in zip(ganfd_df['full_ID'], ganfd_df['condition'])
}

## Data Collection

In [3]:
MODEL_NAME = "gpt-4o-mini"

# Supply the key from outside the notebook (e.g. `export OPENAI_API_KEY=...` before
# launching Jupyter) instead of pasting it here, so the secret is never saved with
# the notebook. `setdefault` keeps an already-exported key intact; the blank
# placeholder is only installed when the variable is missing entirely.
os.environ.setdefault("OPENAI_API_KEY", "")

In [4]:
# System message sent with every request (two lines joined by a newline).
DEFAULT_SYSTEM = (
    "You are a helpful chat assistant. \n"
    "You are going to generate texts in response to images depicting fictional individuals."
)

In [5]:
# Text part of the user message that accompanies each image.
TEXT_PROMPT = (
    "Write a 50-word story "
    "about the individual inside the image."
)

In [6]:
# Function to encode the image
def encode_image(image_path):
    """Return the content of the file at ``image_path`` as a base64 text string.

    The file is opened in binary mode, its bytes are base64-encoded, and the
    encoded bytes are decoded as UTF-8 so the result is a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [7]:
def generate_text(image_dict, model, system_message, writing_prompt, *,
                  image_dir='../GANFD/Images/', n_completions=50, max_tokens=150,
                  client=None):
    """Generate text completions for every image in ``image_dict``.

    For each entry, the file ``<image_dir>/<image_key>.jpg`` is base64-encoded and
    sent together with ``writing_prompt`` in a single chat-completions request
    that asks for ``n_completions`` alternative answers.

    Args:
        image_dict: Mapping of image ID (file name without the ``.jpg`` extension)
            to the value recorded next to each response (the image's condition).
        model: Name of the chat model to query.
        system_message: Content of the system message.
        writing_prompt: Text part of the user message.
        image_dir: Directory that holds the ``.jpg`` files.
        n_completions: Number of completions requested per image.
        max_tokens: Token limit applied to each completion.
        client: Optional pre-built client exposing ``chat.completions.create``.
            When omitted, a default ``OpenAI()`` client is created.

    Returns:
        A tuple ``(image_list, condition_list, all_responses)`` of three equally
        long, row-aligned lists: the image ID, the matching value from
        ``image_dict``, and the generated text of each completion.
    """
    # Initialize API client (only when the caller did not inject one)
    if client is None:
        client = OpenAI()

    all_responses = []
    image_list = []
    condition_list = []

    for image_key, image_value in image_dict.items():

        image_path = os.path.join(image_dir, f"{image_key}.jpg")
        base64_image = encode_image(image_path)

        # The files are JPEGs, so the data URL must declare image/jpeg
        # (it previously claimed image/png).
        image_url = f"data:image/jpeg;base64,{base64_image}"

        # Generate completions
        completions = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": [
                    {"type": "text", "text": writing_prompt},
                    {"type": "image_url", "image_url": {"url": image_url}}
                ]}
            ],
            max_tokens=max_tokens,
            n=n_completions
        )

        # Extract and collect responses, repeating the image ID / condition once
        # per returned completion so the three lists stay aligned.
        response_list = [choice.message.content for choice in completions.choices]
        all_responses.extend(response_list)
        image_list.extend([image_key] * len(response_list))
        condition_list.extend([image_value] * len(response_list))

    return image_list, condition_list, all_responses

In [8]:
# Query the model for every image in the lookup table and unpack the three
# returned lists (image ID, condition, generated text).
image_column, condition_column, text_column = generate_text(
    image_dict=ganfd_dict,
    model=MODEL_NAME,
    system_message=DEFAULT_SYSTEM,
    writing_prompt=TEXT_PROMPT,
)

In [9]:
# One row per completion. The three lists must be equally long, otherwise
# pandas raises a ValueError while building the frame.
response_df = pd.DataFrame(
    data={
        'image': image_column,
        'condition': condition_column,
        'text': text_column,
    }
)

In [None]:
# Phrases whose presence marks a completion as non-compliant
non_compliance = [
    "Unable to identify",
    "Unable to view",
    "unable to provide",
    "cannot provide assumption",
    "cannot provide information",
    "make assumptions about",
    "won't speculate or create stories",
    "cannot see images",
    "cannot provide assistance",
    "can't provide assistance",
    "sorry",
    "Sorry",
    "fictional",
    "image",
    "cannot assist",
    "can't assist",
]

# Single alternation with each phrase wrapped in word boundaries. Phrases are
# inserted verbatim, so they must not contain regex metacharacters.
regex_pattern = '|'.join(f'\\b{phrase}\\b' for phrase in non_compliance)

# Flag matching rows once (case-insensitive; missing text counts as no match),
# then derive both the count and the filtered frame from the same mask.
is_non_compliant = response_df['text'].str.contains(regex_pattern, case=False, na=False)
non_compliance_count = is_non_compliant.sum()
data_filtered = response_df[~is_non_compliant]

# Report how many completions were removed
print(f"Number of completions dropped: {non_compliance_count}")

In [None]:
# Persist the retained completions; the row index is not written to the file.
data_filtered.to_csv(
    "gpt4omini.csv",
    index=False,
)