In [3]:
%reload_ext autoreload
%autoreload 2

import argparse
import re
from pathlib import Path
import pandas as pd
import os
import json
from build_stimulus_table import build_stimulus_table
from sample_experiment import generate_profile, generate_debug_profile, save_profile

In [4]:
# Filesystem layout.
# WORK comes from the environment when set; otherwise the current working
# directory is used as the project root.
WORK = Path(os.getenv("WORK", Path.cwd()))
DATA = WORK.joinpath("color-concept-entanglement", "data")
OBJ_IMG_ROOT = DATA.joinpath("color_images", "gpt-4o")

## Create stimulus table for each condition

### 1. Objects with color priors

In [5]:
# Stimulus table for the "image_priors" dataset (stimulus_type "correct_prior").
df_priors = build_stimulus_table(
    dataset_root=OBJ_IMG_ROOT / "image_priors",
    stimulus_type="correct_prior",
    data_root=DATA,
)
# Drop images colored with black (the filter is on target_color only).
df_priors = df_priors[df_priors["target_color"] != "black"]

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails on this cell.
priors_csv = DATA / "prolific_stimuli" / "stimulus_table_image_priors.csv"
priors_csv.parent.mkdir(parents=True, exist_ok=True)
df_priors.to_csv(priors_csv, index=False)

display(df_priors.head())
display(df_priors.value_counts("target_color"))

Unnamed: 0,object,stimulus_type,manipulation_color,target_color,variant_region,percent_colored,mode,variant_label,image_path
0,Band Aid,correct_prior,pink,white,BG,0,seq,BG 0% (seq),color_images/gpt-4o/image_priors/Band_Aid_1_e7...
1,Band Aid,correct_prior,pink,pink,BG,1,seq,BG 1% (seq),color_images/gpt-4o/image_priors/Band_Aid_1_e7...
2,Band Aid,correct_prior,pink,pink,BG,2,seq,BG 2% (seq),color_images/gpt-4o/image_priors/Band_Aid_1_e7...
3,Band Aid,correct_prior,pink,pink,BG,3,seq,BG 3% (seq),color_images/gpt-4o/image_priors/Band_Aid_1_e7...
4,Band Aid,correct_prior,pink,pink,BG,4,seq,BG 4% (seq),color_images/gpt-4o/image_priors/Band_Aid_1_e7...


target_color
brown     2832
green     1120
grey      1112
red       1074
blue       556
orange     520
yellow     440
white      398
pink       160
purple     120
Name: count, dtype: int64

### 2. Objects with counterfactual colors

In [6]:
# Stimulus table for the "counterfact" dataset (stimulus_type "counterfact").
df_cf = build_stimulus_table(
    dataset_root=OBJ_IMG_ROOT / "counterfact",
    stimulus_type="counterfact",
    data_root=DATA,
)
# Drop images colored with black (the filter is on target_color only).
df_cf = df_cf[df_cf["target_color"] != "black"]

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails on this cell.
cf_csv = DATA / "prolific_stimuli" / "stimulus_table_counterfact.csv"
cf_csv.parent.mkdir(parents=True, exist_ok=True)
df_cf.to_csv(cf_csv, index=False)

display(df_cf.head())
display(df_cf.value_counts("target_color"))

Unnamed: 0,object,stimulus_type,manipulation_color,target_color,variant_region,percent_colored,mode,variant_label,image_path
0,Band Aid,counterfact,purple,white,BG,0,seq,BG 0% (seq),color_images/gpt-4o/counterfact/Band_Aid_1_e73...
1,Band Aid,counterfact,purple,purple,BG,1,seq,BG 1% (seq),color_images/gpt-4o/counterfact/Band_Aid_1_e73...
2,Band Aid,counterfact,purple,purple,BG,2,seq,BG 2% (seq),color_images/gpt-4o/counterfact/Band_Aid_1_e73...
3,Band Aid,counterfact,purple,purple,BG,3,seq,BG 3% (seq),color_images/gpt-4o/counterfact/Band_Aid_1_e73...
4,Band Aid,counterfact,purple,purple,BG,4,seq,BG 4% (seq),color_images/gpt-4o/counterfact/Band_Aid_1_e73...


target_color
pink      1592
purple    1592
blue      1234
green     1156
yellow     998
red        794
orange     760
brown      434
white      430
Name: count, dtype: int64

### 3. Shapes

In [7]:
# Stimulus table for the colored-shape dataset (stimulus_type "shape").
df_shapes = build_stimulus_table(
    dataset_root=DATA / "shapes" / "shape_colored",
    stimulus_type="shape",
    data_root=DATA,
)
# Drop images colored with black.
# Only rows whose target_color is "black" are removed: the 0%-colored rows of
# the black variant (manipulation_color "black", target_color "white") remain,
# as the displayed head of the table shows.
df_shapes = df_shapes[df_shapes["target_color"] != "black"]

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails on this cell.
shapes_csv = DATA / "prolific_stimuli" / "stimulus_table_shapes.csv"
shapes_csv.parent.mkdir(parents=True, exist_ok=True)
df_shapes.to_csv(shapes_csv, index=False)

display(df_shapes.head())
display(df_shapes.value_counts("target_color"))

Unnamed: 0,object,stimulus_type,manipulation_color,target_color,variant_region,percent_colored,mode,variant_label,image_path
0,circle,shape,black,white,BG,0,seq,BG 0% (seq),shapes/shape_colored/circle_v0_black/BG_000_se...
21,circle,shape,black,white,FG,0,seq,FG 0% (seq),shapes/shape_colored/circle_v0_black/FG_000_se...
42,circle,shape,blue,white,BG,0,seq,BG 0% (seq),shapes/shape_colored/circle_v0_blue/BG_000_seq...
43,circle,shape,blue,blue,BG,1,seq,BG 1% (seq),shapes/shape_colored/circle_v0_blue/BG_001_seq...
44,circle,shape,blue,blue,BG,2,seq,BG 2% (seq),shapes/shape_colored/circle_v0_blue/BG_002_seq...


target_color
blue      1000
brown     1000
green     1000
grey      1000
orange    1000
pink      1000
purple    1000
red       1000
yellow    1000
white      500
Name: count, dtype: int64

## Create survey profiles

In [13]:
N_BASE_PROFILES = 37

# Every base profile is generated twice: once per introspection position.
# base_id is also passed as the seed, so both variants of a base profile are
# generated with the same seed.
profile_specs = [
    (base_id, introspection_pos)
    for base_id in range(N_BASE_PROFILES)
    for introspection_pos in ["first", "last"]
]

profiles = []
for base_id, introspection_pos in profile_specs:
    profile = generate_profile(
        df_priors=df_priors,
        df_cf=df_cf,
        df_shapes=df_shapes,
        seed=base_id,
        introspection_position=introspection_pos,
    )
    record = {
        "profile_id": f"{base_id}_{introspection_pos}",
        "base_id": base_id,
        "introspection_position": introspection_pos,
        "questions": profile,
    }
    profiles.append(record)

In [14]:
# Write each generated profile to its own JSON file.
out_dir = DATA / "prolific_stimuli" / "profiles"
# parents=True: also create "prolific_stimuli" when it does not exist yet, so
# this cell does not raise FileNotFoundError on a fresh data directory.
out_dir.mkdir(parents=True, exist_ok=True)

for p in profiles:
    out_path = out_dir / f"profile_{p['profile_id']}.json"
    # Explicit encoding keeps the files identical regardless of platform default.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(p, f, indent=2)

In [15]:
# Build a debug profile from the priors and shapes tables and store it next to
# the generated survey profiles.
debug_profile = generate_debug_profile(df_priors, df_shapes)
save_profile(debug_profile, out_dir / "debug_profile.json")
# test server.py with http://127.0.0.1:5000/?PROLIFIC_PID=DEBUG