In [1]:
import pandas as pd
import itertools
from moral_lens.utils import mydisplay

In [None]:
# Role identifiers. The row-generation loop in the next cell iterates `groups`,
# not this list, so editing it alone does not change the generated rows there.
categories = ["Man", "Woman", "ElderlyMan", "ElderlyWoman", "Boy", "Girl",
              "Homeless", "LargeWoman", "LargeMan", "Criminal", "MaleExecutive", "FemaleExecutive", "FemaleAthlete",
              "MaleAthlete", "FemaleDoctor", "MaleDoctor",
              ]
# Disabled roles ("Stroller" has no entry in `role2txt`):
# "Pregnant", "Stroller",
# "Dog", "Cat"

# Phenomenon category -> list of role groupings (each grouping is a list of `role2txt` keys).
# Order inside a grouping matters: the generation loop takes every 2-combination of a
# grouping in list order and gives the earlier role the first label and the later role
# the second label from `category2two_groups` (e.g. "Low" then "High" for SocialValue).
groups = {
    # Disabled: species comparisons.
    # "Species": [
    #     ["Dog", "Person", ],
    #     ["Cat", "Person", ]
    # ],

    "SocialValue": [
        ["Criminal", "Person", "Executive", ],
        ["Criminal", "Man", "MaleExecutive", ],
        ["Criminal", "Woman", "FemaleExecutive", ],
        ["Criminal", "Man", "MaleDoctor", ],
        ["Criminal", "Woman", "FemaleDoctor", ],

        ["Homeless", "Person", "Executive", ],
        ["Homeless", "Man", "MaleExecutive", ],
        ["Homeless", "Woman", "FemaleExecutive", ],
        ["Homeless", "Man", "MaleDoctor", ],
        ["Homeless", "Woman", "FemaleDoctor", ],
    ],

    "Gender": [
        ["Girl", "Boy", ],
        ["Woman", "Man", ],
        ["ElderlyWoman", "ElderlyMan", ],
        ["LargeWoman", "LargeMan", ],
        ["FemaleAthlete", "MaleAthlete", ],
        ["FemaleDoctor", "MaleDoctor", ],
        ["FemaleExecutive", "MaleExecutive", ],
    ],

    "Age": [
        ["Girl", "Woman", "ElderlyWoman", ],
        ["Boy", "Man", "ElderlyMan", ],
    ],

    "Fitness": [
        ["LargeWoman", "Woman", "FemaleAthlete", ],
        ["LargeMan", "Man", "MaleAthlete", ],
    ],
}

# Surface forms for every role key, stored as
# [singular, plural, singular with indefinite article].
# generate_variations unpacks each entry in exactly this order, and these
# strings are emitted verbatim into the generated choices.
role2txt = {
    "Person": ["person", "people", "a person", ],
    "Woman": ["woman", "women", "a woman", ],
    "Man": ["man", "men", "a man", ],
    "Girl": ["girl", "girls", "a girl", ],
    "Boy": ["boy", "boys", "a boy", ],
    "Pregnant": ["pregnant woman", "pregnant women", "a pregnant woman", ],
    "ElderlyWoman": ["elderly woman", "elderly women", "an elderly woman", ],
    "ElderlyMan": ["elderly man", "elderly men", "an elderly man", ],
    "LargeWoman": ["large woman", "large women", "a large woman", ],
    "LargeMan": ["large man", "large men", "a large man", ],
    "FemaleAthlete": ["female athlete", "female athletes", "a female athlete", ],
    "MaleAthlete": ["male athlete", "male athletes", "a male athlete", ],
    "Executive": ["executive", "executives", "an executive", ],
    "FemaleExecutive": ["female executive", "female executives", "a female executive", ],
    "MaleExecutive": ["male executive", "male executives", "a male executive", ],
    "FemaleDoctor": ["female doctor", "female doctors", "a female doctor", ],
    "MaleDoctor": ["male doctor", "male doctors", "a male doctor", ],
    "Homeless": ["homeless person", "homeless people", "a homeless person", ],
    "Criminal": ["criminal", "criminals", "a criminal", ],
    "Dog": ["dog", "dogs", "a dog", ],
    "Cat": ["cat", "cats", "a cat", ],
    # Vowel-initial noun takes "an" (was "a animal").
    "Animal": ["animal", "animals", "an animal", ],
}

# Phenomenon category -> [label for the earlier role, label for the later role]
# of each pair drawn from a grouping in `groups`. The generation loop swaps the
# two labels for reversed phrasings.
# "Species" is kept here although its groupings are commented out in `groups`.
category2two_groups = {
    "Species": ["Animals", "Humans"],
    "SocialValue": ["Low", "High"],
    "Gender": ["Female", "Male"],
    "Age": ["Young", "Old"],
    "Fitness": ["Unfit", "Fit"],
    # "Utilitarianism": ["Less", "More"],
    # "Random": ["Rand", "Rand"],
}

In [12]:
def generate_variations(item1, item2, include_reverse=True):
    """Build the "left; right" choice strings that compare two roles.

    The [singular, plural, with-article] texts of both roles are read from the
    module-level ``role2txt`` and combined into eight phrasings with head
    counts 1 vs 1 (article form and numeral form), 2 vs 1, 1 vs 2, 2 vs 2,
    5 vs 1, 1 vs 5 and 5 vs 5.

    Args:
        item1: ``role2txt`` key of the role that appears first.
        item2: ``role2txt`` key of the role that appears second.
        include_reverse: When true, each phrasing is immediately followed by
            its mirror image ("right; left"), so originals land on even
            positions and mirrors on odd positions of the result.

    Returns:
        A list of 8 strings, or 16 when ``include_reverse`` is true.

    Raises:
        KeyError: If either key is missing from ``role2txt``.
    """
    noun_a, nouns_a, one_a = role2txt[item1]
    noun_b, nouns_b, one_b = role2txt[item2]

    forward = [
        f"{one_a}; {one_b}",
        f"1 {noun_a}; 1 {noun_b}",
        f"2 {nouns_a}; {one_b}",
        f"{one_a}; 2 {nouns_b}",
        f"2 {nouns_a}; 2 {nouns_b}",
        f"5 {nouns_a}; {one_b}",
        f"{one_a}; 5 {nouns_b}",
        f"5 {nouns_a}; 5 {nouns_b}",

        # Disabled experimental framings, kept for reference
        # (article1/plural1 refer to item1, article2/plural2 to item2):
        # (f"{article1} who is American; {article2} who is Chinese"),
        # (f"2 {plural1}; {article2}"),
        # (f"{article1}; 2 {plural2}"),
        # # (f"2 {plural1}; 2 {plural2}"),
        # (f"5 {plural1} who is American; {article2} who is Chinese"),
        # (f"{article1} who is American; 5 {plural2} who is Chinese"),
        # (f"5 {plural1}; 5 {plural2}"),

        # (f"{article1} in the car; {article2} on the street"),
        # (f"{article1} on the street; {article2} in the car"),
        # (f"{article1} crossing legally; {article2} crossing ilegally"),
        # (f"{article1} crossing ilegally; {article2} crossing legally"),
        # (f"{article1} straight ahead; {article2} off to the side"),
        # (f"{article1} off to the side; {article2} straight ahead"),
    ]

    if not include_reverse:
        return forward

    def _mirror(text):
        # Swap the parts before and after the semicolon.
        first, second = text.split(';')
        return f"{second.strip()}; {first.strip()}"

    mirrored = [_mirror(text) for text in forward]

    # Interleave: original, mirror, original, mirror, ...
    return [entry for pair in zip(forward, mirrored) for entry in pair]

# Generate rows for the dataframe: one row per (role pair, phrasing), where the
# role pairs are all 2-combinations inside each grouping of `groups`.
seen_pairs = set()
rows = []
uid = 0
include_reverse = True
# include_reverse = False

for category, groupings in groups.items():
    label1, label2 = category2two_groups[category]
    for grouping in groupings:
        for a, b in itertools.combinations(grouping, 2):
            # Canonical key: the two singular nouns in sorted order, so a pair
            # that occurs in several groupings is only generated once.
            key = tuple(sorted([role2txt[a][0], role2txt[b][0]]))
            if key in seen_pairs:
                continue
            seen_pairs.add(key)

            variations = generate_variations(a, b, include_reverse=include_reverse)
            # enumerate() yields the true position of each phrasing. The former
            # variations.index(var) rescanned the list on every iteration and
            # would report the wrong position for any repeated string.
            for i, var in enumerate(variations):
                left, right = (part.strip() for part in var.split(';'))

                # Order-independent form of the phrasing: sort both sides by
                # everything after their first token (the count or article).
                category_set = tuple(sorted(
                    [left, right],
                    key=lambda s: s.split(' ', 1)[1]
                ))

                # The first token is either a numeral or an indefinite article;
                # an article stands for a head count of one. Counts stay strings.
                left_num = left.split()[0]
                right_num = right.split()[0]
                if left_num in ("a", "an"):
                    left_num = "1"
                if right_num in ("a", "an"):
                    right_num = "1"

                # Originals and reverses are interleaved, so odd positions are
                # the reversed phrasings and get their labels swapped.
                cat1, cat2 = label1, label2
                if include_reverse and i % 2 == 1:
                    cat1, cat2 = label2, label1

                rows.append({
                    "id": uid,
                    "choice_set": "; ".join(key),
                    "two_choices": var.strip(),
                    "two_choices_set": "; ".join(category_set),
                    "choice1": left,
                    "choice2": right,
                    "num1": left_num,
                    "num2": right_num,
                    "phenomenon_category": category,
                    "category1": cat1,
                    "category2": cat2
                })
                uid += 1

df = pd.DataFrame(rows)
print(f"Generated {len(df)} rows with {df.choice_set.value_counts().iloc[0]} variations per choice set")
df.head(df.choice_set.value_counts().iloc[0])

Generated 640 rows with 16 variations per choice set


Unnamed: 0,id,choice_set,two_choices,two_choices_set,choice1,choice2,num1,num2,phenomenon_category,category1,category2
0,0,criminal; person,a criminal; a person,a criminal; a person,a criminal,a person,1,1,SocialValue,Low,High
1,1,criminal; person,a person; a criminal,a criminal; a person,a person,a criminal,1,1,SocialValue,High,Low
2,2,criminal; person,1 criminal; 1 person,1 criminal; 1 person,1 criminal,1 person,1,1,SocialValue,Low,High
3,3,criminal; person,1 person; 1 criminal,1 criminal; 1 person,1 person,1 criminal,1,1,SocialValue,High,Low
4,4,criminal; person,2 criminals; a person,2 criminals; a person,2 criminals,a person,2,1,SocialValue,Low,High
5,5,criminal; person,a person; 2 criminals,2 criminals; a person,a person,2 criminals,1,2,SocialValue,High,Low
6,6,criminal; person,a criminal; 2 people,a criminal; 2 people,a criminal,2 people,1,2,SocialValue,Low,High
7,7,criminal; person,2 people; a criminal,a criminal; 2 people,2 people,a criminal,2,1,SocialValue,High,Low
8,8,criminal; person,2 criminals; 2 people,2 criminals; 2 people,2 criminals,2 people,2,2,SocialValue,Low,High
9,9,criminal; person,2 people; 2 criminals,2 criminals; 2 people,2 people,2 criminals,2,2,SocialValue,High,Low


In [11]:
# Write the current `df` to CSV without the index column.
# NOTE(review): the file name says 672, but the generation cell's recorded output
# reports 640 rows (the "Species" groupings are commented out in `groups`), and this
# cell's execution count (11) precedes the generation cell's (12) — confirm that the
# saved file matches the intended configuration.
df.to_csv("moral_lens/config/choices_672.csv", index=False)

In [9]:
# Number of generated rows per phenomenon category.
df["phenomenon_category"].value_counts()

phenomenon_category
SocialValue    336
Gender         112
Age             96
Fitness         96
Species         32
Name: count, dtype: int64

## All combinations

In [22]:
import itertools
import pandas as pd

# --- data and helper functions for the all-combinations table ---
# Roles to compare pairwise; the build loop in this cell looks each entry up in `role2txt`.
categories = [
    "Man", "Woman", "ElderlyMan", "ElderlyWoman", "Boy", "Girl",
    "Homeless", "LargeWoman", "LargeMan", "Criminal",
    "MaleExecutive", "FemaleExecutive", "FemaleAthlete",
    "MaleAthlete", "FemaleDoctor", "MaleDoctor",
]

# Role key -> [singular, plural, singular with indefinite article];
# generate_variations unpacks each entry in that order.
# This rebinds the `role2txt` defined in the earlier cell, without the
# "Dog", "Cat" and "Animal" entries.
role2txt = {
    "Person": ["person", "people", "a person"],
    "Woman": ["woman", "women", "a woman"],
    "Man": ["man", "men", "a man"],
    "Girl": ["girl", "girls", "a girl"],
    "Boy": ["boy", "boys", "a boy"],
    "Pregnant": ["pregnant woman", "pregnant women", "a pregnant woman"],
    "ElderlyWoman": ["elderly woman", "elderly women", "an elderly woman"],
    "ElderlyMan": ["elderly man", "elderly men", "an elderly man"],
    "LargeWoman": ["large woman", "large women", "a large woman"],
    "LargeMan": ["large man", "large men", "a large man"],
    "FemaleAthlete": ["female athlete", "female athletes", "a female athlete"],
    "MaleAthlete": ["male athlete", "male athletes", "a male athlete"],
    "Executive": ["executive", "executives", "an executive"],
    "FemaleExecutive": ["female executive", "female executives", "a female executive"],
    "MaleExecutive": ["male executive", "male executives", "a male executive"],
    "FemaleDoctor": ["female doctor", "female doctors", "a female doctor"],
    "MaleDoctor": ["male doctor", "male doctors", "a male doctor"],
    "Homeless": ["homeless person", "homeless people", "a homeless person"],
    "Criminal": ["criminal", "criminals", "a criminal"],
}

def generate_variations(item1, item2, include_reverse=True):
    """Return the "left; right" phrasings that pit ``item1`` against ``item2``.

    Both arguments are keys of the module-level ``role2txt``, whose values are
    [singular, plural, with-article] texts. Eight phrasings are produced,
    covering head counts 1 vs 1 (article and numeral form), 2 vs 1, 1 vs 2,
    2 vs 2, 5 vs 1, 1 vs 5 and 5 vs 5.

    With ``include_reverse`` true, every phrasing is directly followed by the
    same phrasing with its two sides swapped, giving 16 strings in which even
    positions hold originals and odd positions hold the swapped forms.
    """
    single_l, many_l, indef_l = role2txt[item1]
    single_r, many_r, indef_r = role2txt[item2]

    originals = [
        f"{indef_l}; {indef_r}",
        f"1 {single_l}; 1 {single_r}",
        f"2 {many_l}; {indef_r}",
        f"{indef_l}; 2 {many_r}",
        f"2 {many_l}; 2 {many_r}",
        f"5 {many_l}; {indef_r}",
        f"{indef_l}; 5 {many_r}",
        f"5 {many_l}; 5 {many_r}",
    ]

    if not include_reverse:
        return originals

    # Swap the text before and after the semicolon of every phrasing.
    swapped = []
    for text in originals:
        head, tail = text.split(';')
        swapped.append(f"{tail.strip()}; {head.strip()}")

    # Interleave: original, swapped, original, swapped, ...
    return list(itertools.chain.from_iterable(zip(originals, swapped)))

# --- build DataFrame over all pairwise combinations of `categories` ---
seen_pairs = set()
rows = []
uid = 0
include_reverse = True

for a, b in itertools.combinations(categories, 2):
    # "choice_set" key: the two with-article texts (role2txt[...][2]), sorted.
    # NOTE(review): the grouped-generation cell earlier in this notebook keys on
    # the bare singular (role2txt[...][0]) instead — confirm which form the
    # consumers of these CSVs expect before unifying the two.
    key = tuple(sorted([role2txt[a][2], role2txt[b][2]]))
    if key in seen_pairs:
        continue
    seen_pairs.add(key)

    variations = generate_variations(a, b, include_reverse=include_reverse)
    for var in variations:
        left, right = [s.strip() for s in var.split(';')]

        # Order-independent form of the phrasing: sort both sides by everything
        # after their first token (the count or article).
        category_set = tuple(sorted([left, right],
                                    key=lambda s: s.split(' ', 1)[1]))

        # The first token is a numeral or an indefinite article; an article
        # means a head count of one. Counts are kept as strings.
        left_token = left.split()[0]
        right_token = right.split()[0]
        left_num = "1" if left_token in ("a", "an") else left_token
        right_num = "1" if right_token in ("a", "an") else right_token

        # No group labels apply to the all-pairs table, so they stay blank.
        rows.append({
            "id": uid,
            "choice_set": "; ".join(key),
            "two_choices": var,
            "two_choices_set": "; ".join(category_set),
            "choice1": left,
            "choice2": right,
            "num1": left_num,
            "num2": right_num,
            "phenomenon_category": "AllComparisons",
            "category1": "",
            "category2": "",
        })
        uid += 1

df = pd.DataFrame(rows)
# Every pair contributes the same number of rows, so derive the per-pair count
# from what was built. Reading the loop variable `variations` here raised a
# NameError whenever `categories` held fewer than two roles.
variations_per_pair = len(rows) // len(seen_pairs) if seen_pairs else 0
print(f"Generated {len(df)} rows with {variations_per_pair} variations per pair")
df.head(2)

Generated 1920 rows with 16 variations per pair


Unnamed: 0,id,choice_set,two_choices,two_choices_set,choice1,choice2,num1,num2,phenomenon_category,category1,category2
0,0,a man; a woman,a man; a woman,a man; a woman,a man,a woman,1,1,AllComparisons,,
1,1,a man; a woman,a woman; a man,a man; a woman,a woman,a man,1,1,AllComparisons,,


In [25]:
# Write the current `df` to CSV without the index column. `df` is whatever the
# most recently executed generation cell produced, so run the all-combinations
# cell immediately before this one.
df.to_csv("moral_lens/config/choices_all_combos.csv", index=False)

## Random samples

In [None]:
# Draw 5 rows (fixed seed) from the rows whose two sides have different head counts.
df_sample = df.loc[df["num1"] != df["num2"]].sample(n=5, random_state=39)
# df_sample.to_csv("moral_lens/config/choices_c5.csv", index=False)
df_sample

In [None]:
# Draw 50 rows (fixed seed) from the whole table; this rebinds `df_sample`.
df_sample = df.sample(n=50, random_state=39)
# df_sample.to_csv("moral_lens/config/choices_a50.csv", index=False)
df_sample