In [7]:
import pandas as pd
import json

In [8]:
attr_df = pd.read_csv("data/CelebA-RAW/CelebA-ATRRIBUTES-CSV.csv")
attr_df

Unnamed: 0,image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,...,-1,1,1,-1,1,-1,1,-1,-1,1
1,000002.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
2,000003.jpg,-1,-1,-1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,1
3,000004.jpg,-1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,-1,1,-1,1,-1,1,1,-1,1
4,000005.jpg,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202594,202595.jpg,-1,-1,1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1
202595,202596.jpg,-1,-1,-1,-1,-1,1,1,-1,-1,...,-1,1,1,-1,-1,-1,-1,-1,-1,1
202596,202597.jpg,-1,-1,-1,-1,-1,-1,-1,-1,1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
202597,202598.jpg,-1,1,1,-1,-1,-1,1,-1,1,...,-1,1,-1,1,1,-1,1,-1,-1,1


In [9]:
import random

TEMPLATES = [
    "A {age} {gender} who is {traits}.",
    "This is {article} {age} {gender} {traits}.",
    "Portrait of {article} {age} {gender} {traits}.",
    "A {age} {gender} with {traits}."
]

def generate_caption(row):
    # === 1. Gender & age ===
    gender = "man" if row["Male"] == 1 else "woman"
    age = "young" if row["Young"] == 1 else "older"
    article = "an" if age == "older" or gender.startswith(("a","e","i","o","u")) else "a"

    # === 2. Trait groups ===
    def pick(cols, negate=False, fmt=None):
        fmt = fmt or (lambda name: name.replace('_', ' ').lower())
        items = []
        for col in cols:
            val = row[col]
            if (val == 1 and not negate) or (val == -1 and negate):
                name = fmt(col)
                # Negative phrasing
                if negate:
                    name = f"without {name}"
                items.append(name)
        return items

    hair = pick(["Black_Hair","Blond_Hair","Brown_Hair","Gray_Hair"], fmt=lambda c: c.split('_')[0].lower()+" hair") \
         + pick(["Bald","Bangs","Straight_Hair","Wavy_Hair"], fmt=lambda c: c.replace('_',' ').lower())

    facial_hair = pick(["Mustache","Goatee","5_o_Clock_Shadow","Sideburns"], fmt=lambda c: c.replace('_',' ').lower())

    features = pick([
        "Arched_Eyebrows","Bushy_Eyebrows","Smiling","Big_Lips","Big_Nose","Pointy_Nose",
        "Narrow_Eyes","High_Cheekbones","Chubby","Double_Chin","Oval_Face","Pale_Skin",
        "Rosy_Cheeks","Bags_Under_Eyes","Mouth_Slightly_Open"
    ], fmt=lambda c: c.replace('_',' ').lower())

    accessories = pick([
        "Eyeglasses","Wearing_Hat","Wearing_Earrings",
        "Wearing_Lipstick","Wearing_Necklace","Wearing_Necktie"
    ], fmt=lambda c: c.replace('Wearing_','wearing ').replace('_',' ').lower())

    makeup    = ["heavy makeup"] if row["Heavy_Makeup"] == 1 else ["without heavy makeup"]

    # === 3. Consolidate & remove duplicates ===
    all_traits = []
    for grp in (hair, facial_hair, features, accessories, makeup):
        all_traits += grp
    # remove duplicates while preserving order
    seen = set(); traits = []
    for t in all_traits:
        if t not in seen:
            seen.add(t); traits.append(t)

    # === 4. If no positive features at all, add a generic phrase ===
    if not traits:
        traits = [f"{'without any distinguishing facial features'}"]

    # === 5. Build the trait string ===
    if len(traits) == 1:
        trait_str = traits[0]
    else:
        trait_str = ", ".join(traits[:-1]) + ", and " + traits[-1]

    # === 6. Pick a random template ===
    template = random.choice(TEMPLATES)
    caption = template.format(age=age, gender=gender, traits=trait_str, article=article)

    # Capitalize first letter
    return caption[0].upper() + caption[1:]

# Usage:
# celebahq_attr_df['caption'] = celebahq_attr_df.apply(generate_best_caption, axis=1)


In [None]:
captions = attr_df.drop(columns='image_id').apply(generate_caption, axis=1)
caption_df= pd.DataFrame(data=captions, columns=['caption'])
caption_df['image_id'] = attr_df['image_id'].copy()
# Add filepath column
# caption_df['filepath'] = 'data/CelebA/CelebA-img/' + caption_df['image_id']
caption_df = caption_df[['image_id', 'caption']]

# Captioning

In [11]:
pd.set_option('display.max_colwidth', None)
caption_df

Unnamed: 0,image_id,filepath,caption
0,000001.jpg,data/CelebAMask-HQ/CelebA-HQ-img/000001.jpg,"A young woman who is brown hair, straight hair, arched eyebrows, smiling, pointy nose, high cheekbones, mouth slightly open, wearing earrings, wearing lipstick, and heavy makeup."
1,000002.jpg,data/CelebAMask-HQ/CelebA-HQ-img/000002.jpg,"Portrait of a young woman brown hair, smiling, big nose, high cheekbones, bags under eyes, mouth slightly open, and without heavy makeup."
2,000003.jpg,data/CelebAMask-HQ/CelebA-HQ-img/000003.jpg,"Portrait of a young man wavy hair, big lips, pointy nose, narrow eyes, and without heavy makeup."
3,000004.jpg,data/CelebAMask-HQ/CelebA-HQ-img/000004.jpg,"Portrait of a young woman straight hair, pointy nose, wearing earrings, wearing lipstick, wearing necklace, and without heavy makeup."
4,000005.jpg,data/CelebAMask-HQ/CelebA-HQ-img/000005.jpg,"Portrait of a young woman arched eyebrows, big lips, pointy nose, narrow eyes, wearing lipstick, and heavy makeup."
...,...,...,...
202594,202595.jpg,data/CelebAMask-HQ/CelebA-HQ-img/202595.jpg,"A young woman with blond hair, big lips, wearing lipstick, and without heavy makeup."
202595,202596.jpg,data/CelebAMask-HQ/CelebA-HQ-img/202596.jpg,"Portrait of a young man blond hair, bangs, straight hair, smiling, big lips, narrow eyes, high cheekbones, pale skin, mouth slightly open, and without heavy makeup."
202596,202597.jpg,data/CelebAMask-HQ/CelebA-HQ-img/202597.jpg,"A young man who is black hair, smiling, high cheekbones, mouth slightly open, eyeglasses, and without heavy makeup."
202597,202598.jpg,data/CelebAMask-HQ/CelebA-HQ-img/202598.jpg,"A young woman with black hair, wavy hair, arched eyebrows, bushy eyebrows, smiling, big lips, pointy nose, high cheekbones, oval face, rosy cheeks, wearing earrings, wearing lipstick, and heavy makeup."


In [12]:
# export caption dataframe
caption_df.to_csv('data/CelebA-RAW/CelebA-Captions.csv', index=False)

In [13]:
image_folder = "data/CelebA-RAW/images"  # folder where images are stored
caption_df["filepath"] = caption_df["image_id"].apply(lambda x: f"{image_folder}/{x}")

# Prepare LAION-style entries (only filepath and caption)
records = caption_df[["filepath", "caption"]].rename(columns={"caption": "text"}).to_dict(orient="records")

# Save as JSONL
with open("data/CelebA-RAW/CelebA_LAION.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        json.dump(record, f)
        f.write("\n")