In [1]:
import pandas as pd
import json

In [2]:
attr_df = pd.read_csv("data/CelebAMask-HQ/CelebA-HQ-Attribute.csv")
attr_df

Unnamed: 0,image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,0.jpg,-1,1,1,1,-1,-1,1,-1,-1,...,-1,1,-1,1,-1,-1,1,-1,-1,1
1,1.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,...,-1,1,-1,1,-1,-1,1,-1,-1,1
2,2.jpg,-1,-1,1,1,-1,-1,1,-1,-1,...,-1,1,-1,1,1,-1,1,-1,-1,1
3,3.jpg,-1,-1,1,-1,-1,-1,-1,1,1,...,-1,1,-1,1,-1,-1,1,-1,-1,1
4,4.jpg,-1,-1,-1,-1,-1,-1,1,-1,-1,...,-1,-1,1,-1,1,-1,-1,-1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29995.jpg,-1,-1,-1,-1,-1,-1,-1,1,-1,...,-1,1,-1,-1,-1,-1,-1,1,-1,-1
29996,29996.jpg,1,-1,-1,1,-1,-1,-1,1,-1,...,1,1,-1,-1,-1,-1,-1,-1,-1,1
29997,29997.jpg,-1,-1,-1,-1,-1,1,-1,-1,1,...,-1,-1,1,-1,-1,-1,-1,-1,-1,1
29998,29998.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,1,-1,-1,1,-1,-1,1


In [3]:
import random

TEMPLATES = [
    "A {age} {gender} who is {traits}.",
    "This is {article} {age} {gender} {traits}.",
    "Portrait of {article} {age} {gender} {traits}.",
    "A {age} {gender} with {traits}."
]

def generate_caption(row):
    # === 1. Gender & age ===
    gender = "man" if row["Male"] == 1 else "woman"
    age = "young" if row["Young"] == 1 else "older"
    article = "an" if age == "older" or gender.startswith(("a","e","i","o","u")) else "a"

    # === 2. Trait groups ===
    def pick(cols, negate=False, fmt=None):
        fmt = fmt or (lambda name: name.replace('_', ' ').lower())
        items = []
        for col in cols:
            val = row[col]
            if (val == 1 and not negate) or (val == -1 and negate):
                name = fmt(col)
                # Negative phrasing
                if negate:
                    name = f"without {name}"
                items.append(name)
        return items

    hair = pick(["Black_Hair","Blond_Hair","Brown_Hair","Gray_Hair"], fmt=lambda c: c.split('_')[0].lower()+" hair") \
         + pick(["Bald","Bangs","Straight_Hair","Wavy_Hair"], fmt=lambda c: c.replace('_',' ').lower())

    facial_hair = pick(["Mustache","Goatee","5_o_Clock_Shadow","Sideburns"], fmt=lambda c: c.replace('_',' ').lower())

    features = pick([
        "Arched_Eyebrows","Bushy_Eyebrows","Smiling","Big_Lips","Big_Nose","Pointy_Nose",
        "Narrow_Eyes","High_Cheekbones","Chubby","Double_Chin","Oval_Face","Pale_Skin",
        "Rosy_Cheeks","Bags_Under_Eyes","Mouth_Slightly_Open"
    ], fmt=lambda c: c.replace('_',' ').lower())

    accessories = pick([
        "Eyeglasses","Wearing_Hat","Wearing_Earrings",
        "Wearing_Lipstick","Wearing_Necklace","Wearing_Necktie"
    ], fmt=lambda c: c.replace('Wearing_','wearing ').replace('_',' ').lower())

    makeup    = ["heavy makeup"] if row["Heavy_Makeup"] == 1 else ["without heavy makeup"]

    # === 3. Consolidate & remove duplicates ===
    all_traits = []
    for grp in (hair, facial_hair, features, accessories, makeup):
        all_traits += grp
    # remove duplicates while preserving order
    seen = set(); traits = []
    for t in all_traits:
        if t not in seen:
            seen.add(t); traits.append(t)

    # === 4. If no positive features at all, add a generic phrase ===
    if not traits:
        traits = [f"{'without any distinguishing facial features'}"]

    # === 5. Build the trait string ===
    if len(traits) == 1:
        trait_str = traits[0]
    else:
        trait_str = ", ".join(traits[:-1]) + ", and " + traits[-1]

    # === 6. Pick a random template ===
    template = random.choice(TEMPLATES)
    caption = template.format(age=age, gender=gender, traits=trait_str, article=article)

    # Capitalize first letter
    return caption[0].upper() + caption[1:]

# Usage:
# celebahq_attr_df['caption'] = celebahq_attr_df.apply(generate_best_caption, axis=1)


In [4]:
captions = attr_df.drop(columns='image_id').apply(generate_caption, axis=1)
caption_df= pd.DataFrame(data=captions, columns=['caption'])
caption_df['image_id'] = attr_df['image_id'].copy()
# Add filepath column
caption_df['filepath'] = 'data/CelebAMask-HQ/CelebA-HQ-img/' + caption_df['image_id']
caption_df = caption_df[['image_id', 'filepath', 'caption']]

# Captioning

In [5]:
pd.set_option('display.max_colwidth', None)
caption_df

Unnamed: 0,image_id,filepath,caption
0,0.jpg,data/CelebAMask-HQ/CelebA-HQ-img/0.jpg,"Portrait of a young woman brown hair, wavy hair, arched eyebrows, bushy eyebrows, smiling, big lips, pointy nose, high cheekbones, bags under eyes, mouth slightly open, wearing lipstick, and heavy makeup."
1,1.jpg,data/CelebAMask-HQ/CelebA-HQ-img/1.jpg,"A young woman who is blond hair, wavy hair, arched eyebrows, smiling, mouth slightly open, wearing lipstick, and heavy makeup."
2,2.jpg,data/CelebAMask-HQ/CelebA-HQ-img/2.jpg,"Portrait of a young woman brown hair, wavy hair, smiling, big lips, high cheekbones, bags under eyes, wearing earrings, wearing lipstick, and without heavy makeup."
3,3.jpg,data/CelebAMask-HQ/CelebA-HQ-img/3.jpg,"A young woman who is black hair, wavy hair, bushy eyebrows, smiling, big nose, high cheekbones, mouth slightly open, wearing lipstick, and heavy makeup."
4,4.jpg,data/CelebAMask-HQ/CelebA-HQ-img/4.jpg,"This is a young woman brown hair, straight hair, big lips, mouth slightly open, wearing earrings, and without heavy makeup."
...,...,...,...
29995,29995.jpg,data/CelebAMask-HQ/CelebA-HQ-img/29995.jpg,"A older man who is mustache, goatee, smiling, big nose, chubby, double chin, mouth slightly open, eyeglasses, wearing necklace, and without heavy makeup."
29996,29996.jpg,data/CelebAMask-HQ/CelebA-HQ-img/29996.jpg,"This is a young man 5 o clock shadow, sideburns, bushy eyebrows, smiling, big nose, pointy nose, bags under eyes, and without heavy makeup."
29997,29997.jpg,data/CelebAMask-HQ/CelebA-HQ-img/29997.jpg,"A young woman with black hair, bangs, straight hair, and heavy makeup."
29998,29998.jpg,data/CelebAMask-HQ/CelebA-HQ-img/29998.jpg,"A young woman with brown hair, wavy hair, arched eyebrows, wearing lipstick, and heavy makeup."


In [6]:
# export caption dataframe
caption_df.to_csv('data/CelebAMask-HQ/CelebA-HQ-Captions.csv', index=False)

In [None]:
# laion_style_df = caption_df.rename(columns={'image_id': 'file_name', 'caption': 'text'})
# # Optional: ensure correct formatting (e.g., remove whitespace)
# laion_style_df['file_name'] = laion_style_df['file_name'].str.strip()
# laion_style_df['text'] = laion_style_df['text'].str.strip()


# laion_style_df.to_csv("data/CelebAMask-HQ/LAION-CelebA-HQ", index=False)

In [8]:
image_folder = "data/CelebAMask-HQ/CelebA-HQ-img"  # folder where images are stored
caption_df["filepath"] = caption_df["image_id"].apply(lambda x: f"{image_folder}/{x}")

# Prepare LAION-style entries (only filepath and caption)
records = caption_df[["filepath", "caption"]].rename(columns={"caption": "text"}).to_dict(orient="records")

# Save as JSONL
with open("data/CelebAMask-HQ/CelebAMask-HQ-LAION.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        json.dump(record, f)
        f.write("\n")