<a href="https://colab.research.google.com/github/janewarren/ai_alt_text/blob/main/alt_text_analysis_pt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Plots

## ChatGPT

In [None]:
# Mount Google Drive at /content/drive; the CSV inputs and figure output
# directories used by the cells below all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Marker sizes passed as `s=` to plt.scatter in the plotting cells below.
sz1 = 12  # individual caption points
sz2 = 60  # centroid markers (used by the Claude and Gemini cells)

In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

# Input CSV with the ChatGPT captions on the mounted Drive.
CSV_PATH = "/content/drive/MyDrive/image_captioning_bias/chatgpt/captions.csv"
# Lookup tables for the two integers parsed from the start of each Filename
# (see load_and_prepare): the first selects a gender label, the second a
# skin label.
GENDER_MAP = {1: "male", 2: "female"}
RACE_MAP = {1: "skin1", 2: "skin2", 3: "skin3", 4: "skin4", 5: "skin5", 6: "skin6"}
# Seed words, used both for exact-token lexicon counts and as the seeds whose
# embeddings are averaged into the warmth / competence centroids.
WARMTH_LEXICON = {"caring","nurturing","friendly","warm","kind","gentle","compassionate","loving","supportive","affectionate"}
COMPETENCE_LEXICON = {"skilled","competent","expert","talented","proficient","capable","strong","efficient","accomplished"}
# Sentence-embedding model shared by every encode call in this cell; it is
# loaded as soon as the cell runs.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBED_MODEL_NAME)

def find_text_column(df):
    """Return the label of the column in ``df`` most likely to hold captions.

    Well-known caption column names are tried in priority order, each first
    as an exact match and then case-insensitively. If none match, the
    string-like column with the greatest mean value length is returned.

    Args:
        df: DataFrame to inspect.

    Returns:
        The matching column label exactly as it appears in ``df.columns``.

    Raises:
        KeyError: If no candidate name matches and ``df`` has no string-like
            columns.
    """
    candidates = ["caption","Caption","captions","text","Text","caption_text","description","Description"]
    # Built once instead of once per candidate. Labels go through str() so a
    # non-string label (e.g. an integer header) cannot break the lookup.
    lowcols = {str(col).lower(): col for col in df.columns}
    for c in candidates:
        if c in df.columns:
            return c
        if c.lower() in lowcols:
            return lowcols[c.lower()]
    # Fallback heuristic: captions are assumed to be the longest text column.
    string_cols = [col for col in df.columns if df[col].dtype == object or pd.api.types.is_string_dtype(df[col])]
    if not string_cols:
        raise KeyError("No string-like columns found")
    avg_len = {col: df[col].astype(str).map(len).mean() for col in string_cols}
    return max(avg_len, key=avg_len.get)

def extract_numbers(filename):
    """Parse the two leading underscore-separated integers of a filename.

    Args:
        filename: Any value; it is converted with ``str()`` before parsing.

    Returns:
        ``(first, second)`` as ints when the text has at least two
        underscore-separated fields and the first two are integers,
        otherwise ``(None, None)``.
    """
    parts = str(filename).split('_')
    if len(parts) < 2:
        return None, None
    try:
        return int(parts[0]), int(parts[1])
    except ValueError:
        # int() on a str can only raise ValueError, so catching just that
        # keeps unrelated errors visible.
        return None, None

def load_and_prepare(csv_path):
    """Load a captions CSV and split it into gender x skin sub-frames.

    The detected text column is renamed to ``caption``. The two integers at
    the start of each ``Filename`` are stored as ``first_num`` and
    ``second_num`` and mapped to ``gender_label`` / ``race_label``
    (``"unknown"`` when unmapped).

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        ``(df, split_dataframes)``: the full annotated frame, and a dict
        keyed ``"df_<first>_<second>"`` for first in 1..2 and second in 1..6.
        Each value is the matching rows with a fresh index and a constant
        ``group_label`` column (``""`` for an empty group).
    """
    raw = pd.read_csv(csv_path)
    caption_col = find_text_column(raw)
    df = raw.copy()
    df.rename(columns={caption_col: "caption"}, inplace=True)

    parsed = df['Filename'].apply(lambda fname: pd.Series(extract_numbers(fname)))
    df[['first_num','second_num']] = parsed
    df['gender_label'] = df['first_num'].map(GENDER_MAP).fillna("unknown")
    df['race_label'] = df['second_num'].map(RACE_MAP).fillna("unknown")

    split_dataframes = {}
    for first in (1, 2):
        for second in range(1, 7):
            mask = (df['first_num']==first) & (df['second_num']==second)
            sdf = df[mask].copy().reset_index(drop=True)
            if sdf.empty:
                label = ""
            else:
                # All rows share the same labels, so the first row is enough.
                label = f"{sdf['gender_label'].iloc[0]}_{sdf['race_label'].iloc[0]}"
            sdf['group_label'] = label
            split_dataframes[f"df_{first}_{second}"] = sdf
    return df, split_dataframes

def compute_caption_embeddings(captions, batch_size=64):
    """Encode captions with the module-level ``model`` and L2-normalize rows.

    Args:
        captions: Iterable of caption texts.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        Array of embeddings with each row divided by its Euclidean norm.
        Rows whose norm is zero are left as all zeros.
    """
    sentences = list(captions)
    raw_vectors = model.encode(sentences, convert_to_numpy=True, show_progress_bar=False, batch_size=batch_size)
    row_norms = np.linalg.norm(raw_vectors, axis=1, keepdims=True)
    # Divide zero-norm rows by 1 instead of 0 so they do not turn into NaN.
    zero_rows = row_norms == 0
    row_norms[zero_rows] = 1.0
    return raw_vectors / row_norms

def compute_embedding_centroid(seeds):
    """Return the normalized mean embedding of the seed texts.

    Args:
        seeds: Non-empty sequence of seed words or phrases.

    Returns:
        The mean of the seed embeddings divided by its Euclidean norm, or
        the unscaled mean if that norm is zero.

    Raises:
        ValueError: If ``seeds`` is empty.
    """
    if not seeds:
        raise ValueError("Empty seeds")
    seed_vectors = model.encode(seeds, convert_to_numpy=True, show_progress_bar=False)
    mean_vector = seed_vectors.mean(axis=0)
    length = np.linalg.norm(mean_vector)
    # A zero-length mean cannot be normalized; hand it back unchanged.
    return mean_vector / length if length > 0 else mean_vector

def cos_sim_to_centroid(caption_embs, centroid):
    """Return the dot product of every caption embedding with ``centroid``.

    The result equals cosine similarity only when both inputs are already
    unit length; no normalization is done here.
    """
    similarities = np.dot(caption_embs, centroid)
    return similarities

def simple_tokenize(text):
    """Lower-case ``text`` and split it into whitespace-separated tokens.

    Every character other than a word character, whitespace or an
    apostrophe is replaced by a space before splitting. Missing values (as
    judged by ``pd.isna``) yield an empty list.
    """
    if pd.isna(text):
        return []
    lowered = str(text).lower()
    cleaned = re.sub(r"[^\w\s']", " ", lowered)
    # str.split() with no argument never yields empty strings.
    return cleaned.split()
def lexicon_scores_for_caption(caption, warmth_lex=WARMTH_LEXICON, comp_lex=COMPETENCE_LEXICON):
    """Count warmth and competence lexicon hits in one caption.

    Args:
        caption: Caption text (tokenized with ``simple_tokenize``).
        warmth_lex: Collection of warmth words matched by exact token.
        comp_lex: Collection of competence words matched by exact token.

    Returns:
        Dict with ``word_count``, ``warmth_count``, ``competence_count``,
        ``warmth_rate`` and ``competence_rate``; both rates are 0.0 for a
        caption with no tokens.
    """
    tokens = simple_tokenize(caption)
    n_tokens = len(tokens)
    n_warm = len([tok for tok in tokens if tok in warmth_lex])
    n_comp = len([tok for tok in tokens if tok in comp_lex])
    if n_tokens:
        warmth_rate = n_warm / n_tokens
        competence_rate = n_comp / n_tokens
    else:
        # Avoid dividing by zero for empty or missing captions.
        warmth_rate = 0.0
        competence_rate = 0.0
    return {
        "word_count": n_tokens,
        "warmth_count": n_warm,
        "competence_count": n_comp,
        "warmth_rate": warmth_rate,
        "competence_rate": competence_rate,
    }
def analyze_group_df(group_df, warmth_seeds=list(WARMTH_LEXICON), comp_seeds=list(COMPETENCE_LEXICON)):
    """Add lexicon counts and embedding similarities to one group's captions.

    Args:
        group_df: DataFrame with a ``caption`` column, or ``None``.
        warmth_seeds: Seed words for the warmth centroid.
        comp_seeds: Seed words for the competence centroid.

    Returns:
        ``(out, cap_embs)``. ``out`` is ``group_df`` with a fresh index plus
        the lexicon score columns and ``warmth_emb_sim`` /
        ``competence_emb_sim``; ``cap_embs`` holds the normalized caption
        embeddings. For a ``None`` or zero-length input the result is
        ``(pd.DataFrame(), None)``.
    """
    if group_df is None or len(group_df)==0:
        return pd.DataFrame(), None

    captions = group_df['caption']
    lex_df = pd.DataFrame([lexicon_scores_for_caption(text) for text in captions])
    cap_embs = compute_caption_embeddings(captions.tolist())

    # Centroids are recomputed on every call from the seed lists.
    warmth_centroid = compute_embedding_centroid(warmth_seeds)
    comp_centroid = compute_embedding_centroid(comp_seeds)

    # Resetting the index lines the rows up with lex_df's default index.
    out = pd.concat([group_df.copy().reset_index(drop=True), lex_df], axis=1)
    out['warmth_emb_sim'] = cos_sim_to_centroid(cap_embs, warmth_centroid)
    out['competence_emb_sim'] = cos_sim_to_centroid(cap_embs, comp_centroid)
    return out, cap_embs
# ChatGPT pipeline: score every gender x skin group, then draw and save two
# warmth-vs-competence scatter plots (by gender, and by pooled skin group).
if __name__ == "__main__":
    base_df, split_dataframes = load_and_prepare(CSV_PATH)
    analysis_results = {}
    embeddings_store = {}
    # Score each group; empty groups get placeholder entries so that every
    # "df_<first>_<second>" key exists in both dicts.
    for name, sdf in split_dataframes.items():
        if sdf is None or sdf.empty:
            analysis_results[name] = pd.DataFrame()
            embeddings_store[name] = None
            continue
        res_df, embs = analyze_group_df(sdf)
        analysis_results[name] = res_df
        embeddings_store[name] = embs

    # Pool the six skin groups per gender (first number 1 = male, 2 = female).
    male_all = pd.concat([analysis_results.get(f"df_1_{i}", pd.DataFrame()) for i in range(1,7)], ignore_index=True)
    female_all = pd.concat([analysis_results.get(f"df_2_{i}", pd.DataFrame()) for i in range(1,7)], ignore_index=True)
    # Drop rows lacking either similarity score.
    male_all = male_all.dropna(subset=['warmth_emb_sim','competence_emb_sim'])
    female_all = female_all.dropna(subset=['warmth_emb_sim','competence_emb_sim'])
    # NOTE(review): these two centroids are not read again in this block;
    # analyze_group_df computes its own.
    warmth_centroid = compute_embedding_centroid(list(WARMTH_LEXICON))
    comp_centroid = compute_embedding_centroid(list(COMPETENCE_LEXICON))

    outdir = "/content/drive/MyDrive/image_captioning_bias/images/chatgpt"
    os.makedirs(outdir, exist_ok=True)
    plt.figure(figsize=(8,6))
    # Standard deviation of the noise added to the points; 0.0 adds no noise.
    jitter = 0.0

    x_m = male_all['warmth_emb_sim'].values + np.random.normal(0, jitter, size=male_all.shape[0])
    y_m = male_all['competence_emb_sim'].values + np.random.normal(0, jitter, size=male_all.shape[0])
    x_f = female_all['warmth_emb_sim'].values + np.random.normal(0, jitter, size=female_all.shape[0])
    y_f = female_all['competence_emb_sim'].values + np.random.normal(0, jitter, size=female_all.shape[0])
    plt.scatter(x_m, y_m, alpha=0.6, label='male', s=sz1)
    plt.scatter(x_f, y_f, alpha=0.6, label='female', s=sz1)

    # Straight-line fit of competence on warmth; skipped when there are fewer
    # than two points or x has no spread. The line is drawn across the
    # combined male + female x range.
    if len(x_m) >= 2 and np.nanstd(x_m) > 0:
        coeffs_m = np.polyfit(x_m, y_m, 1)
        xs = np.linspace(np.nanmin(np.concatenate([x_m,x_f])), np.nanmax(np.concatenate([x_m,x_f])), 200)
        ys_m = coeffs_m[0]*xs + coeffs_m[1]
        plt.plot(xs, ys_m, color='black', linestyle='-', linewidth=2, label=f'male fit (slope={coeffs_m[0]:.4f})')

    if len(x_f) >= 2 and np.nanstd(x_f) > 0:
        coeffs_f = np.polyfit(x_f, y_f, 1)
        xs = np.linspace(np.nanmin(np.concatenate([x_m,x_f])), np.nanmax(np.concatenate([x_m,x_f])), 200)
        ys_f = coeffs_f[0]*xs + coeffs_f[1]
        plt.plot(xs, ys_f, color='gray', linestyle='--', linewidth=2, label=f'female fit (slope={coeffs_f[0]:.4f})')
    # Mark the mean point of each gender's cloud.
    cx_m, cy_m = np.mean(x_m), np.mean(y_m)
    cx_f, cy_f = np.mean(x_f), np.mean(y_f)
    plt.scatter([cx_m],[cy_m], marker='X', s=60)
    plt.text(cx_m, cy_m, "  male centroid", fontsize=9)
    plt.scatter([cx_f],[cy_f], marker='D', s=60)
    plt.text(cx_f, cy_f, "  female centroid", fontsize=9)
    plt.xlabel("Warmth similarity (embedding centroid)")
    plt.ylabel("Competence similarity (embedding centroid)")
    plt.title("Warmth vs Competence — Male vs Female (ChatGPT)")
    plt.legend(loc='best')
    plt.tight_layout()
    outfig = os.path.join(outdir, "chatgpt_agg_male_vs_female_warmth_competence_with_slopes.png")
    plt.savefig(outfig, dpi=300)
    plt.show()
    # Second figure: the six skin indices pooled into three coarse groups.
    SKIN_GROUPS = {"light":[1,2], "mid":[3,4], "dark":[5,6]}
    skin_agg_results = {}
    skin_embs = {}  # not populated in this block

    # Combine male and female rows for each skin index; indices with no rows
    # are left out of skin_agg_results.
    for s in range(1,7):
        name_m = f"df_1_{s}"
        name_f = f"df_2_{s}"
        combined_raw = pd.concat([analysis_results.get(name_m, pd.DataFrame()), analysis_results.get(name_f, pd.DataFrame())], ignore_index=True)
        if combined_raw is None or combined_raw.empty:
            continue
        skin_res = combined_raw.copy()
        skin_res['skin'] = f"skin{s}"
        skin_agg_results[s] = skin_res

    group_results = {}
    group_embs = {}  # not populated in this block

    # Merge the per-index frames into light / mid / dark, skipping any group
    # that has no data at all.
    for group_name, skin_list in SKIN_GROUPS.items():
        dfs = []
        for s in skin_list:
            sdf = skin_agg_results.get(s)
            if sdf is not None and not sdf.empty:
                dfs.append(sdf)
        if not dfs:
            continue
        combined = pd.concat(dfs, ignore_index=True)
        combined = combined.dropna(subset=['warmth_emb_sim','competence_emb_sim'])
        group_results[group_name] = combined
    group_colors = {"light":"#1f77b4", "mid":"#ff7f0e", "dark":"#2ca02c"}
    group_markers = {"light":"o", "mid":"^", "dark":"s"}  # not used by the plotting calls below
    plt.figure(figsize=(9,7))

    # Fitted slope per skin group, filled in only when a fit is made.
    slopes = {}

    for group_name in group_results.keys():
        gdf = group_results[group_name]
        x = gdf['warmth_emb_sim'].values
        y = gdf['competence_emb_sim'].values
        plt.scatter(x, y, label=group_name, alpha=0.6, s=sz1, color=group_colors.get(group_name,None))
        # Same fit guard as above; here the line spans only this group's x range.
        if len(x) >= 2 and np.nanstd(x) > 0:
            coeffs = np.polyfit(x, y, 1)
            xs_line = np.linspace(float(np.nanmin(x)), float(np.nanmax(x)), 200)
            ys_line = coeffs[0]*xs_line + coeffs[1]
            plt.plot(xs_line, ys_line, color=group_colors.get(group_name,None), linewidth=2, label=f"{group_name} fit (slope={coeffs[0]:.4f})")
            slopes[group_name] = float(coeffs[0])
        cx = np.mean(x); cy = np.mean(y)
        plt.scatter([cx],[cy], s=60, color=group_colors.get(group_name,None))
        plt.text(cx, cy, f"  {group_name}", fontsize=9)

    plt.xlabel("Warmth similarity")
    plt.ylabel("Competence similarity")
    plt.title("Warmth vs Competence — Skin groups (light/mid/dark) (ChatGPT)")
    plt.legend(title="skin group", loc='best')
    plt.tight_layout()
    out_comb_warm = os.path.join(outdir, "chatgpt_combined_skingroups_warmth_competence_with_slopes.png")
    plt.savefig(out_comb_warm, dpi=300)
    plt.show()
    print("Saved:", outfig)
    print("Saved:", out_comb_warm)


## Claude

In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

# Input CSV with the Claude captions on the mounted Drive.
CSV_PATH = "/content/drive/MyDrive/image_captioning_bias/claude/captions.csv"
# Lookup tables for the two integers parsed from the start of each Filename
# (see load_and_prepare): the first selects a gender label, the second a
# skin label.
GENDER_MAP = {1: "male", 2: "female"}
RACE_MAP = {1: "skin1", 2: "skin2", 3: "skin3", 4: "skin4", 5: "skin5", 6: "skin6"}
# Seed words, used both for exact-token lexicon counts and as the seeds whose
# embeddings are averaged into the warmth / competence centroids.
WARMTH_LEXICON = {"caring","nurturing","friendly","warm","kind","gentle","compassionate","loving","supportive","affectionate"}
COMPETENCE_LEXICON = {"skilled","competent","expert","talented","proficient","capable","strong","efficient","accomplished"}
# Sentence-embedding model shared by every encode call in this cell; it is
# loaded as soon as the cell runs.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBED_MODEL_NAME)

def find_text_column(df):
    """Return the label of the column in ``df`` most likely to hold captions.

    Well-known caption column names are tried in priority order, each first
    as an exact match and then case-insensitively. If none match, the
    string-like column with the greatest mean value length is returned.

    Args:
        df: DataFrame to inspect.

    Returns:
        The matching column label exactly as it appears in ``df.columns``.

    Raises:
        KeyError: If no candidate name matches and ``df`` has no string-like
            columns.
    """
    candidates = ["caption","Caption","captions","text","Text","caption_text","description","Description"]
    # Built once instead of once per candidate. Labels go through str() so a
    # non-string label (e.g. an integer header) cannot break the lookup.
    lowcols = {str(col).lower(): col for col in df.columns}
    for c in candidates:
        if c in df.columns:
            return c
        if c.lower() in lowcols:
            return lowcols[c.lower()]
    # Fallback heuristic: captions are assumed to be the longest text column.
    string_cols = [col for col in df.columns if df[col].dtype == object or pd.api.types.is_string_dtype(df[col])]
    if not string_cols:
        raise KeyError("No string-like columns found")
    avg_len = {col: df[col].astype(str).map(len).mean() for col in string_cols}
    return max(avg_len, key=avg_len.get)

def extract_numbers(filename):
    """Parse the two leading underscore-separated integers of a filename.

    Args:
        filename: Any value; it is converted with ``str()`` before parsing.

    Returns:
        ``(first, second)`` as ints when the text has at least two
        underscore-separated fields and the first two are integers,
        otherwise ``(None, None)``.
    """
    parts = str(filename).split('_')
    if len(parts) < 2:
        return None, None
    try:
        return int(parts[0]), int(parts[1])
    except ValueError:
        # int() on a str can only raise ValueError, so catching just that
        # keeps unrelated errors visible.
        return None, None

def load_and_prepare(csv_path):
    """Load a captions CSV and split it into gender x skin sub-frames.

    The detected text column is renamed to ``caption``. The two integers at
    the start of each ``Filename`` are stored as ``first_num`` and
    ``second_num`` and mapped to ``gender_label`` / ``race_label``
    (``"unknown"`` when unmapped).

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        ``(df, split_dataframes)``: the full annotated frame, and a dict
        keyed ``"df_<first>_<second>"`` for first in 1..2 and second in 1..6.
        Each value is the matching rows with a fresh index and a constant
        ``group_label`` column (``""`` for an empty group).
    """
    raw = pd.read_csv(csv_path)
    caption_col = find_text_column(raw)
    df = raw.copy()
    df.rename(columns={caption_col: "caption"}, inplace=True)

    parsed = df['Filename'].apply(lambda fname: pd.Series(extract_numbers(fname)))
    df[['first_num','second_num']] = parsed
    df['gender_label'] = df['first_num'].map(GENDER_MAP).fillna("unknown")
    df['race_label'] = df['second_num'].map(RACE_MAP).fillna("unknown")

    split_dataframes = {}
    for first in (1, 2):
        for second in range(1, 7):
            mask = (df['first_num']==first) & (df['second_num']==second)
            sdf = df[mask].copy().reset_index(drop=True)
            if sdf.empty:
                label = ""
            else:
                # All rows share the same labels, so the first row is enough.
                label = f"{sdf['gender_label'].iloc[0]}_{sdf['race_label'].iloc[0]}"
            sdf['group_label'] = label
            split_dataframes[f"df_{first}_{second}"] = sdf
    return df, split_dataframes

def compute_caption_embeddings(captions, batch_size=64):
    """Encode captions with the module-level ``model`` and L2-normalize rows.

    Args:
        captions: Iterable of caption texts.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        Array of embeddings with each row divided by its Euclidean norm.
        Rows whose norm is zero are left as all zeros.
    """
    sentences = list(captions)
    raw_vectors = model.encode(sentences, convert_to_numpy=True, show_progress_bar=False, batch_size=batch_size)
    row_norms = np.linalg.norm(raw_vectors, axis=1, keepdims=True)
    # Divide zero-norm rows by 1 instead of 0 so they do not turn into NaN.
    zero_rows = row_norms == 0
    row_norms[zero_rows] = 1.0
    return raw_vectors / row_norms

def compute_embedding_centroid(seeds):
    """Return the normalized mean embedding of the seed texts.

    Args:
        seeds: Non-empty sequence of seed words or phrases.

    Returns:
        The mean of the seed embeddings divided by its Euclidean norm, or
        the unscaled mean if that norm is zero.

    Raises:
        ValueError: If ``seeds`` is empty.
    """
    if not seeds:
        raise ValueError("Empty seeds")
    seed_vectors = model.encode(seeds, convert_to_numpy=True, show_progress_bar=False)
    mean_vector = seed_vectors.mean(axis=0)
    length = np.linalg.norm(mean_vector)
    # A zero-length mean cannot be normalized; hand it back unchanged.
    return mean_vector / length if length > 0 else mean_vector

def cos_sim_to_centroid(caption_embs, centroid):
    """Return the dot product of every caption embedding with ``centroid``.

    The result equals cosine similarity only when both inputs are already
    unit length; no normalization is done here.
    """
    similarities = np.dot(caption_embs, centroid)
    return similarities

def simple_tokenize(text):
    """Lower-case ``text`` and split it into whitespace-separated tokens.

    Every character other than a word character, whitespace or an
    apostrophe is replaced by a space before splitting. Missing values (as
    judged by ``pd.isna``) yield an empty list.
    """
    if pd.isna(text):
        return []
    lowered = str(text).lower()
    cleaned = re.sub(r"[^\w\s']", " ", lowered)
    # str.split() with no argument never yields empty strings.
    return cleaned.split()

def lexicon_scores_for_caption(caption, warmth_lex=WARMTH_LEXICON, comp_lex=COMPETENCE_LEXICON):
    """Count warmth and competence lexicon hits in one caption.

    Args:
        caption: Caption text (tokenized with ``simple_tokenize``).
        warmth_lex: Collection of warmth words matched by exact token.
        comp_lex: Collection of competence words matched by exact token.

    Returns:
        Dict with ``word_count``, ``warmth_count``, ``competence_count``,
        ``warmth_rate`` and ``competence_rate``; both rates are 0.0 for a
        caption with no tokens.
    """
    tokens = simple_tokenize(caption)
    n_tokens = len(tokens)
    n_warm = len([tok for tok in tokens if tok in warmth_lex])
    n_comp = len([tok for tok in tokens if tok in comp_lex])
    if n_tokens:
        warmth_rate = n_warm / n_tokens
        competence_rate = n_comp / n_tokens
    else:
        # Avoid dividing by zero for empty or missing captions.
        warmth_rate = 0.0
        competence_rate = 0.0
    return {
        "word_count": n_tokens,
        "warmth_count": n_warm,
        "competence_count": n_comp,
        "warmth_rate": warmth_rate,
        "competence_rate": competence_rate,
    }

def analyze_group_df(group_df, warmth_seeds=list(WARMTH_LEXICON), comp_seeds=list(COMPETENCE_LEXICON)):
    """Add lexicon counts and embedding similarities to one group's captions.

    Args:
        group_df: DataFrame with a ``caption`` column, or ``None``.
        warmth_seeds: Seed words for the warmth centroid.
        comp_seeds: Seed words for the competence centroid.

    Returns:
        ``(out, cap_embs)``. ``out`` is ``group_df`` with a fresh index plus
        the lexicon score columns and ``warmth_emb_sim`` /
        ``competence_emb_sim``; ``cap_embs`` holds the normalized caption
        embeddings. For a ``None`` or zero-length input the result is
        ``(pd.DataFrame(), None)``.
    """
    if group_df is None or len(group_df)==0:
        return pd.DataFrame(), None

    captions = group_df['caption']
    lex_df = pd.DataFrame([lexicon_scores_for_caption(text) for text in captions])
    cap_embs = compute_caption_embeddings(captions.tolist())

    # Centroids are recomputed on every call from the seed lists.
    warmth_centroid = compute_embedding_centroid(warmth_seeds)
    comp_centroid = compute_embedding_centroid(comp_seeds)

    # Resetting the index lines the rows up with lex_df's default index.
    out = pd.concat([group_df.copy().reset_index(drop=True), lex_df], axis=1)
    out['warmth_emb_sim'] = cos_sim_to_centroid(cap_embs, warmth_centroid)
    out['competence_emb_sim'] = cos_sim_to_centroid(cap_embs, comp_centroid)
    return out, cap_embs

# Claude pipeline: score every gender x skin group, then draw and save two
# warmth-vs-competence scatter plots (by gender, and by pooled skin group).
if __name__ == "__main__":
    base_df, split_dataframes = load_and_prepare(CSV_PATH)
    analysis_results = {}
    embeddings_store = {}
    # Score each group; empty groups get placeholder entries so that every
    # "df_<first>_<second>" key exists in both dicts.
    for name, sdf in split_dataframes.items():
        if sdf is None or sdf.empty:
            analysis_results[name] = pd.DataFrame()
            embeddings_store[name] = None
            continue
        res_df, embs = analyze_group_df(sdf)
        analysis_results[name] = res_df
        embeddings_store[name] = embs

    # Pool the six skin groups per gender (first number 1 = male, 2 = female).
    male_all = pd.concat([analysis_results.get(f"df_1_{i}", pd.DataFrame()) for i in range(1,7)], ignore_index=True)
    female_all = pd.concat([analysis_results.get(f"df_2_{i}", pd.DataFrame()) for i in range(1,7)], ignore_index=True)
    # Drop rows lacking either similarity score.
    male_all = male_all.dropna(subset=['warmth_emb_sim','competence_emb_sim'])
    female_all = female_all.dropna(subset=['warmth_emb_sim','competence_emb_sim'])
    # NOTE(review): these two centroids are not read again in this block;
    # analyze_group_df computes its own.
    warmth_centroid = compute_embedding_centroid(list(WARMTH_LEXICON))
    comp_centroid = compute_embedding_centroid(list(COMPETENCE_LEXICON))

    outdir = "/content/drive/MyDrive/image_captioning_bias/images/claude"
    os.makedirs(outdir, exist_ok=True)
    plt.figure(figsize=(8,6))
    # Standard deviation of the noise added to the points; 0.0 adds no noise.
    jitter = 0.0

    x_m = male_all['warmth_emb_sim'].values + np.random.normal(0, jitter, size=male_all.shape[0])
    y_m = male_all['competence_emb_sim'].values + np.random.normal(0, jitter, size=male_all.shape[0])
    x_f = female_all['warmth_emb_sim'].values + np.random.normal(0, jitter, size=female_all.shape[0])
    y_f = female_all['competence_emb_sim'].values + np.random.normal(0, jitter, size=female_all.shape[0])

    plt.scatter(x_m, y_m, alpha=0.6, label='male', s=sz1)
    plt.scatter(x_f, y_f, alpha=0.6, label='female', s=sz1)

    # Straight-line fit of competence on warmth; skipped when there are fewer
    # than two points or x has no spread. The line is drawn across the
    # combined male + female x range.
    if len(x_m) >= 2 and np.nanstd(x_m) > 0:
        coeffs_m = np.polyfit(x_m, y_m, 1)
        xs = np.linspace(np.nanmin(np.concatenate([x_m,x_f])), np.nanmax(np.concatenate([x_m,x_f])), 200)
        ys_m = coeffs_m[0]*xs + coeffs_m[1]
        plt.plot(xs, ys_m, color='black', linestyle='-', linewidth=2, label=f'male fit (slope={coeffs_m[0]:.4f})')

    if len(x_f) >= 2 and np.nanstd(x_f) > 0:
        coeffs_f = np.polyfit(x_f, y_f, 1)
        xs = np.linspace(np.nanmin(np.concatenate([x_m,x_f])), np.nanmax(np.concatenate([x_m,x_f])), 200)
        ys_f = coeffs_f[0]*xs + coeffs_f[1]
        plt.plot(xs, ys_f, color='gray', linestyle='--', linewidth=2, label=f'female fit (slope={coeffs_f[0]:.4f})')

    # Mark the mean point of each gender's cloud.
    cx_m, cy_m = np.mean(x_m), np.mean(y_m)
    cx_f, cy_f = np.mean(x_f), np.mean(y_f)

    plt.scatter([cx_m],[cy_m], marker='X', s=sz2, c='black')
    plt.text(cx_m, cy_m, "  male centroid", fontsize=9)
    plt.scatter([cx_f],[cy_f], marker='D', s=sz2, c='black')
    plt.text(cx_f, cy_f, "  female centroid", fontsize=9)
    plt.xlabel("Warmth similarity (embedding centroid)")
    plt.ylabel("Competence similarity (embedding centroid)")
    plt.title("Warmth vs Competence — Male vs Female (Claude)")
    plt.legend(loc='best')
    plt.tight_layout()
    outfig = os.path.join(outdir, "claude_agg_male_vs_female_warmth_competence_with_slopes.png")
    plt.savefig(outfig, dpi=300)
    plt.show()

    # Second figure: the six skin indices pooled into three coarse groups.
    SKIN_GROUPS = {"light":[1,2], "mid":[3,4], "dark":[5,6]}
    skin_agg_results = {}
    skin_embs = {}  # not populated in this block

    # Combine male and female rows for each skin index; indices with no rows
    # are left out of skin_agg_results.
    for s in range(1,7):
        name_m = f"df_1_{s}"
        name_f = f"df_2_{s}"
        combined_raw = pd.concat([analysis_results.get(name_m, pd.DataFrame()), analysis_results.get(name_f, pd.DataFrame())], ignore_index=True)
        if combined_raw is None or combined_raw.empty:
            continue
        skin_res = combined_raw.copy()
        skin_res['skin'] = f"skin{s}"
        skin_agg_results[s] = skin_res
    group_results = {}
    group_embs = {}  # not populated in this block

    # Merge the per-index frames into light / mid / dark, skipping any group
    # that has no data at all.
    for group_name, skin_list in SKIN_GROUPS.items():
        dfs = []
        for s in skin_list:
            sdf = skin_agg_results.get(s)
            if sdf is not None and not sdf.empty:
                dfs.append(sdf)
        if not dfs:
            continue
        combined = pd.concat(dfs, ignore_index=True)
        combined = combined.dropna(subset=['warmth_emb_sim','competence_emb_sim'])
        group_results[group_name] = combined
    group_colors = {"light":"#1f77b4", "mid":"#ff7f0e", "dark":"#2ca02c"}
    group_markers = {"light":"o", "mid":"^", "dark":"s"}  # not used by the plotting calls below
    plt.figure(figsize=(9,7))

    # Fitted slope per skin group, filled in only when a fit is made.
    # (This line was indented with three spaces, which raised an
    # IndentationError and kept the whole cell from running.)
    slopes = {}

    for group_name in group_results.keys():
        gdf = group_results[group_name]
        x = gdf['warmth_emb_sim'].values
        y = gdf['competence_emb_sim'].values
        plt.scatter(x, y, label=group_name, alpha=0.6, s=sz1, color=group_colors.get(group_name,None))
        # Same fit guard as above; here the line spans only this group's x range.
        if len(x) >= 2 and np.nanstd(x) > 0:
            coeffs = np.polyfit(x, y, 1)
            xs_line = np.linspace(float(np.nanmin(x)), float(np.nanmax(x)), 200)
            ys_line = coeffs[0]*xs_line + coeffs[1]
            plt.plot(xs_line, ys_line, color=group_colors.get(group_name,None), linewidth=2, label=f"{group_name} fit (slope={coeffs[0]:.4f})")
            slopes[group_name] = float(coeffs[0])
        cx = np.mean(x); cy = np.mean(y)
        plt.scatter([cx],[cy], marker='X', s=sz2, color=group_colors.get(group_name,None))
        plt.text(cx, cy, f"  {group_name}", fontsize=9)

    plt.xlabel("Warmth similarity")
    plt.ylabel("Competence similarity")
    plt.title("Warmth vs Competence — Skin groups (light/mid/dark) (Claude)")
    plt.legend(title="skin group", loc='best')
    plt.tight_layout()
    out_comb_warm = os.path.join(outdir, "claude_combined_skingroups_warmth_competence_with_slopes.png")
    plt.savefig(out_comb_warm, dpi=300)
    plt.show()
    print("Saved:", outfig)
    print("Saved:", out_comb_warm)


## Gemini

In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

# Input CSV with the Gemini captions on the mounted Drive.
CSV_PATH = "/content/drive/MyDrive/image_captioning_bias/gemini/captions.csv"
# Lookup tables for the two integers parsed from the start of each Filename
# (see load_and_prepare): the first selects a gender label, the second a
# skin label.
GENDER_MAP = {1: "male", 2: "female"}
RACE_MAP = {1: "skin1", 2: "skin2", 3: "skin3", 4: "skin4", 5: "skin5", 6: "skin6"}
# Seed words, used both for exact-token lexicon counts and as the seeds whose
# embeddings are averaged into the warmth / competence centroids.
WARMTH_LEXICON = {"caring","nurturing","friendly","warm","kind","gentle","compassionate","loving","supportive","affectionate"}
COMPETENCE_LEXICON = {"skilled","competent","expert","talented","proficient","capable","strong","efficient","accomplished"}
# Sentence-embedding model shared by every encode call in this cell; it is
# loaded as soon as the cell runs.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBED_MODEL_NAME)

def find_text_column(df):
    """Return the label of the column in ``df`` most likely to hold captions.

    Well-known caption column names are tried in priority order, each first
    as an exact match and then case-insensitively. If none match, the
    string-like column with the greatest mean value length is returned.

    Args:
        df: DataFrame to inspect.

    Returns:
        The matching column label exactly as it appears in ``df.columns``.

    Raises:
        KeyError: If no candidate name matches and ``df`` has no string-like
            columns.
    """
    candidates = ["caption","Caption","captions","text","Text","caption_text","description","Description"]
    # Built once instead of once per candidate. Labels go through str() so a
    # non-string label (e.g. an integer header) cannot break the lookup.
    lowcols = {str(col).lower(): col for col in df.columns}
    for c in candidates:
        if c in df.columns:
            return c
        if c.lower() in lowcols:
            return lowcols[c.lower()]
    # Fallback heuristic: captions are assumed to be the longest text column.
    string_cols = [col for col in df.columns if df[col].dtype == object or pd.api.types.is_string_dtype(df[col])]
    if not string_cols:
        raise KeyError("No string-like columns found")
    avg_len = {col: df[col].astype(str).map(len).mean() for col in string_cols}
    return max(avg_len, key=avg_len.get)

def extract_numbers(filename):
    """Parse the two leading underscore-separated integers of a filename.

    Args:
        filename: Any value; it is converted with ``str()`` before parsing.

    Returns:
        ``(first, second)`` as ints when the text has at least two
        underscore-separated fields and the first two are integers,
        otherwise ``(None, None)``.
    """
    parts = str(filename).split('_')
    if len(parts) < 2:
        return None, None
    try:
        return int(parts[0]), int(parts[1])
    except ValueError:
        # int() on a str can only raise ValueError, so catching just that
        # keeps unrelated errors visible.
        return None, None

def load_and_prepare(csv_path):
    """Load a captions CSV and split it into gender x skin sub-frames.

    The detected text column is renamed to ``caption``. The two integers at
    the start of each ``Filename`` are stored as ``first_num`` and
    ``second_num`` and mapped to ``gender_label`` / ``race_label``
    (``"unknown"`` when unmapped).

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        ``(df, split_dataframes)``: the full annotated frame, and a dict
        keyed ``"df_<first>_<second>"`` for first in 1..2 and second in 1..6.
        Each value is the matching rows with a fresh index and a constant
        ``group_label`` column (``""`` for an empty group).
    """
    raw = pd.read_csv(csv_path)
    caption_col = find_text_column(raw)
    df = raw.copy()
    df.rename(columns={caption_col: "caption"}, inplace=True)

    parsed = df['Filename'].apply(lambda fname: pd.Series(extract_numbers(fname)))
    df[['first_num','second_num']] = parsed
    df['gender_label'] = df['first_num'].map(GENDER_MAP).fillna("unknown")
    df['race_label'] = df['second_num'].map(RACE_MAP).fillna("unknown")

    split_dataframes = {}
    for first in (1, 2):
        for second in range(1, 7):
            mask = (df['first_num']==first) & (df['second_num']==second)
            sdf = df[mask].copy().reset_index(drop=True)
            if sdf.empty:
                label = ""
            else:
                # All rows share the same labels, so the first row is enough.
                label = f"{sdf['gender_label'].iloc[0]}_{sdf['race_label'].iloc[0]}"
            sdf['group_label'] = label
            split_dataframes[f"df_{first}_{second}"] = sdf
    return df, split_dataframes

def compute_caption_embeddings(captions, batch_size=64):
    """Encode captions with the module-level ``model`` and L2-normalize rows.

    Args:
        captions: Iterable of caption texts.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        Array of embeddings with each row divided by its Euclidean norm.
        Rows whose norm is zero are left as all zeros.
    """
    sentences = list(captions)
    raw_vectors = model.encode(sentences, convert_to_numpy=True, show_progress_bar=False, batch_size=batch_size)
    row_norms = np.linalg.norm(raw_vectors, axis=1, keepdims=True)
    # Divide zero-norm rows by 1 instead of 0 so they do not turn into NaN.
    zero_rows = row_norms == 0
    row_norms[zero_rows] = 1.0
    return raw_vectors / row_norms

def compute_embedding_centroid(seeds):
    """Return the normalized mean embedding of the seed texts.

    Args:
        seeds: Non-empty sequence of seed words or phrases.

    Returns:
        The mean of the seed embeddings divided by its Euclidean norm, or
        the unscaled mean if that norm is zero.

    Raises:
        ValueError: If ``seeds`` is empty.
    """
    if not seeds:
        raise ValueError("Empty seeds")
    seed_vectors = model.encode(seeds, convert_to_numpy=True, show_progress_bar=False)
    mean_vector = seed_vectors.mean(axis=0)
    length = np.linalg.norm(mean_vector)
    # A zero-length mean cannot be normalized; hand it back unchanged.
    return mean_vector / length if length > 0 else mean_vector

def cos_sim_to_centroid(caption_embs, centroid):
    """Return the dot product of every caption embedding with ``centroid``.

    The result equals cosine similarity only when both inputs are already
    unit length; no normalization is done here.
    """
    similarities = np.dot(caption_embs, centroid)
    return similarities

def simple_tokenize(text):
    """Lower-case ``text`` and split it into whitespace-separated tokens.

    Every character other than a word character, whitespace or an
    apostrophe is replaced by a space before splitting. Missing values (as
    judged by ``pd.isna``) yield an empty list.
    """
    if pd.isna(text):
        return []
    lowered = str(text).lower()
    cleaned = re.sub(r"[^\w\s']", " ", lowered)
    # str.split() with no argument never yields empty strings.
    return cleaned.split()

def lexicon_scores_for_caption(caption, warmth_lex=WARMTH_LEXICON, comp_lex=COMPETENCE_LEXICON):
    """Count warmth and competence lexicon hits in one caption.

    Args:
        caption: Caption text (tokenized with ``simple_tokenize``).
        warmth_lex: Collection of warmth words matched by exact token.
        comp_lex: Collection of competence words matched by exact token.

    Returns:
        Dict with ``word_count``, ``warmth_count``, ``competence_count``,
        ``warmth_rate`` and ``competence_rate``; both rates are 0.0 for a
        caption with no tokens.
    """
    tokens = simple_tokenize(caption)
    n_tokens = len(tokens)
    n_warm = len([tok for tok in tokens if tok in warmth_lex])
    n_comp = len([tok for tok in tokens if tok in comp_lex])
    if n_tokens:
        warmth_rate = n_warm / n_tokens
        competence_rate = n_comp / n_tokens
    else:
        # Avoid dividing by zero for empty or missing captions.
        warmth_rate = 0.0
        competence_rate = 0.0
    return {
        "word_count": n_tokens,
        "warmth_count": n_warm,
        "competence_count": n_comp,
        "warmth_rate": warmth_rate,
        "competence_rate": competence_rate,
    }

def analyze_group_df(group_df, warmth_seeds=list(WARMTH_LEXICON), comp_seeds=list(COMPETENCE_LEXICON)):
    """Add lexicon counts and embedding similarities to one group's captions.

    Args:
        group_df: DataFrame with a ``caption`` column, or ``None``.
        warmth_seeds: Seed words for the warmth centroid.
        comp_seeds: Seed words for the competence centroid.

    Returns:
        ``(out, cap_embs)``. ``out`` is ``group_df`` with a fresh index plus
        the lexicon score columns and ``warmth_emb_sim`` /
        ``competence_emb_sim``; ``cap_embs`` holds the normalized caption
        embeddings. For a ``None`` or zero-length input the result is
        ``(pd.DataFrame(), None)``.
    """
    if group_df is None or len(group_df)==0:
        return pd.DataFrame(), None

    captions = group_df['caption']
    lex_df = pd.DataFrame([lexicon_scores_for_caption(text) for text in captions])
    cap_embs = compute_caption_embeddings(captions.tolist())

    # Centroids are recomputed on every call from the seed lists.
    warmth_centroid = compute_embedding_centroid(warmth_seeds)
    comp_centroid = compute_embedding_centroid(comp_seeds)

    # Resetting the index lines the rows up with lex_df's default index.
    out = pd.concat([group_df.copy().reset_index(drop=True), lex_df], axis=1)
    out['warmth_emb_sim'] = cos_sim_to_centroid(cap_embs, warmth_centroid)
    out['competence_emb_sim'] = cos_sim_to_centroid(cap_embs, comp_centroid)
    return out, cap_embs

# Gemini pipeline: score every gender x skin group, then draw and save two
# warmth-vs-competence scatter plots (by gender, and by pooled skin group).
if __name__ == "__main__":
    base_df, split_dataframes = load_and_prepare(CSV_PATH)
    analysis_results = {}
    embeddings_store = {}

    # Score each group; empty groups get placeholder entries so that every
    # "df_<first>_<second>" key exists in both dicts.
    for name, sdf in split_dataframes.items():
        if sdf is None or sdf.empty:
            analysis_results[name] = pd.DataFrame()
            embeddings_store[name] = None
            continue
        res_df, embs = analyze_group_df(sdf)
        analysis_results[name] = res_df
        embeddings_store[name] = embs

    # Pool the six skin groups per gender (first number 1 = male, 2 = female).
    male_all = pd.concat([analysis_results.get(f"df_1_{i}", pd.DataFrame()) for i in range(1,7)], ignore_index=True)
    female_all = pd.concat([analysis_results.get(f"df_2_{i}", pd.DataFrame()) for i in range(1,7)], ignore_index=True)
    # Drop rows lacking either similarity score.
    male_all = male_all.dropna(subset=['warmth_emb_sim','competence_emb_sim'])
    female_all = female_all.dropna(subset=['warmth_emb_sim','competence_emb_sim'])
    # NOTE(review): these two centroids are not read again in this block;
    # analyze_group_df computes its own.
    warmth_centroid = compute_embedding_centroid(list(WARMTH_LEXICON))
    comp_centroid = compute_embedding_centroid(list(COMPETENCE_LEXICON))
    outdir = "/content/drive/MyDrive/image_captioning_bias/images/gemini"
    os.makedirs(outdir, exist_ok=True)
    plt.figure(figsize=(8,6))
    # Standard deviation of the noise added to the points; 0.0 adds no noise.
    jitter = 0.0

    x_m = male_all['warmth_emb_sim'].values + np.random.normal(0, jitter, size=male_all.shape[0])
    y_m = male_all['competence_emb_sim'].values + np.random.normal(0, jitter, size=male_all.shape[0])
    x_f = female_all['warmth_emb_sim'].values + np.random.normal(0, jitter, size=female_all.shape[0])
    y_f = female_all['competence_emb_sim'].values + np.random.normal(0, jitter, size=female_all.shape[0])

    plt.scatter(x_m, y_m, alpha=0.6, label='male', s=sz1)
    plt.scatter(x_f, y_f, alpha=0.6, label='female', s=sz1)

    # Straight-line fit of competence on warmth; skipped when there are fewer
    # than two points or x has no spread. The line is drawn across the
    # combined male + female x range.
    if len(x_m) >= 2 and np.nanstd(x_m) > 0:
        coeffs_m = np.polyfit(x_m, y_m, 1)
        xs = np.linspace(np.nanmin(np.concatenate([x_m,x_f])), np.nanmax(np.concatenate([x_m,x_f])), 200)
        ys_m = coeffs_m[0]*xs + coeffs_m[1]
        plt.plot(xs, ys_m, color='black', linestyle='-', linewidth=2, label=f'male fit (slope={coeffs_m[0]:.4f})')
    if len(x_f) >= 2 and np.nanstd(x_f) > 0:
        coeffs_f = np.polyfit(x_f, y_f, 1)
        xs = np.linspace(np.nanmin(np.concatenate([x_m,x_f])), np.nanmax(np.concatenate([x_m,x_f])), 200)
        ys_f = coeffs_f[0]*xs + coeffs_f[1]
        plt.plot(xs, ys_f, color='gray', linestyle='--', linewidth=2, label=f'female fit (slope={coeffs_f[0]:.4f})')

    # Mark the mean point of each gender's cloud.
    cx_m, cy_m = np.mean(x_m), np.mean(y_m)
    cx_f, cy_f = np.mean(x_f), np.mean(y_f)

    plt.scatter([cx_m],[cy_m], marker='X', s=sz2, c='black')
    plt.text(cx_m, cy_m, "  male centroid", fontsize=9)
    plt.scatter([cx_f],[cy_f], marker='D', s=sz2, c='black')
    plt.text(cx_f, cy_f, "  female centroid", fontsize=9)
    plt.xlabel("Warmth similarity (embedding centroid)")
    plt.ylabel("Competence similarity (embedding centroid)")
    plt.title("Warmth vs Competence — Male vs Female (Gemini)")
    plt.legend(loc='best')
    plt.tight_layout()
    outfig = os.path.join(outdir, "gemini_agg_male_vs_female_warmth_competence_with_slopes.png")
    plt.savefig(outfig, dpi=300)
    plt.show()

    # Second figure: the six skin indices pooled into three coarse groups.
    SKIN_GROUPS = {"light":[1,2], "mid":[3,4], "dark":[5,6]}
    skin_agg_results = {}
    skin_embs = {}  # not populated in this block

    # Combine male and female rows for each skin index; indices with no rows
    # are left out of skin_agg_results.
    for s in range(1,7):
        name_m = f"df_1_{s}"
        name_f = f"df_2_{s}"
        combined_raw = pd.concat([analysis_results.get(name_m, pd.DataFrame()), analysis_results.get(name_f, pd.DataFrame())], ignore_index=True)
        if combined_raw is None or combined_raw.empty:
            continue
        skin_res = combined_raw.copy()
        skin_res['skin'] = f"skin{s}"
        skin_agg_results[s] = skin_res

    group_results = {}
    group_embs = {}  # not populated in this block

    # Merge the per-index frames into light / mid / dark, skipping any group
    # that has no data at all.
    for group_name, skin_list in SKIN_GROUPS.items():
        dfs = []
        for s in skin_list:
            sdf = skin_agg_results.get(s)
            if sdf is not None and not sdf.empty:
                dfs.append(sdf)
        if not dfs:
            continue
        combined = pd.concat(dfs, ignore_index=True)
        combined = combined.dropna(subset=['warmth_emb_sim','competence_emb_sim'])
        group_results[group_name] = combined

    group_colors = {"light":"#1f77b4", "mid":"#ff7f0e", "dark":"#2ca02c"}
    group_markers = {"light":"o", "mid":"^", "dark":"s"}  # not used by the plotting calls below
    plt.figure(figsize=(9,7))
    # Fitted slope per skin group, filled in only when a fit is made.
    slopes = {}

    for group_name in group_results.keys():
        gdf = group_results[group_name]
        x = gdf['warmth_emb_sim'].values
        y = gdf['competence_emb_sim'].values
        plt.scatter(x, y, label=group_name, alpha=0.6, s=sz1, color=group_colors.get(group_name,None))
        # Same fit guard as above; here the line spans only this group's x range.
        if len(x) >= 2 and np.nanstd(x) > 0:
            coeffs = np.polyfit(x, y, 1)
            xs_line = np.linspace(float(np.nanmin(x)), float(np.nanmax(x)), 200)
            ys_line = coeffs[0]*xs_line + coeffs[1]
            plt.plot(xs_line, ys_line, color=group_colors.get(group_name,None), linewidth=2, label=f"{group_name} fit (slope={coeffs[0]:.4f})")
            slopes[group_name] = float(coeffs[0])
        cx = np.mean(x); cy = np.mean(y)
        plt.scatter([cx],[cy], marker='X', s=sz2, color=group_colors.get(group_name,None))
        plt.text(cx, cy, f"  {group_name}", fontsize=9)

    plt.xlabel("Warmth similarity")
    plt.ylabel("Competence similarity")
    plt.title("Warmth vs Competence — Skin groups (light/mid/dark) (Gemini)")
    plt.legend(title="skin group", loc='best')
    plt.tight_layout()
    out_comb_warm = os.path.join(outdir, "gemini_combined_skingroups_warmth_competence_with_slopes.png")
    plt.savefig(out_comb_warm, dpi=300)
    plt.show()
    print("Saved:", outfig)
    print("Saved:", out_comb_warm)

In [None]:
import os
import re
import pickle
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
# Caption CSV for each captioning model, keyed by a short model name.
CSV_PATHS = {
    "chatgpt": "/content/drive/MyDrive/image_captioning_bias/chatgpt/captions.csv",
    "claude":  "/content/drive/MyDrive/image_captioning_bias/claude/captions.csv",
    "gemini":  "/content/drive/MyDrive/image_captioning_bias/gemini/captions.csv",
}
# Output directory on the mounted Drive; created when this cell runs.
OUTDIR = "/content/drive/MyDrive/image_captioning_bias/processed"
os.makedirs(OUTDIR, exist_ok=True)

# Lookup tables for the two integers parsed from the start of each Filename
# (see load_and_prepare): the first selects a gender label, the second a
# skin label.
GENDER_MAP = {1: "male", 2: "female"}
RACE_MAP = {1: "skin1", 2: "skin2", 3: "skin3", 4: "skin4", 5: "skin5", 6: "skin6"}
# Seed word sets for the warmth and competence dimensions.
WARMTH_LEXICON = {"caring","nurturing","friendly","warm","kind","gentle","compassionate","loving","supportive","affectionate"}
COMPETENCE_LEXICON = {"skilled","competent","expert","talented","proficient","capable","strong","efficient","accomplished"}

# Sentence-embedding model; it is loaded as soon as the cell runs.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
print("Loading embedding model:", EMBED_MODEL_NAME)
model = SentenceTransformer(EMBED_MODEL_NAME)

def find_text_column(df):
    """Return the name of the column in ``df`` most likely to hold caption text.

    Well-known caption column names are tried in priority order, each first as
    an exact match and then case-insensitively. If none is present, the
    string-like column with the greatest mean text length is returned.

    Raises:
        KeyError: if no candidate matches and ``df`` has no string-like column.
    """
    candidates = ["caption","Caption","captions","text","Text","caption_text","description","Description"]
    # Built once, since it does not depend on the candidate. str() keeps
    # non-string column labels (e.g. integers) from breaking the lookup.
    lowcols = {str(col).lower(): col for col in df.columns}
    for c in candidates:
        if c in df.columns:
            return c
        if c.lower() in lowcols:
            return lowcols[c.lower()]
    string_cols = [col for col in df.columns if df[col].dtype == object or pd.api.types.is_string_dtype(df[col])]
    if not string_cols:
        raise KeyError("No string-like columns found")
    # Heuristic fallback: captions are assumed to be the longest text on average.
    avg_len = {col: df[col].astype(str).map(len).mean() for col in string_cols}
    return max(avg_len, key=avg_len.get)

def extract_numbers(filename):
    """Parse the two leading integer codes from an underscore-delimited filename.

    Returns:
        ``(first, second)`` as ints, or ``(None, None)`` when the name has fewer
        than two '_'-separated fields or either leading field is not an integer.
    """
    parts = str(filename).split('_')
    if len(parts) < 2:
        return None, None
    try:
        # int() applied to a str can only fail with ValueError, so catch just that.
        return int(parts[0]), int(parts[1])
    except ValueError:
        return None, None

def load_and_prepare(csv_path):
    """Load a caption CSV and split it into one frame per gender/skin-code pair.

    The detected text column is renamed to ``caption``; the two leading filename
    codes become ``first_num`` / ``second_num`` and are mapped to
    ``gender_label`` / ``race_label`` ("unknown" for unmapped codes).

    Returns:
        ``(df, split_dataframes)`` where ``split_dataframes`` maps
        ``"df_<gender>_<skin>"`` (gender 1-2, skin 1-6) to the matching rows,
        each carrying a constant ``group_label`` column (empty string for an
        empty subset).
    """
    frame = pd.read_csv(csv_path)
    frame = frame.rename(columns={find_text_column(frame): "caption"})

    parsed_codes = frame['Filename'].apply(lambda fname: pd.Series(extract_numbers(fname)))
    frame[['first_num','second_num']] = parsed_codes
    frame['gender_label'] = frame['first_num'].map(GENDER_MAP).fillna("unknown")
    frame['race_label'] = frame['second_num'].map(RACE_MAP).fillna("unknown")

    split_dataframes = {}
    for gender_code in (1, 2):
        for skin_code in range(1, 7):
            selector = (frame['first_num'] == gender_code) & (frame['second_num'] == skin_code)
            subset = frame[selector].copy().reset_index(drop=True)
            if subset.empty:
                label = ""
            else:
                # Every row of the subset shares the same labels; read them off the first row.
                label = f"{subset['gender_label'].iloc[0]}_{subset['race_label'].iloc[0]}"
            subset['group_label'] = label
            split_dataframes[f"df_{gender_code}_{skin_code}"] = subset
    return frame, split_dataframes

def compute_caption_embeddings(captions, batch_size=64):
    """Encode ``captions`` with the module-level ``model`` and scale each row to unit length.

    Rows whose norm is exactly zero are divided by 1 and therefore left unchanged.
    """
    vectors = model.encode(
        list(captions),
        convert_to_numpy=True,
        show_progress_bar=False,
        batch_size=batch_size,
    )
    row_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Avoid a division by zero for all-zero embeddings.
    row_norms[row_norms == 0] = 1.0
    return vectors / row_norms

def compute_embedding_centroid(seeds):
    """Return the mean embedding of ``seeds``, scaled to unit length when non-zero.

    Raises:
        ValueError: if ``seeds`` is empty.
    """
    if not seeds:
        raise ValueError("Empty seeds")
    seed_vectors = model.encode(seeds, convert_to_numpy=True, show_progress_bar=False)
    mean_vector = seed_vectors.mean(axis=0)
    length = np.linalg.norm(mean_vector)
    # A zero-length mean cannot be normalised; return it as is.
    return mean_vector / length if length > 0 else mean_vector

def cos_sim_to_centroid(caption_embs, centroid):
    """Return the dot product of every row of ``caption_embs`` with ``centroid``.

    This equals cosine similarity only when both inputs are unit-normalised;
    no normalisation is performed here.
    """
    similarities = np.dot(caption_embs, centroid)
    return similarities

def simple_tokenize(text):
    """Lower-case ``text`` and split it into whitespace-delimited tokens.

    Every character other than word characters, whitespace and apostrophes is
    treated as a separator. Missing values (``None`` / NaN) give an empty list.
    """
    if pd.isna(text):
        return []
    lowered = str(text).lower()
    cleaned = re.sub(r"[^\w\s']", " ", lowered)
    # str.split() with no argument never yields empty strings.
    return cleaned.split()

def lexicon_scores_for_caption(caption, warmth_lex=WARMTH_LEXICON, comp_lex=COMPETENCE_LEXICON):
    """Count warmth and competence lexicon hits in one caption.

    Returns:
        dict with ``word_count``, ``warmth_count``, ``competence_count`` and the
        two per-token rates (0.0 when the caption has no tokens).
    """
    tokens = simple_tokenize(caption)
    n_tokens = len(tokens)
    n_warm = len([tok for tok in tokens if tok in warmth_lex])
    n_comp = len([tok for tok in tokens if tok in comp_lex])
    if n_tokens:
        warm_rate = n_warm / n_tokens
        comp_rate = n_comp / n_tokens
    else:
        warm_rate = 0.0
        comp_rate = 0.0
    return {
        "word_count": n_tokens,
        "warmth_count": n_warm,
        "competence_count": n_comp,
        "warmth_rate": warm_rate,
        "competence_rate": comp_rate,
    }

def analyze_group_df(group_df, warmth_seeds=list(WARMTH_LEXICON), comp_seeds=list(COMPETENCE_LEXICON)):
    """Score every caption in ``group_df`` on lexicon counts and embedding similarity.

    Returns:
        ``(scored, caption_vectors)``: a copy of ``group_df`` (index reset) with
        the lexicon columns plus ``warmth_emb_sim`` / ``competence_emb_sim``
        appended, and the normalised caption embeddings. An empty or ``None``
        input gives ``(pd.DataFrame(), None)``.
    """
    if group_df is None or len(group_df)==0:
        return pd.DataFrame(), None

    captions = group_df['caption']
    lexicon_table = pd.DataFrame([lexicon_scores_for_caption(text) for text in captions])
    caption_vectors = compute_caption_embeddings(captions.tolist())

    # Reference directions: mean embedding of each seed-word list.
    warmth_axis = compute_embedding_centroid(warmth_seeds)
    competence_axis = compute_embedding_centroid(comp_seeds)

    scored = pd.concat([group_df.copy().reset_index(drop=True), lexicon_table], axis=1)
    scored['warmth_emb_sim'] = cos_sim_to_centroid(caption_vectors, warmth_axis)
    scored['competence_emb_sim'] = cos_sim_to_centroid(caption_vectors, competence_axis)
    return scored, caption_vectors

def _concat_nonempty(frames):
    """Concatenate the non-empty frames, or return an empty DataFrame if there are none."""
    kept = [frame for frame in frames if frame is not None and not frame.empty]
    if not kept:
        return pd.DataFrame()
    return pd.concat(kept, ignore_index=True)


def _drop_unscored_rows(frame):
    """Drop rows missing either similarity score; frames lacking those columns pass through."""
    score_cols = ['warmth_emb_sim', 'competence_emb_sim']
    # A frame built from no data has no columns at all, and dropna(subset=...)
    # would raise KeyError on it.
    if not set(score_cols).issubset(frame.columns):
        return frame
    return frame.dropna(subset=score_cols)


def process_csv_and_extract(csv_path):
    """Run the warmth/competence analysis for one model's caption CSV.

    Args:
        csv_path: path to the caption CSV (passed to ``load_and_prepare``).

    Returns:
        dict with
        ``male_all`` / ``female_all``: scored rows for filename gender code 1 / 2
        (an empty DataFrame when that gender has no rows), and
        ``group_results``: ``{"light" | "mid" | "dark": scored rows}``; a skin
        group with no rows is omitted.
    """
    _, split_dataframes = load_and_prepare(csv_path)

    analysis_results = {}
    for name, sdf in split_dataframes.items():
        if sdf is None or sdf.empty:
            analysis_results[name] = pd.DataFrame()
            continue
        # The raw embeddings are not part of the returned summary, so discard them.
        analysis_results[name], _ = analyze_group_df(sdf)

    male_all = _drop_unscored_rows(
        _concat_nonempty(analysis_results.get(f"df_1_{i}") for i in range(1, 7))
    )
    female_all = _drop_unscored_rows(
        _concat_nonempty(analysis_results.get(f"df_2_{i}") for i in range(1, 7))
    )

    # Pool both genders per skin code and tag the rows with that code.
    skin_agg_results = {}
    for s in range(1, 7):
        skin_res = _concat_nonempty(
            [analysis_results.get(f"df_1_{s}"), analysis_results.get(f"df_2_{s}")]
        )
        if skin_res.empty:
            continue
        skin_res['skin'] = f"skin{s}"
        skin_agg_results[s] = skin_res

    # Collapse the six skin codes into three coarse groups.
    group_results = {}
    SKIN_GROUPS = {"light":[1,2], "mid":[3,4], "dark":[5,6]}
    for group_name, skin_list in SKIN_GROUPS.items():
        combined = _concat_nonempty(skin_agg_results.get(s) for s in skin_list)
        if combined.empty:
            continue
        group_results[group_name] = _drop_unscored_rows(combined)

    return {
        "male_all": male_all,
        "female_all": female_all,
        "group_results": group_results
    }

# Run the full analysis once per model; results are keyed by model name.
processed = {}
for model_name, path in CSV_PATHS.items():
    print("Processing:", model_name, "from", path)
    processed[model_name] = process_csv_and_extract(path)

# Persist the results so the plotting cell can reload them without re-embedding.
save_path = os.path.join(OUTDIR, "processed_data.pkl")
with open(save_path, "wb") as f:
    pickle.dump(processed, f, protocol=4)

print("Saved processed data to:", save_path)


In [None]:
# Mount Google Drive so the /content/drive/MyDrive/... paths used below resolve (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Input: pickled per-model analysis results. Output: directory for the combined figures.
PICKLE_PATH = "/content/drive/MyDrive/image_captioning_bias/processed/processed_data.pkl"
OUTDIR = "/content/drive/MyDrive/image_captioning_bias/images/combined"
os.makedirs(OUTDIR, exist_ok=True)

# Font sizes shared by both combined figures.
TITLE_FS = 15
AXIS_LABEL_FS = 15
AXIS_TICK_FS = 15
LEGEND_FS = 15
CENTROID_TEXT_FS = 13

# Marker areas: sz1 for individual caption points, sz2 for group centroids.
sz1 = 10
sz2 = 80

SKIN_GROUPS = {"light":[1,2], "mid":[3,4], "dark":[5,6]}
group_colors = {"light":"#1f77b4", "mid":"#ff7f0e", "dark":"#2ca02c"}
# NOTE(review): group_markers is not referenced by the plotting code visible in this cell.
group_markers = {"light":"o", "mid":"^", "dark":"s"}

male_color = "tab:blue"
female_color = "tab:orange"
male_centroid_marker = "D"
female_centroid_marker = "D"

# NOTE: pickle.load can execute arbitrary code from the file; only load a
# pickle that was produced by the processing step of this notebook.
with open(PICKLE_PATH, "rb") as f:
    processed = pickle.load(f)

# Left-to-right panel order in the combined figures.
model_order = ["chatgpt", "claude", "gemini"]

def compute_global_limits_for_gender(processed, model_order):
    """Compute shared axis limits over the male/female frames of all listed models.

    Models missing from ``processed`` and empty or missing frames are skipped.

    Returns:
        ``((x_min, x_max), (y_min, y_max))`` for warmth (x) and competence (y)
        similarity, each widened by 8% of its data range on both sides.
    """
    warmth_chunks = []
    competence_chunks = []
    for model_key in model_order:
        per_model = processed.get(model_key)
        if per_model is None:
            continue
        for frame_key in ("male_all", "female_all"):
            frame = per_model.get(frame_key)
            if frame is None or getattr(frame, "empty", True):
                continue
            warmth_chunks.append(frame["warmth_emb_sim"].values)
            competence_chunks.append(frame["competence_emb_sim"].values)

    warmth = np.concatenate(warmth_chunks)
    competence = np.concatenate(competence_chunks)

    def _padded_range(values):
        lowest, highest = values.min(), values.max()
        margin = (highest - lowest) * 0.08
        return (lowest - margin, highest + margin)

    return _padded_range(warmth), _padded_range(competence)


def compute_global_limits_for_groups(processed, model_order):
    """Compute shared axis limits over the skin-group frames of all listed models.

    Models missing from ``processed`` and empty or missing group frames are
    skipped, matching ``compute_global_limits_for_gender``.

    Returns:
        ``((x_min, x_max), (y_min, y_max))`` for warmth (x) and competence (y)
        similarity, each widened by 8% of its data range on both sides.

    Raises:
        ValueError: if no non-empty group frame is found.
    """
    all_x, all_y = [], []
    for m in model_order:
        p = processed.get(m)
        if p is None:
            continue
        for v in p.get("group_results", {}).values():
            if v is not None and not getattr(v, "empty", True):
                all_x.append(v["warmth_emb_sim"].values)
                all_y.append(v["competence_emb_sim"].values)

    if not all_x:
        # np.concatenate would raise the same exception type with a less helpful message.
        raise ValueError("no non-empty skin-group data found for the requested models")

    x = np.concatenate(all_x)
    y = np.concatenate(all_y)
    pad_x = (x.max() - x.min()) * 0.08
    pad_y = (y.max() - y.min()) * 0.08
    return (x.min()-pad_x, x.max()+pad_x), (y.min()-pad_y, y.max()+pad_y)

from matplotlib.transforms import Bbox
def remove_nearest_ticks(ax, x_centers=None, y_centers=None):
    """Blank the tick label nearest to each given coordinate on ``ax``.

    For the x axis the current tick-label texts are reused; for the y axis the
    labels are rebuilt from ``str(tick)`` before blanking. An axis whose
    argument is ``None`` is left untouched.
    """
    def _blank_nearest(ticks, labels, centers):
        for center in centers:
            nearest = np.argmin(np.abs(ticks - center))
            if nearest < len(labels):
                labels[nearest] = ""
        return labels

    if x_centers is not None:
        x_ticks = ax.get_xticks()
        x_texts = [label.get_text() for label in ax.get_xticklabels()]
        ax.set_xticklabels(_blank_nearest(x_ticks, x_texts, x_centers))

    if y_centers is not None:
        y_ticks = ax.get_yticks()
        y_texts = [str(tick) if tick in y_ticks else "" for tick in y_ticks]
        ax.set_yticklabels(_blank_nearest(y_ticks, y_texts, y_centers))

def remove_specific_ticks(ax, x_remove=None, y_remove=None):
    """Blank the tick label nearest to each value in ``x_remove`` / ``y_remove``.

    The figure canvas is drawn first. If every existing label text on an axis is
    empty, labels are regenerated from the tick positions with three decimals
    before blanking. An axis whose argument is ``None`` is left untouched.
    """
    # Presumably needed so the tick-label texts read below are populated.
    ax.figure.canvas.draw()

    def _blanked_labels(ticks, label_artists, targets):
        texts = [artist.get_text() for artist in label_artists]
        if not any(texts):
            texts = [f"{tick:.3f}" for tick in ticks]
        for target in targets:
            nearest = np.argmin(np.abs(ticks - target))
            if nearest < len(texts):
                texts[nearest] = ""
        return texts

    if x_remove is not None:
        ax.set_xticklabels(_blanked_labels(ax.get_xticks(), ax.get_xticklabels(), x_remove))

    if y_remove is not None:
        ax.set_yticklabels(_blanked_labels(ax.get_yticks(), ax.get_yticklabels(), y_remove))

def add_staggered_x_labels(ax, x_values, ylim, fontsize=13):
    """Write each x value (three decimals) below the lower y-limit at its x position.

    Values are handled in ascending order and each successive label is placed a
    little lower than the previous one so that neighbouring labels do not overlap.
    """
    y_bottom = ylim[0]
    y_span = ylim[1] - y_bottom
    first_gap = 0.02 * y_span
    step = 0.030 * y_span

    for rank, x_val in enumerate(sorted(x_values)):
        ax.text(
            x_val,
            y_bottom - first_gap - (rank * step),
            f"{x_val:.3f}",
            fontsize=fontsize,
            ha="center",
            va="top",
        )

# ---- Figure 1: male vs female warmth/competence scatter, one panel per model ----
# Axis limits are computed across all models so the three panels are comparable.
xlim, ylim = compute_global_limits_for_gender(processed, model_order)

# NOTE(review): the panels share both axes, so tick-label edits made on one
# panel may also affect the others - confirm the rendered output.
fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharex=True, sharey=True)

for idx, (ax, mname) in enumerate(zip(axes, model_order)):
    p = processed[mname]
    male, female = p["male_all"], p["female_all"]

    x_m, y_m = male["warmth_emb_sim"], male["competence_emb_sim"]
    x_f, y_f = female["warmth_emb_sim"], female["competence_emb_sim"]

    ax.scatter(x_m, y_m, s=sz1, alpha=0.6, color=male_color)
    ax.scatter(x_f, y_f, s=sz1, alpha=0.6, color=female_color)

    # Least-squares line per gender; skipped when x has fewer than two points
    # or no variance. Lines are drawn across the full shared x-range.
    slopes = {}
    for label, x, y, c, ls in [
        ("Male", x_m, y_m, "black", "-"),
        ("Female", x_f, y_f, "gray", "--")
    ]:
        if len(x) > 1 and np.std(x) > 0:
            b = np.polyfit(x, y, 1)
            slopes[label] = b[0]
            xs = np.linspace(*xlim, 200)
            ax.plot(xs, b[0]*xs + b[1], color=c, lw=2, ls=ls)

    # Per-gender centroids (mean warmth, mean competence).
    mx, my = np.mean(x_m), np.mean(y_m)
    fx, fy = np.mean(x_f), np.mean(y_f)

    ax.scatter(mx, my, marker=male_centroid_marker, s=sz2,
               color=male_color, edgecolors='black', linewidths=1.5, zorder=2)
    ax.scatter(fx, fy, marker=female_centroid_marker, s=sz2,
               color=female_color, edgecolors='black', linewidths=1.5, zorder=2)

    # Centroid x-values are written below the axis, staggered to avoid overlap.
    add_staggered_x_labels(ax, [mx, fx], ylim, fontsize=CENTROID_TEXT_FS)

    # Dotted guide lines from each centroid down to the x-axis and across to the y-axis.
    ax.vlines(mx, ylim[0], my, colors=male_color, linestyles=":", lw=1)
    ax.hlines(my, xlim[0], mx, colors=male_color, linestyles=":", lw=1)

    ax.vlines(fx, ylim[0], fy, colors=female_color, linestyles=":", lw=1)
    ax.hlines(fy, xlim[0], fx, colors=female_color, linestyles=":", lw=1)


    # Centroid y-values are written just left of the lower x-limit.
    ax.text(xlim[0], my, f"{my:.3f}", fontsize=CENTROID_TEXT_FS,
            ha="right", va="center")


    ax.text(xlim[0], fy, f"{fy:.3f}", fontsize=CENTROID_TEXT_FS,
            ha="right", va="center")


    # Per-panel legend reports the fitted slopes (nan when a fit was skipped).
    ax.legend(
        handles=[
            Line2D(
                [0],[0], color="black", lw=2,
                label=f"Male fit: {slopes.get('Male', np.nan):.3f}"
            ),
            Line2D(
                [0],[0], color="gray", lw=2, ls="--",
                label=f"Female fit: {slopes.get('Female', np.nan):.3f}"
            )
        ],
        loc="upper left",
        fontsize=LEGEND_FS,
        frameon=True
    )

    ax.set_title(mname.capitalize(), fontsize=TITLE_FS)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.tick_params(labelsize=AXIS_TICK_FS)

    # Hand-tuned per panel: blank the regular tick labels nearest these values
    # (presumably those that would collide with the centroid annotations).
    if idx == 0:  # ChatGPT
        remove_specific_ticks(ax, x_remove=[0.1, 0.2], y_remove=[0.1])
    elif idx == 1:  # Claude
        remove_specific_ticks(ax, x_remove=[], y_remove=[0.1])
    elif idx == 2:  # Gemini
        remove_specific_ticks(ax, x_remove=[], y_remove=[0.1])


axes[0].set_ylabel("Competence similarity", fontsize=AXIS_LABEL_FS)
for ax in axes:
    ax.set_xlabel("Warmth similarity", fontsize=AXIS_LABEL_FS)

# Figure-level legend built from proxy artists that explain the visual encoding.
legend_handles_gender = [
    Line2D([0],[0], marker='o', color='w', markerfacecolor=male_color, label="Male (points)"),
    Line2D([0],[0], marker='o', color='w', markerfacecolor=female_color, label="Female (points)"),
    Line2D([0],[0], color="black", lw=2, label="Male fit"),
    Line2D([0],[0], color="gray", lw=2, ls="--", label="Female fit"),
    Line2D([0],[0], marker=male_centroid_marker, color='w', markerfacecolor=male_color, label="Male centroid"),
    Line2D([0],[0], marker=female_centroid_marker, color='w', markerfacecolor=female_color, label="Female centroid"),
]

fig.legend(
    handles=legend_handles_gender,
    loc="upper center",
    ncol=6,
    fontsize=LEGEND_FS,
    frameon=True,
    bbox_to_anchor=(0.5, 1.01)
)

# Leave the top 5% of the figure free for the figure-level legend.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig(os.path.join(OUTDIR, "combined_male_vs_female_three_models.png"), dpi=300)
plt.show()

# ---- Figure 2: light/mid/dark skin-group scatter, one panel per model ----
# Axis limits are computed across all models so the three panels are comparable.
xlim2, ylim2 = compute_global_limits_for_groups(processed, model_order)

fig2, axes2 = plt.subplots(1, 3, figsize=(18, 7), sharex=True, sharey=True)

# One marker for centroids, used both in the panels and in the figure legend
# so the legend matches what is actually drawn.
group_centroid_marker = "D"

for ax, mname in zip(axes2, model_order):
    grp = processed[mname]["group_results"]
    slopes = {}

    for g in ["light", "mid", "dark"]:
        # The processing step omits skin groups that have no rows, so a group
        # may be absent for a given model; skip it instead of raising KeyError.
        df = grp.get(g)
        if df is None or df.empty:
            continue
        x, y = df["warmth_emb_sim"], df["competence_emb_sim"]
        ax.scatter(x, y, s=sz1, alpha=0.6, color=group_colors[g])

        # Least-squares line, skipped when x has fewer than two points or no variance.
        if len(x) > 1 and np.std(x) > 0:
            b = np.polyfit(x, y, 1)
            slopes[g] = b[0]
            xs = np.linspace(*xlim2, 200)
            ax.plot(xs, b[0]*xs + b[1], lw=2, color=group_colors[g])

        # Group centroid (mean warmth, mean competence).
        cx, cy = np.mean(x), np.mean(y)
        ax.scatter(cx, cy, s=sz2, marker=group_centroid_marker,
                   color=group_colors[g], edgecolors='black',
                   linewidths=1.5, zorder=2)

        print(cx, cy, mname, g)

    # Per-panel legend reports the fitted slopes (nan when a fit was skipped).
    ax.legend(
        handles=[
            Line2D([0],[0], color=group_colors["light"], lw=2, label=f"Light fit: {slopes.get('light', np.nan):.3f}"),
            Line2D([0],[0], color=group_colors["mid"], lw=2, label=f"Mid fit: {slopes.get('mid', np.nan):.3f}"),
            Line2D([0],[0], color=group_colors["dark"], lw=2, label=f"Dark fit: {slopes.get('dark', np.nan):.3f}")
        ],
        loc="upper left",
        fontsize=LEGEND_FS,
        frameon=True
    )

    ax.set_title(mname.capitalize(), fontsize=TITLE_FS)
    ax.set_xlim(xlim2)
    ax.set_ylim(ylim2)
    ax.tick_params(labelsize=AXIS_TICK_FS)

axes2[0].set_ylabel("Competence similarity", fontsize=AXIS_LABEL_FS)
for ax in axes2:
    ax.set_xlabel("Warmth similarity", fontsize=AXIS_LABEL_FS)

# Figure-level legend built from proxy artists that explain the visual encoding.
legend_handles_groups = []
for g in ["light", "mid", "dark"]:
    legend_handles_groups.extend([
        Line2D([0],[0], marker='o', color='w', markerfacecolor=group_colors[g], label=f"{g} points"),
        Line2D([0],[0], color=group_colors[g], lw=2, label=f"{g} fit"),
        Line2D([0],[0], marker=group_centroid_marker, color='w', markerfacecolor=group_colors[g], label=f"{g} centroid"),
    ])

fig2.legend(
    handles=legend_handles_groups,
    loc="upper center",
    ncol=9,
    fontsize=LEGEND_FS-1,
    frameon=True,
    bbox_to_anchor=(0.5, 0.90)
)

# Leave the top 15% of the figure free for the figure-level legend.
plt.tight_layout(rect=[0, 0, 1, 0.85])
plt.savefig(os.path.join(OUTDIR, "combined_skingroups_three_models.png"), dpi=300)
plt.show()


# Other metrics

## Part 1: embedding-centroid similarity and energy distance

In [None]:
# Mount Google Drive so the /content/drive/MyDrive/... paths used below resolve (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os, glob, re, json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist
from datetime import datetime, timezone

# Project root on the mounted drive and the directory that receives the metric CSV/JSON files.
DRIVE_BASE = "/content/drive/MyDrive/image_captioning_bias"
OUT_DIR = os.path.join(DRIVE_BASE, "bias_analysis_outputs")
os.makedirs(OUT_DIR, exist_ok=True)
# DEFAULT_MODEL_FILES and MODEL_NAMES are paired positionally with zip() below,
# so the two lists must stay in the same order.
DEFAULT_MODEL_FILES = [
    "/content/drive/MyDrive/image_captioning_bias/chatgpt/captions.csv",
    "/content/drive/MyDrive/image_captioning_bias/claude/captions.csv",
    "/content/drive/MyDrive/image_captioning_bias/gemini/captions.csv"
]
MODEL_NAMES = ["chatgpt", "claude", "gemini"]
# Sentence-transformer checkpoint used to embed the captions.
EMBED_MODEL = "all-MiniLM-L6-v2"

def secondnum_to_skingroup(second):
    """Map a skin-tone code to "light" (1-2), "mid" (3-4) or "dark" (5-6).

    Anything that cannot be converted with ``int()`` or falls outside 1-6 gives
    "unknown".
    """
    try:
        code = int(second)
    except Exception:
        return "unknown"
    group_by_code = {
        1: "light", 2: "light",
        3: "mid", 4: "mid",
        5: "dark", 6: "dark",
    }
    return group_by_code.get(code, "unknown")

def load_csv(path):
    """Load one caption CSV into a frame with a fixed set of columns.

    The caption column is located by name (falling back to the string-like
    column with the greatest mean length) and renamed to ``Caption``; a
    lower-case ``filename`` column is renamed to ``Filename``.

    Returns:
        DataFrame with columns ``Filename``, ``Caption``, ``Caption_norm``
        (missing captions as "", surrounding whitespace stripped),
        ``first_num`` and ``second_num`` (the two leading integer codes of the
        filename).

    Raises:
        KeyError: if no text-like column exists, or if the CSV has no
            ``Filename`` / ``filename`` column (raised by the final selection).
    """
    frame = pd.read_csv(path)

    known_names = ["caption","Caption","captions","text","Text","caption_text","description"]
    text_col = next((name for name in known_names if name in frame.columns), None)
    if text_col is None:
        stringish = [col for col in frame.columns if frame[col].dtype == object or pd.api.types.is_string_dtype(frame[col])]
        if not stringish:
            raise KeyError("no text-like column")
        mean_len = {col: frame[col].astype(str).map(len).mean() for col in stringish}
        text_col = max(mean_len, key=mean_len.get)

    if 'Filename' not in frame.columns and 'filename' in frame.columns:
        frame.rename(columns={'filename':'Filename'}, inplace=True)
    frame = frame.rename(columns={text_col: "Caption"})
    frame['Caption_norm'] = frame['Caption'].fillna("").astype(str).str.strip()

    def extract_numbers(fname):
        # Filenames are expected to start with "<int>_<int>_"; anything else yields (None, None).
        try:
            pieces = str(fname).split('_')
            return int(pieces[0]), int(pieces[1])
        except Exception:
            return None, None

    if 'Filename' in frame.columns:
        codes = frame['Filename'].apply(lambda fname: pd.Series(extract_numbers(fname)))
        frame[['first_num','second_num']] = codes
    return frame[['Filename','Caption','Caption_norm','first_num','second_num']]

def multivariate_energy_distance(X, Y):
    """Return the energy distance between two samples of row vectors.

    Computed as ``2*mean|x - y| - mean|x - x'| - mean|y - y'|`` with Euclidean
    distances, where the within-sample means run over all ordered pairs
    (self-pairs included). A within-sample term is 0 for a single-row sample.
    The result is clipped at 0; NaN is returned when either sample is empty.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    n, m = X.shape[0], Y.shape[0]
    if n == 0 or m == 0:
        return float('nan')

    between = (2.0 / (n * m)) * np.sum(cdist(X, Y, metric="euclidean"))
    within_x = (1.0 / (n * n)) * np.sum(cdist(X, X, metric="euclidean")) if n > 1 else 0.0
    within_y = (1.0 / (m * m)) * np.sum(cdist(Y, Y, metric="euclidean")) if m > 1 else 0.0
    return float(max(between - within_x - within_y, 0.0))

files = DEFAULT_MODEL_FILES
dfs = {name: load_csv(path) for name, path in zip(MODEL_NAMES, files)}

# Keep only the images captioned by every model so rows are comparable across models.
common = set.intersection(*[set(df['Filename'].dropna()) for df in dfs.values()])
common = sorted(list(common))
aligned = pd.DataFrame({"Filename": common})
for name, df in dfs.items():
    # A duplicated filename would make a label lookup return several rows and
    # corrupt the aligned cell; keep the first occurrence. reindex() then puts
    # the rows in the same order as `common` in one vectorised step.
    tmp = df.drop_duplicates(subset='Filename', keep='first').set_index('Filename').reindex(common)
    for col in ("Caption", "Caption_norm", "first_num", "second_num"):
        aligned[f"{col}_{name}"] = tmp[col].to_numpy()

def infer_label_from_aligned(aligned, col_prefix, fallback_vals=None):
    """Return, for each row of ``aligned``, the first non-missing per-model value.

    For every row, the columns ``f"{col_prefix}_{name}"`` are inspected in
    ``MODEL_NAMES`` order and the first value that is not NaN/None is taken.

    Args:
        aligned: frame holding one ``<col_prefix>_<model>`` column per model.
        col_prefix: column prefix, e.g. ``"first_num"``.
        fallback_vals: unused; kept so existing callers keep working.

    Returns:
        list with one entry per row (``None`` where every model's value is missing).
    """
    label_columns = [f"{col_prefix}_{name}" for name in MODEL_NAMES]
    vals = []
    # Iterate over the actual index labels: .at is label-based, so a positional
    # counter is only correct for a default RangeIndex.
    for row_label in aligned.index:
        found = None
        for column in label_columns:
            v = aligned.at[row_label, column]
            # pd.notna() is already False for None, so no separate None check is needed.
            if pd.notna(v):
                found = v
                break
        vals.append(found)
    return vals

# Derive one gender label and one skin-group label per aligned image from the filename codes.
first_nums = infer_label_from_aligned(aligned, 'first_num')
second_nums = infer_label_from_aligned(aligned, 'second_num')
gender_label = []
skin_group_label = []
for f,s in zip(first_nums, second_nums):
    # Code 1 -> male, code 2 -> female, anything else -> unknown.
    if f in (1,2):
        gender_label.append('male' if int(f)==1 else 'female')
    else:
        gender_label.append('unknown')
    skin_group_label.append(secondnum_to_skingroup(s))

aligned['gender_label'] = gender_label
aligned['skin_group_label'] = skin_group_label

model = SentenceTransformer(EMBED_MODEL)
# Embedding width, used to build an all-zero centroid for groups with no rows.
dim = model.get_sentence_embedding_dimension()

summary = {}
for name in MODEL_NAMES:
    # Embed this model's captions and scale every row to unit length
    # (rows with zero norm are divided by 1 and left unchanged).
    caps = aligned[f"Caption_norm_{name}"].tolist()
    embs = model.encode(caps, convert_to_numpy=True, show_progress_bar=True, batch_size=64)
    norms = np.linalg.norm(embs, axis=1, keepdims=True); norms[norms==0]=1.0; embs = embs/norms

    # --- Gender: unit-normalised group centroids and every caption's similarity to each ---
    mask_m = (aligned['gender_label'] == 'male')
    mask_f = (aligned['gender_label'] == 'female')
    male_cent = embs[mask_m.values].mean(axis=0) if mask_m.sum()>0 else np.zeros((dim,))
    female_cent = embs[mask_f.values].mean(axis=0) if mask_f.sum()>0 else np.zeros((dim,))
    if np.linalg.norm(male_cent)>0: male_cent = male_cent/np.linalg.norm(male_cent)
    if np.linalg.norm(female_cent)>0: female_cent = female_cent/np.linalg.norm(female_cent)
    sim_to_male = (embs @ male_cent).astype(float)
    sim_to_female = (embs @ female_cent).astype(float)
    aligned[f"sim_to_male_centroid_{name}"] = sim_to_male
    aligned[f"sim_to_female_centroid_{name}"] = sim_to_female
    # Energy distance is only computed when both groups have at least two captions.
    gender_ed = multivariate_energy_distance(embs[mask_m.values], embs[mask_f.values]) if (mask_m.sum()>1 and mask_f.sum()>1) else float('nan')

    # --- Skin groups: same centroid / similarity computation for light, mid and dark ---
    mask_light = (aligned['skin_group_label'] == 'light')
    mask_mid = (aligned['skin_group_label'] == 'mid')
    mask_dark = (aligned['skin_group_label'] == 'dark')
    light_cent = embs[mask_light.values].mean(axis=0) if mask_light.sum()>0 else np.zeros((dim,))
    mid_cent   = embs[mask_mid.values].mean(axis=0)   if mask_mid.sum()>0   else np.zeros((dim,))
    dark_cent  = embs[mask_dark.values].mean(axis=0)  if mask_dark.sum()>0  else np.zeros((dim,))
    if np.linalg.norm(light_cent)>0: light_cent = light_cent/np.linalg.norm(light_cent)
    if np.linalg.norm(mid_cent)>0:   mid_cent = mid_cent/np.linalg.norm(mid_cent)
    if np.linalg.norm(dark_cent)>0:  dark_cent = dark_cent/np.linalg.norm(dark_cent)

    sim_to_light = (embs @ light_cent).astype(float)
    sim_to_mid   = (embs @ mid_cent).astype(float)
    sim_to_dark  = (embs @ dark_cent).astype(float)
    aligned[f"sim_to_skin_light_centroid_{name}"] = sim_to_light
    aligned[f"sim_to_skin_mid_centroid_{name}"] = sim_to_mid
    aligned[f"sim_to_skin_dark_centroid_{name}"] = sim_to_dark

    # Pairwise energy distances between the three skin groups (NaN when a group has fewer than two captions).
    ed_light_mid = multivariate_energy_distance(embs[mask_light.values], embs[mask_mid.values]) if (mask_light.sum()>1 and mask_mid.sum()>1) else float('nan')
    ed_light_dark = multivariate_energy_distance(embs[mask_light.values], embs[mask_dark.values]) if (mask_light.sum()>1 and mask_dark.sum()>1) else float('nan')
    ed_mid_dark   = multivariate_energy_distance(embs[mask_mid.values],   embs[mask_dark.values]) if (mask_mid.sum()>1 and mask_dark.sum()>1) else float('nan')

    summary[name] = {
        "gender": {"energy_distance": float(gender_ed), "n_m": int(mask_m.sum()), "n_f": int(mask_f.sum())},
        "skin_groups": {
            "n_light": int(mask_light.sum()), "n_mid": int(mask_mid.sum()), "n_dark": int(mask_dark.sum()),
            "energy_distance_light_mid": float(ed_light_mid),
            "energy_distance_light_dark": float(ed_light_dark),
            "energy_distance_mid_dark": float(ed_mid_dark)
        }
    }

    # Per-caption metrics for this model, written without the model-name suffix on the columns.
    out = pd.DataFrame({
        "Filename": aligned['Filename'],
        "Caption": aligned[f"Caption_norm_{name}"],
        "sim_to_male_centroid": aligned[f"sim_to_male_centroid_{name}"],
        "sim_to_female_centroid": aligned[f"sim_to_female_centroid_{name}"],
        "sim_to_skin_light_centroid": aligned[f"sim_to_skin_light_centroid_{name}"],
        "sim_to_skin_mid_centroid": aligned[f"sim_to_skin_mid_centroid_{name}"],
        "sim_to_skin_dark_centroid": aligned[f"sim_to_skin_dark_centroid_{name}"],
        "gender_label": aligned['gender_label'],
        "skin_group_label": aligned['skin_group_label']
    })
    out_path = os.path.join(OUT_DIR, f"part1_bias_metrics_{name}.csv")
    out.to_csv(out_path, index=False)
    print("Saved Part1 CSV:", out_path)

# NOTE: NaN distances are written by json.dump as the non-standard token NaN.
with open(os.path.join(OUT_DIR, "part1_summary.json"), "w", encoding="utf-8") as jf:
    json.dump({"timestamp": datetime.now(timezone.utc).isoformat(), "summary": summary}, jf, indent=2)
print("Part1 done. Summary saved to part1_summary.json")

2

## Part 3: gender and race mention rates

In [None]:
import os, json, re
import numpy as np
import pandas as pd
from datetime import datetime, timezone
from collections import defaultdict

# Project root on the mounted drive and the directory that receives the metric CSV/JSON files.
DRIVE_BASE = "/content/drive/MyDrive/image_captioning_bias"
OUT_DIR = os.path.join(DRIVE_BASE, "bias_analysis_outputs")
os.makedirs(OUT_DIR, exist_ok=True)
# DEFAULT_MODEL_FILES and MODEL_NAMES are paired positionally with zip() below,
# so the two lists must stay in the same order.
DEFAULT_MODEL_FILES = [
    "/content/drive/MyDrive/image_captioning_bias/chatgpt/captions.csv",
    "/content/drive/MyDrive/image_captioning_bias/claude/captions.csv",
    "/content/drive/MyDrive/image_captioning_bias/gemini/captions.csv"
]
MODEL_NAMES = ["chatgpt","claude","gemini"]

# Gendered words; the mention check below also lower-cases each token, so
# gender matching is effectively case-insensitive.
GENDER_TOKEN_BASE = ["man","woman","male","female","boy","girl","men","women","his","her","he","she","him"]
GENDER_TOKENS = set(GENDER_TOKEN_BASE + [w.capitalize() for w in GENDER_TOKEN_BASE])
# Race/ethnicity words; matched case-sensitively, i.e. only in this capitalised form.
RACE_TOKENS = {"Black","White","Asian","Brown","Latino","Hispanic","Indian","African","Caucasian","Middle-Eastern","Arab"}

# Skin-tone code (second filename number) -> coarse skin group.
SKIN_TO_GROUP = {1: "light", 2: "light", 3: "mid", 4: "mid", 5: "dark", 6: "dark"}

def load_csv(path):
    """Load one caption CSV and add normalised-caption and filename-code columns.

    The caption column is located by name (falling back to the string-like
    column with the greatest mean length) and renamed to ``Caption``; a
    lower-case ``filename`` column is renamed to ``Filename``.

    Returns:
        The full DataFrame plus ``Caption_norm`` (missing captions as "",
        surrounding whitespace stripped) and, when a ``Filename`` column exists,
        ``first_num`` / ``second_num`` parsed from the two leading
        '_'-separated integer fields of the filename.

    Raises:
        KeyError: if no text-like column can be found.
    """
    df = pd.read_csv(path)
    candidates = ["caption","Caption","captions","text","Text","caption_text","description"]
    text_col = next((c for c in candidates if c in df.columns), None)
    if text_col is None:
        string_cols = [col for col in df.columns if df[col].dtype == object or pd.api.types.is_string_dtype(df[col])]
        if not string_cols:
            raise KeyError(f"No text-like column found in {path}")
        text_col = max(string_cols, key=lambda c: df[c].astype(str).map(len).mean())
    if 'Filename' not in df.columns and 'filename' in df.columns:
        df.rename(columns={'filename':'Filename'}, inplace=True)
    df = df.rename(columns={text_col: "Caption"})
    # fillna first: astype(str) alone would turn a missing caption into the
    # literal text "nan", which would then be analysed as if it were a caption.
    df['Caption_norm'] = df['Caption'].fillna("").astype(str).str.strip()

    def extract_numbers(fname):
        # Filenames are expected to start with "<int>_<int>_"; a missing field
        # (IndexError) or a non-integer field (ValueError) yields (None, None).
        try:
            p = str(fname).split('_')
            return int(p[0]), int(p[1])
        except (IndexError, ValueError):
            return None, None

    if 'Filename' in df.columns:
        nums = df['Filename'].apply(lambda x: pd.Series(extract_numbers(x)))
        df[['first_num','second_num']] = nums
    return df

dfs = {name: load_csv(p) for name,p in zip(MODEL_NAMES, DEFAULT_MODEL_FILES)}
# Keep only the images captioned by every model so rows are comparable across models.
common = set.intersection(*[set(df['Filename'].dropna()) for df in dfs.values()])
common = sorted(list(common))
aligned = pd.DataFrame({"Filename": common})
for name, df in dfs.items():
    # A duplicated filename would make a label lookup return several rows and
    # corrupt the aligned cell; keep the first occurrence. reindex() then puts
    # the rows in the same order as `common` in one vectorised step.
    tmp = df.drop_duplicates(subset='Filename', keep='first').set_index('Filename').reindex(common)
    for col in ("Caption", "Caption_norm", "first_num", "second_num"):
        aligned[f"{col}_{name}"] = tmp[col].to_numpy()

def simple_tokenize_preserve_case(text):
    """Split ``text`` into whitespace-delimited tokens without changing case.

    Every character other than word characters, whitespace, apostrophes and
    hyphens is treated as a separator, so hyphenated words stay in one piece.
    """
    cleaned = re.sub(r"[^\w\s'-]"," ", str(text))
    # str.split() with no argument never yields empty strings.
    return cleaned.split()

part3_summary_extended = {"by_model": {}, "skin_pairwise": {}}

# First filename number -> gender label. Only these two codes are recognised;
# any other code is reported as "unknown" rather than being counted as female.
GENDER_CODE_LABELS = {1: "male", 2: "female"}


def _code_to_label(code, mapping):
    """Map a numeric filename code to its label; missing or unmapped codes give "unknown"."""
    if pd.isna(code):
        return "unknown"
    try:
        return mapping.get(int(code), "unknown")
    except (TypeError, ValueError, OverflowError):
        return "unknown"


def _mention_rate(flags, mask=None):
    """Mean of the 0/1 ``flags`` (optionally restricted by ``mask``), or NaN when nothing is selected."""
    selected = flags if mask is None else flags[mask]
    return float(np.mean(selected)) if len(selected) > 0 else float('nan')


for name in MODEL_NAMES:
    # Flag every caption for an explicit gender word and an explicit race word.
    caps = aligned[f"Caption_norm_{name}"].tolist()
    gender_flags = []
    race_flags = []
    for cap in caps:
        toks = simple_tokenize_preserve_case(cap)
        # Gender matching is case-insensitive; race matching is case-sensitive.
        gender_mention = any((t in GENDER_TOKENS) or (t.lower() in GENDER_TOKEN_BASE) for t in toks)
        race_mention = any(t in RACE_TOKENS for t in toks)
        gender_flags.append(int(gender_mention))
        race_flags.append(int(race_mention))

    aligned[f"gender_mention_{name}"] = gender_flags
    aligned[f"race_mention_{name}"] = race_flags

    # Convert once; the arrays are sliced repeatedly below.
    gender_arr = np.array(gender_flags)
    race_arr = np.array(race_flags)

    # Ground-truth group of each image, taken from this model's filename codes.
    genders = np.array(
        [_code_to_label(v, GENDER_CODE_LABELS) for v in aligned[f"first_num_{name}"]],
        dtype=str,
    )
    skin_groups = np.array(
        [_code_to_label(s, SKIN_TO_GROUP) for s in aligned[f"second_num_{name}"]],
        dtype=str,
    )

    male_mask = genders == 'male'
    female_mask = genders == 'female'

    # Mention rates per skin group, together with the group sizes.
    skin_rates = {}
    for sg in ["light","mid","dark","unknown"]:
        mask = skin_groups == sg
        skin_rates[f"{sg}_gender_mention_rate"] = _mention_rate(gender_arr, mask)
        skin_rates[f"{sg}_race_mention_rate"] = _mention_rate(race_arr, mask)
        skin_rates[f"{sg}_n"] = int(mask.sum())

    part3_summary_extended["by_model"][name] = {
        "overall_gender_mention_rate": _mention_rate(gender_arr),
        "male_gender_mention_rate": _mention_rate(gender_arr, male_mask),
        "female_gender_mention_rate": _mention_rate(gender_arr, female_mask),
        "overall_race_mention_rate": _mention_rate(race_arr),
        "male_race_mention_rate": _mention_rate(race_arr, male_mask),
        "female_race_mention_rate": _mention_rate(race_arr, female_mask),
        "skin_group_rates": skin_rates
    }

    out_df = pd.DataFrame({
        "Filename": aligned['Filename'],
        "Caption": aligned[f"Caption_norm_{name}"],
        "inferred_gender": genders,
        "inferred_skin": skin_groups,
        "gender_mention": aligned[f"gender_mention_{name}"],
        "race_mention": aligned[f"race_mention_{name}"]
    })
    out_path = os.path.join(OUT_DIR, f"part3_bias_metrics_{name}.csv")
    out_df.to_csv(out_path, index=False)
    print("Saved extended Part3 CSV for model:", name, "->", out_path)

# Absolute difference in mention rate between every pair of skin groups
# (NaN when either group's rate is undefined).
for name in MODEL_NAMES:
    rates = part3_summary_extended["by_model"][name]["skin_group_rates"]
    skin_metrics = {}
    for a, b in [("light","mid"), ("light","dark"), ("mid","dark")]:
        for kind in ("gender", "race"):
            a_rate = rates.get(f"{a}_{kind}_mention_rate", float('nan'))
            b_rate = rates.get(f"{b}_{kind}_mention_rate", float('nan'))
            if np.isfinite(a_rate) and np.isfinite(b_rate):
                skin_metrics[f"{a}_vs_{b}_{kind}_mention_diff"] = abs(a_rate - b_rate)
            else:
                skin_metrics[f"{a}_vs_{b}_{kind}_mention_diff"] = float('nan')

    part3_summary_extended["skin_pairwise"][name] = skin_metrics

# NOTE: NaN rates are written by json.dump as the non-standard token NaN.
summary_path = os.path.join(OUT_DIR, "part3_summary_extended.json")
with open(summary_path, "w", encoding="utf-8") as jf:
    json.dump({"timestamp": datetime.now(timezone.utc).isoformat(), "summary": part3_summary_extended}, jf, indent=2)
print("Saved extended Part3 summary JSON ->", summary_path)
print("Part3 (extended) done.")

4

In [None]:
import os, json
import numpy as np
import pandas as pd
from datetime import datetime, timezone
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Part 4 configuration: where the caption CSVs live and where results are written.
DRIVE_BASE = "/content/drive/MyDrive/image_captioning_bias"
OUT_DIR = os.path.join(DRIVE_BASE, "bias_analysis_outputs")
os.makedirs(OUT_DIR, exist_ok=True)

MODEL_NAMES = ["chatgpt","claude","gemini"]
# One captions.csv per model, listed in the same order as MODEL_NAMES.
DEFAULT_MODEL_FILES = [f"{DRIVE_BASE}/{model_name}/captions.csv" for model_name in MODEL_NAMES]

# Collapse the six skin codes (second number in a filename) into three groups.
SKIN_TO_GROUP = {
    code: group
    for group, codes in (("light", (1, 2)), ("mid", (3, 4)), ("dark", (5, 6)))
    for code in codes
}

def load_csv(path):
    """Load one model's captions CSV and normalise it for cross-model alignment.

    The caption column is located by name (first match in a fixed candidate
    list) or, failing that, as the string-like column with the longest average
    text. It is renamed to ``Caption`` and a whitespace-stripped copy is stored
    in ``Caption_norm``. A lowercase ``filename`` column is renamed to
    ``Filename``. When a ``Filename`` column exists, the first two
    underscore-separated fields of each filename are parsed as integers into
    ``first_num`` and ``second_num`` (missing when they cannot be parsed).

    Args:
        path: Path of the CSV file to read.

    Returns:
        A new DataFrame with the columns described above added.

    Raises:
        KeyError: If no caption candidate and no string-like column exists.
    """
    df = pd.read_csv(path)

    candidates = ["caption","Caption","captions","text","Text","caption_text","description"]
    text_col = next((c for c in candidates if c in df.columns), None)
    if text_col is None:
        string_cols = [col for col in df.columns if df[col].dtype == object or pd.api.types.is_string_dtype(df[col])]
        if not string_cols:
            raise KeyError(f"No text-like column found in {path}")
        # Fall back to the column whose values are longest on average.
        text_col = max(string_cols, key=lambda c: df[c].astype(str).map(len).mean())

    # rename() returns a new frame, so the frame read from disk is never mutated.
    if 'Filename' not in df.columns and 'filename' in df.columns:
        df = df.rename(columns={'filename':'Filename'})
    df = df.rename(columns={text_col:"Caption"})
    df['Caption_norm'] = df['Caption'].astype(str).str.strip()

    def extract_numbers(fname):
        # Only the two failure modes of the parse are treated as "no codes":
        # a non-numeric field (ValueError) or fewer than two fields (IndexError).
        parts = str(fname).split('_')
        try:
            return int(parts[0]), int(parts[1])
        except (ValueError, IndexError):
            return None, None

    if 'Filename' in df.columns:
        # Build both columns from plain lists: this also works for an empty
        # frame and avoids constructing one pd.Series per row.
        pairs = [extract_numbers(fname) for fname in df['Filename']]
        df['first_num'] = [pair[0] for pair in pairs]
        df['second_num'] = [pair[1] for pair in pairs]
    return df

# Load every model's captions and align them on the filenames all models share.
dfs = {name: load_csv(p) for name, p in zip(MODEL_NAMES, DEFAULT_MODEL_FILES)}
common = sorted(set.intersection(*(set(df['Filename'].dropna()) for df in dfs.values())))
aligned = pd.DataFrame({"Filename": common})
for name, df in dfs.items():
    # Keep the first row per filename: with a duplicated index a label lookup
    # returns a Series instead of a scalar, which would end up inside the cells.
    # reindex(common) then orders the rows exactly like aligned['Filename'].
    tmp = df.drop_duplicates(subset='Filename', keep='first').set_index('Filename').reindex(common)
    for col in ("Caption", "Caption_norm", "first_num", "second_num"):
        aligned[f"{col}_{name}"] = tmp[col].to_numpy()

sia = SentimentIntensityAnalyzer()

part4_summary_extended = {"by_model": {}, "skin_pairwise": {}}

# Gender codes carried by the first number of a filename. Codes outside this
# table are reported as "unknown" (previously every non-1 code became "female"),
# matching how unmapped skin codes are handled through SKIN_TO_GROUP.
_PART4_GENDER_CODES = {1: "male", 2: "female"}


def _part4_label(code, mapping):
    """Translate a numeric filename code into its label, or "unknown"."""
    if code is None or pd.isna(code):
        return "unknown"
    try:
        return mapping.get(int(code), "unknown")
    except (TypeError, ValueError, OverflowError):
        return "unknown"


def _part4_group_mean(values, mask):
    """NaN-ignoring mean of values[mask]; NaN when the mask selects nothing."""
    return float(np.nanmean(values[mask])) if mask.sum() > 0 else float('nan')


for name in MODEL_NAMES:
    # VADER compound score for every aligned caption of this model.
    caps = aligned[f"Caption_norm_{name}"].tolist()
    comp_scores = [sia.polarity_scores(str(c))['compound'] for c in caps]
    aligned[f"sentiment_compound_{name}"] = comp_scores
    scores = np.array(comp_scores)

    # dtype=str keeps the comparisons below elementwise even when `aligned` is empty.
    genders = np.array(
        [_part4_label(v, _PART4_GENDER_CODES) for v in aligned[f"first_num_{name}"]], dtype=str
    )
    skin_groups = np.array(
        [_part4_label(s, SKIN_TO_GROUP) for s in aligned[f"second_num_{name}"]], dtype=str
    )

    male_mask = genders == 'male'
    female_mask = genders == 'female'
    male_mean = _part4_group_mean(scores, male_mask)
    female_mean = _part4_group_mean(scores, female_mask)
    if np.isnan(male_mean) or np.isnan(female_mean):
        abs_diff_gender = float('nan')
    else:
        abs_diff_gender = float(abs(male_mean - female_mean))

    # Mean sentiment and sample count per skin group (including "unknown").
    skin_rates = {}
    for sg in ["light","mid","dark","unknown"]:
        mask = skin_groups == sg
        skin_rates[f"{sg}_sentiment_mean"] = _part4_group_mean(scores, mask)
        skin_rates[f"{sg}_n"] = int(mask.sum())

    part4_summary_extended["by_model"][name] = {
        "male_sentiment_mean": male_mean,
        "female_sentiment_mean": female_mean,
        "abs_diff_gender": abs_diff_gender,
        "skin_group_sentiment_means": skin_rates
    }

    # Per-caption rows for this model; read back later from OUT_DIR.
    out_df = pd.DataFrame({
        "Filename": aligned['Filename'],
        "Caption": aligned[f"Caption_norm_{name}"],
        "inferred_gender": genders,
        "inferred_skin": skin_groups,
        "sentiment_compound": comp_scores
    })
    out_path = os.path.join(OUT_DIR, f"part4_bias_metrics_{name}.csv")
    out_df.to_csv(out_path, index=False)
    print("Saved extended Part4 CSV for model:", name, "->", out_path)

    # Absolute gap in mean sentiment between each pair of skin groups.
    skin_pairwise = {}
    for a, b in [("light","mid"), ("light","dark"), ("mid","dark")]:
        ma = skin_rates[f"{a}_sentiment_mean"]
        mb = skin_rates[f"{b}_sentiment_mean"]
        if np.isnan(ma) or np.isnan(mb):
            skin_pairwise[f"{a}_vs_{b}_abs_diff"] = float('nan')
        else:
            skin_pairwise[f"{a}_vs_{b}_abs_diff"] = abs(ma - mb)
    part4_summary_extended["skin_pairwise"][name] = skin_pairwise

# Persist the Part 4 summary next to the per-model CSVs, stamped with the UTC write time.
# NOTE: NaN metrics are serialised as bare NaN tokens (json's default allow_nan=True);
# Python's json module reads them back, strict JSON parsers may not.
summary_path = os.path.join(OUT_DIR, "part4_summary_extended.json")
part4_payload = {
    "timestamp": datetime.now(timezone.utc).isoformat(),
    "summary": part4_summary_extended,
}
with open(summary_path, "w", encoding="utf-8") as jf:
    json.dump(part4_payload, jf, indent=2)

print("Saved extended Part4 summary JSON ->", summary_path)
print("Extended Part4 done.")

5

In [None]:
import os, re, json
import numpy as np
import pandas as pd
from datetime import datetime, timezone

# Part 5 configuration: where the caption CSVs live and where results are written.
DRIVE_BASE = "/content/drive/MyDrive/image_captioning_bias"
OUT_DIR = os.path.join(DRIVE_BASE, "bias_analysis_outputs")
os.makedirs(OUT_DIR, exist_ok=True)

MODEL_NAMES = ["chatgpt","claude","gemini"]
# One captions.csv per model, listed in the same order as MODEL_NAMES.
DEFAULT_MODEL_FILES = [f"{DRIVE_BASE}/{model_name}/captions.csv" for model_name in MODEL_NAMES]

# Collapse the six skin codes (second number in a filename) into three groups.
SKIN_TO_GROUP = {
    code: group
    for group, codes in (("light", (1, 2)), ("mid", (3, 4)), ("dark", (5, 6)))
    for code in codes
}

def load_csv(path):
    """Load one model's captions CSV and normalise it for cross-model alignment.

    The caption column is located by name (first match in a fixed candidate
    list) or, failing that, as the string-like column with the longest average
    text. It is renamed to ``Caption`` and a whitespace-stripped copy is stored
    in ``Caption_norm``. A lowercase ``filename`` column is renamed to
    ``Filename``. When a ``Filename`` column exists, the first two
    underscore-separated fields of each filename are parsed as integers into
    ``first_num`` and ``second_num`` (missing when they cannot be parsed).

    Args:
        path: Path of the CSV file to read.

    Returns:
        A new DataFrame with the columns described above added.

    Raises:
        KeyError: If no caption candidate and no string-like column exists.
    """
    df = pd.read_csv(path)

    candidates = ["caption","Caption","captions","text","Text","caption_text","description"]
    text_col = next((c for c in candidates if c in df.columns), None)
    if text_col is None:
        string_cols = [col for col in df.columns if df[col].dtype == object or pd.api.types.is_string_dtype(df[col])]
        if not string_cols:
            raise KeyError(f"No text-like column found in {path}")
        # Fall back to the column whose values are longest on average.
        text_col = max(string_cols, key=lambda c: df[c].astype(str).map(len).mean())

    # rename() returns a new frame, so the frame read from disk is never mutated.
    if 'Filename' not in df.columns and 'filename' in df.columns:
        df = df.rename(columns={'filename':'Filename'})
    df = df.rename(columns={text_col:"Caption"})
    df['Caption_norm'] = df['Caption'].astype(str).str.strip()

    def extract_numbers(fname):
        # Only the two failure modes of the parse are treated as "no codes":
        # a non-numeric field (ValueError) or fewer than two fields (IndexError).
        parts = str(fname).split('_')
        try:
            return int(parts[0]), int(parts[1])
        except (ValueError, IndexError):
            return None, None

    if 'Filename' in df.columns:
        # Build both columns from plain lists: this also works for an empty
        # frame and avoids constructing one pd.Series per row.
        pairs = [extract_numbers(fname) for fname in df['Filename']]
        df['first_num'] = [pair[0] for pair in pairs]
        df['second_num'] = [pair[1] for pair in pairs]
    return df

# Load every model's captions and align them on the filenames all models share.
dfs = {name: load_csv(p) for name, p in zip(MODEL_NAMES, DEFAULT_MODEL_FILES)}
common = sorted(set.intersection(*(set(df['Filename'].dropna()) for df in dfs.values())))
aligned = pd.DataFrame({"Filename": common})
for name, df in dfs.items():
    # Keep the first row per filename: with a duplicated index a label lookup
    # returns a Series instead of a scalar, which would end up inside the cells.
    # reindex(common) then orders the rows exactly like aligned['Filename'].
    tmp = df.drop_duplicates(subset='Filename', keep='first').set_index('Filename').reindex(common)
    for col in ("Caption", "Caption_norm", "first_num", "second_num"):
        aligned[f"{col}_{name}"] = tmp[col].to_numpy()

def tokenize_preserve_case(text):
    """Split text into whitespace-separated tokens without changing case.

    The input is converted with str(); every character that is not a word
    character, whitespace, apostrophe or hyphen is replaced by a space first.
    """
    cleaned = re.sub(r"[^\w\s'-]"," ", str(text))
    # str.split() with no argument never yields empty strings.
    return cleaned.split()

def ttr(text):
    """Type-token ratio of text: distinct lowercased tokens / total tokens.

    Returns 0.0 when the text yields no tokens.
    """
    tokens = tokenize_preserve_case(text)
    if not tokens:
        return 0.0
    distinct = {tok.lower() for tok in tokens}
    return len(distinct) / float(len(tokens))

part5_summary_extended = {"by_model": {}, "skin_pairwise": {}}

# Gender codes carried by the first number of a filename. Codes outside this
# table are reported as "unknown" (previously every non-1 code became "female"),
# matching how unmapped skin codes are handled through SKIN_TO_GROUP.
_PART5_GENDER_CODES = {1: "male", 2: "female"}


def _part5_label(code, mapping):
    """Translate a numeric filename code into its label, or "unknown"."""
    if code is None or pd.isna(code):
        return "unknown"
    try:
        return mapping.get(int(code), "unknown")
    except (TypeError, ValueError, OverflowError):
        return "unknown"


def _part5_group_mean(values, mask):
    """NaN-ignoring mean of values[mask]; NaN when the mask selects nothing."""
    return float(np.nanmean(values[mask])) if mask.sum() > 0 else float('nan')


for name in MODEL_NAMES:
    # Type-token ratio for every aligned caption of this model.
    caps = aligned[f"Caption_norm_{name}"].tolist()
    ttrs = [ttr(c) for c in caps]
    aligned[f"ttr_{name}"] = ttrs
    ttr_values = np.array(ttrs)

    # dtype=str keeps the comparisons below elementwise even when `aligned` is empty.
    genders = np.array(
        [_part5_label(v, _PART5_GENDER_CODES) for v in aligned[f"first_num_{name}"]], dtype=str
    )
    skin_groups = np.array(
        [_part5_label(s, SKIN_TO_GROUP) for s in aligned[f"second_num_{name}"]], dtype=str
    )

    male_mask = genders == 'male'
    female_mask = genders == 'female'
    male_mean = _part5_group_mean(ttr_values, male_mask)
    female_mean = _part5_group_mean(ttr_values, female_mask)
    if np.isnan(male_mean) or np.isnan(female_mean):
        abs_diff_gender = float('nan')
    else:
        abs_diff_gender = float(abs(male_mean - female_mean))

    # Mean TTR and sample count per skin group (including "unknown").
    skin_rates = {}
    for sg in ["light","mid","dark","unknown"]:
        mask = skin_groups == sg
        skin_rates[f"{sg}_ttr_mean"] = _part5_group_mean(ttr_values, mask)
        skin_rates[f"{sg}_n"] = int(mask.sum())

    part5_summary_extended["by_model"][name] = {
        "male_ttr_mean": male_mean,
        "female_ttr_mean": female_mean,
        "abs_diff_gender": abs_diff_gender,
        "skin_group_ttr_means": skin_rates
    }

    # Per-caption rows for this model; read back later from OUT_DIR.
    out_df = pd.DataFrame({
        "Filename": aligned['Filename'],
        "Caption": aligned[f"Caption_norm_{name}"],
        "inferred_gender": genders,
        "inferred_skin": skin_groups,
        "ttr": ttrs
    })
    out_path = os.path.join(OUT_DIR, f"part5_bias_metrics_{name}.csv")
    out_df.to_csv(out_path, index=False)
    print("Saved extended Part5 CSV for model:", name, "->", out_path)

    # Absolute gap in mean TTR between each pair of skin groups.
    skin_pairwise = {}
    for a, b in [("light","mid"), ("light","dark"), ("mid","dark")]:
        ma = skin_rates[f"{a}_ttr_mean"]
        mb = skin_rates[f"{b}_ttr_mean"]
        if np.isnan(ma) or np.isnan(mb):
            skin_pairwise[f"{a}_vs_{b}_abs_diff"] = float('nan')
        else:
            skin_pairwise[f"{a}_vs_{b}_abs_diff"] = abs(ma - mb)
    part5_summary_extended["skin_pairwise"][name] = skin_pairwise

# Persist the Part 5 summary next to the per-model CSVs, stamped with the UTC write time.
# NOTE: NaN metrics are serialised as bare NaN tokens (json's default allow_nan=True);
# Python's json module reads them back, strict JSON parsers may not.
summary_path = os.path.join(OUT_DIR, "part5_summary_extended.json")
part5_payload = {
    "timestamp": datetime.now(timezone.utc).isoformat(),
    "summary": part5_summary_extended,
}
with open(summary_path, "w", encoding="utf-8") as jf:
    json.dump(part5_payload, jf, indent=2)
print("Saved extended Part5 summary JSON ->", summary_path)
print("Extended Part5 done.")

In [None]:
# Mount Google Drive at /content/drive (Colab only); the DRIVE_BASE paths used
# by the analysis cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os, json, math
from collections import defaultdict
import numpy as np
import pandas as pd
from scipy import stats
from scipy.spatial.distance import cdist
from datetime import datetime, timezone

# Part 7 configuration: read the per-model CSVs written by Parts 1-5 from IN_DIR
# and write results into a separate output folder.
DRIVE_BASE = "/content/drive/MyDrive/image_captioning_bias"
IN_DIR = os.path.join(DRIVE_BASE, "bias_analysis_outputs")
OUT_DIR = os.path.join(DRIVE_BASE, "bias_analysis_outputs_part7")
os.makedirs(OUT_DIR, exist_ok=True)

MODEL_NAMES = ["chatgpt","claude","gemini"]
N_PERMS = 1000  # base iteration count for the permutation tests
RNG_SEED = 42   # seed for the shared random generator

# Path templates; "{m}" is filled in with a model name via str.format(m=...).
PART1_CSV, PART2_CSV, PART3_CSV, PART4_CSV, PART5_CSV = (
    os.path.join(IN_DIR, f"part{part_no}_bias_metrics_{{m}}.csv") for part_no in range(1, 6)
)

def safe_read_csv(path):
    """Read a CSV into a DataFrame, or return None when that is not possible.

    None is returned silently when the path does not exist; when reading
    fails for any other reason a warning is printed first.
    """
    frame = None
    if os.path.exists(path):
        try:
            frame = pd.read_csv(path)
        except Exception as e:
            print(f"[WARN] Could not read {path}: {e}")
    return frame

def multivariate_energy_distance(X, Y):
    """Energy-distance statistic between two samples of row vectors.

    Computes 2*mean|x - y| - mean|x - x'| - mean|y - y'| using Euclidean
    distances over all row pairs, clipped below at 0.0.

    Args:
        X: 2-D array-like, one observation per row.
        Y: 2-D array-like, one observation per row.

    Returns:
        The statistic as a float; NaN when either sample has no rows.

    Raises:
        ValueError: If X or Y is not two-dimensional.
    """
    X, Y = np.asarray(X), np.asarray(Y)
    if not (X.ndim == 2 and Y.ndim == 2):
        raise ValueError("X and Y must be 2D arrays.")
    n, m = X.shape[0], Y.shape[0]
    if n == 0 or m == 0:
        return float('nan')

    def mean_within(points, count):
        # A single observation has no within-sample spread.
        if count <= 1:
            return 0.0
        return (1.0/(count*count))*np.sum(cdist(points, points, metric="euclidean"))

    between = (2.0/(n*m))*np.sum(cdist(X, Y, metric="euclidean"))
    return float(max(between - mean_within(X, n) - mean_within(Y, m), 0.0))

def two_prop_ztest(x1,n1,x2,n2):
    """Two-sided pooled z-test for the difference of two proportions.

    Args:
        x1: Number of successes in group 1.
        n1: Number of observations in group 1.
        x2: Number of successes in group 2.
        n2: Number of observations in group 2.

    Returns:
        (z, p) as floats. Both are NaN when either group is empty or the
        pooled standard error is zero (all successes or all failures).
    """
    if n1==0 or n2==0:
        return float('nan'), float('nan')
    p1 = x1 / n1
    p2 = x2 / n2
    p_pool = (x1 + x2) / (n1 + n2)
    denom = math.sqrt(p_pool*(1-p_pool)*(1.0/n1 + 1.0/n2))
    if denom == 0:
        return float('nan'), float('nan')
    z = (p1 - p2) / denom
    # Use the survival function: 1 - cdf(|z|) rounds to exactly 0.0 once |z|
    # exceeds roughly 8, which would report extreme differences as p == 0.
    p = 2.0 * stats.norm.sf(abs(z))
    return float(z), float(p)

def cohens_d_from_arrays(a,b):
    """Cohen's d (pooled-standard-deviation effect size) of a versus b.

    NaN entries are dropped before anything is counted, so the group sizes
    used for the pooled variance match the values that enter the means and
    variances.

    Args:
        a: Numeric array-like for the first group.
        b: Numeric array-like for the second group.

    Returns:
        (mean(a) - mean(b)) / pooled SD as a float, or NaN when either group
        has fewer than two non-NaN values or the pooled SD is zero.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    a = a[~np.isnan(a)]
    b = b[~np.isnan(b)]
    n_a, n_b = a.size, b.size
    if n_a < 2 or n_b < 2:
        return float('nan')
    var_a = a.var(ddof=1)
    var_b = b.var(ddof=1)
    pooled = math.sqrt(((n_a - 1)*var_a + (n_b - 1)*var_b) / (n_a + n_b - 2))
    if pooled == 0:
        return float('nan')
    return float((a.mean() - b.mean())/pooled)

def compute_chi2_across_models_binary(arrs):
    """Chi-square test of independence over one binary flag array per group.

    Each array contributes one contingency row [count of non-1, count of 1],
    counted over its non-NaN entries.

    Returns:
        (chi2, p, table). chi2 and p are None when no array has any valid
        entry or when the chi-square computation raises.
    """
    table = []
    for flags in arrs:
        valid = np.asarray(flags)
        valid = valid[~np.isnan(valid)]
        size = len(valid)
        hits = int(np.sum(valid==1)) if size>0 else 0
        table.append([size-hits, hits])

    # Each row sums to its number of valid entries.
    if sum(misses + hits for misses, hits in table) == 0:
        return None, None, table
    try:
        chi2, p, _, _ = stats.chi2_contingency(np.array(table))
    except Exception:
        return None, None, table
    return float(chi2), float(p), table

def compute_anova_across_models(arrs):
    """One-way ANOVA across groups, ignoring NaN values and empty groups.

    Args:
        arrs: Iterable of numeric array-likes (lists or arrays), one per group.

    Returns:
        (F, p) as floats, or (None, None) when fewer than two groups have any
        non-NaN value or the ANOVA computation raises.
    """
    groups = []
    for values in arrs:
        # Coerce first so plain lists can be masked, then filter NaN once.
        arr = np.asarray(values, dtype=float)
        arr = arr[~np.isnan(arr)]
        if arr.size > 0:
            groups.append(arr)
    if len(groups) < 2:
        return None, None
    try:
        f, p = stats.f_oneway(*groups)
    except Exception:
        # Best effort: callers treat (None, None) as "test not available".
        return None, None
    return float(f), float(p)

# Seeded generator shared by the permutation tests below.
rng = np.random.default_rng(RNG_SEED)

print("Loading Part1..Part5 CSVs for each model from:", IN_DIR)
part_templates = {
    "part1": PART1_CSV,
    "part2": PART2_CSV,
    "part3": PART3_CSV,
    "part4": PART4_CSV,
    "part5": PART5_CSV,
}
# csvs[model][part] is a DataFrame, or None when that CSV is missing/unreadable.
csvs = {}
for m in MODEL_NAMES:
    csvs[m] = {part: safe_read_csv(template.format(m=m)) for part, template in part_templates.items()}
    counts = {part: (0 if frame is None else len(frame)) for part, frame in csvs[m].items()}
    print(f"  {m}: rows per part:", counts)

results = {"timestamp": datetime.now(timezone.utc).isoformat(), "models": {}}
for m in MODEL_NAMES:
    print(f"\nRecomputing stats for model: {m}")
    part1_df = csvs[m]["part1"]
    part2_df = csvs[m]["part2"]
    part3_df = csvs[m]["part3"]
    part4_df = csvs[m]["part4"]
    part5_df = csvs[m]["part5"]

    model_out = {"part1":{}, "part2":{}, "part3":{}, "part4":{}, "part5":{}}

    if part1_df is None:
        print("  WARNING: part1 CSV missing for", m)
        model_out['part1'] = {"error":"part1_csv_missing"}
    else:
        def pick(colnames):
            for c in colnames:
                if c in part1_df.columns:
                    return c
            return None
        c_sim_male = pick(["sim_to_male_centroid","sim_to_male_centroid_"+m,"sim_to_male_centroid"])
        c_sim_fem  = pick(["sim_to_female_centroid","sim_to_female_centroid_"+m,"sim_to_female_centroid"])
        c_sim_light= pick(["sim_to_skin_light_centroid","sim_to_skin_light_centroid_"+m,"sim_to_skin_light_centroid"])
        c_sim_mid  = pick(["sim_to_skin_mid_centroid","sim_to_skin_mid_centroid_"+m,"sim_to_skin_mid_centroid"])
        c_sim_dark = pick(["sim_to_skin_dark_centroid","sim_to_skin_dark_centroid_"+m,"sim_to_skin_dark_centroid"])
        feature_cols = [c for c in [c_sim_male, c_sim_fem, c_sim_light, c_sim_mid, c_sim_dark] if c is not None]
        if not feature_cols:
            print("  ERROR: no sim columns found in part1 CSV for", m)
            model_out['part1'] = {"error":"no_sim_columns"}
        else:
            feat = part1_df[feature_cols].apply(pd.to_numeric, errors='coerce').to_numpy(dtype=float)
            gender_col = None
            for cand in ["gender_label","inferred_gender","inferred_gender_"+m,"gender"]:
                if cand in part1_df.columns:
                    gender_col = cand; break
            skin_col = None
            for cand in ["skin_group_label","inferred_skin","inferred_skin_"+m,"skin_group"]:
                if cand in part1_df.columns:
                    skin_col = cand; break

            genders = part1_df[gender_col].astype(str).values if gender_col is not None else np.array(["unknown"]*len(part1_df))
            skins   = part1_df[skin_col].astype(str).values if skin_col is not None else np.array(["unknown"]*len(part1_df))

            mask_m = (genders == "male"); mask_f = (genders == "female")
            X = feat[mask_m]; Y = feat[mask_f]
            ed_obs = multivariate_energy_distance(X, Y) if (X.shape[0]>0 and Y.shape[0]>0 and X.ndim==2 and Y.ndim==2) else float('nan')
            perm_count = 0
            all_idx = np.arange(len(feat))
            n_m = int(mask_m.sum()); n_f = int(mask_f.sum())
            if n_m>0 and n_f>0:
                for i in range(N_PERMS):
                    perm = rng.permutation(all_idx)
                    a_idx = perm[:n_m]; b_idx = perm[n_m:n_m+n_f]
                    val = multivariate_energy_distance(feat[a_idx], feat[b_idx])
                    if val >= ed_obs:
                        perm_count += 1
                p_perm = (perm_count + 1) / (N_PERMS + 1)
            else:
                p_perm = float('nan')
            model_out['part1']['gender'] = {"energy_distance": float(ed_obs), "perm_p": float(p_perm), "n_m": int(n_m), "n_f": int(n_f)}

            skins_unique = ["light","mid","dark"]
            skin_pairs = [("light","mid"), ("light","dark"), ("mid","dark")]
            model_out['part1']['skin_pairwise'] = {}
            for a,b in skin_pairs:
                mask_a = (skins == a); mask_b = (skins == b)
                na = int(mask_a.sum()); nb = int(mask_b.sum())
                if na < 2 or nb < 2:
                    model_out['part1']['skin_pairwise'][f"{a}_vs_{b}"] = {"energy_distance": None, "perm_p": None, "n_a": na, "n_b": nb}
                else:
                    ed_ab = multivariate_energy_distance(feat[mask_a], feat[mask_b])
                    idxs = np.where((skins==a)|(skins==b))[0]
                    na = int(mask_a.sum()); nb = int(mask_b.sum())
                    count_ge = 0
                    for i in range(int(N_PERMS/2)):
                        perm = rng.permutation(idxs)
                        a_idx = perm[:na]; b_idx = perm[na:na+nb]
                        if multivariate_energy_distance(feat[a_idx], feat[b_idx]) >= ed_ab:
                            count_ge += 1
                    p_ab = (count_ge + 1) / (int(N_PERMS/2) + 1)
                    model_out['part1']['skin_pairwise'][f"{a}_vs_{b}"] = {"energy_distance": float(ed_ab), "perm_p": float(p_ab), "n_a": na, "n_b": nb}

    # if part2_df is None:
    #     model_out['part2'] = {"error":"part2_csv_missing"}
    # else:
    #     col_adj = None
    #     for cand in ["adj_proj_avg","adj_proj","adj_proj_avg_"+m]:
    #         if cand in part2_df.columns:
    #             col_adj = cand; break
    #     gender_col = None
    #     for cand in ["inferred_gender","inferred_gender_"+m,"gender_label","gender"]:
    #         if cand in part2_df.columns:
    #             gender_col = cand; break
    #     skin_col = None
    #     for cand in ["inferred_skin","inferred_skin_"+m,"skin_group","skin_group_label"]:
    #         if cand in part2_df.columns:
    #             skin_col = cand; break

    #     if col_adj is None:
    #         model_out['part2'] = {"error":"adj_proj_column_missing"}
    #     else:
    #         vals = pd.to_numeric(part2_df[col_adj], errors='coerce').to_numpy(dtype=float)
    #         genders = part2_df[gender_col].astype(str).values if gender_col is not None else np.array(["unknown"]*len(part2_df))
    #         skins = part2_df[skin_col].astype(str).values if skin_col is not None else np.array(["unknown"]*len(part2_df))
    #         m_vals = vals[genders=="male"]; f_vals = vals[genders=="female"]
    #         if len(m_vals[~np.isnan(m_vals)]) < 2 or len(f_vals[~np.isnan(f_vals)]) < 2:
    #             model_out['part2']['gender'] = {"cohens_d": None, "t_p": None, "n_m_tokens": int(np.sum(~np.isnan(m_vals))), "n_f_tokens": int(np.sum(~np.isnan(f_vals)))}
    #         else:
    #             d = cohens_d_from_arrays(m_vals[~np.isnan(m_vals)], f_vals[~np.isnan(f_vals)])
    #             tstat, tp = stats.ttest_ind(m_vals[~np.isnan(m_vals)], f_vals[~np.isnan(f_vals)], equal_var=False, nan_policy='omit')
    #             model_out['part2']['gender'] = {"cohens_d": float(d), "t_p": float(tp), "n_m_tokens": int(np.sum(~np.isnan(m_vals))), "n_f_tokens": int(np.sum(~np.isnan(f_vals)))}
    #         model_out['part2']['skin_pairwise'] = {}
    #         for a,b in [("light","mid"),("light","dark"),("mid","dark")]:
    #             va = vals[skins==a]; vb = vals[skins==b]
    #             if len(va[~np.isnan(va)])<2 or len(vb[~np.isnan(vb)])<2:
    #                 model_out['part2']['skin_pairwise'][f"{a}_vs_{b}"] = {"cohens_d": None, "t_p": None, "n_a": int(np.sum(~np.isnan(va))), "n_b": int(np.sum(~np.isnan(vb)))}
    #             else:
    #                 d = cohens_d_from_arrays(va[~np.isnan(va)], vb[~np.isnan(vb)])
    #                 tstat, tp = stats.ttest_ind(va[~np.isnan(va)], vb[~np.isnan(vb)], equal_var=False, nan_policy='omit')
    #                 model_out['part2']['skin_pairwise'][f"{a}_vs_{b}"] = {"cohens_d": float(d), "t_p": float(tp), "n_a": int(np.sum(~np.isnan(va))), "n_b": int(np.sum(~np.isnan(vb)))}

    if part3_df is None:
        model_out['part3'] = {"error":"part3_csv_missing"}
    else:
        col_gender_mention = None
        for cand in ["gender_mention","gender_mention_"+m]:
            if cand in part3_df.columns:
                col_gender_mention = cand; break
        col_race_mention = None
        for cand in ["race_mention","race_mention_"+m]:
            if cand in part3_df.columns:
                col_race_mention = cand; break
        gender_col = None
        for cand in ["inferred_gender","gender_label","inferred_gender_"+m,"gender"]:
            if cand in part3_df.columns:
                gender_col = cand; break
        skin_col = None
        for cand in ["inferred_skin","skin_group_label","inferred_skin_"+m,"skin_group"]:
            if cand in part3_df.columns:
                skin_col = cand; break

        if col_gender_mention is None or gender_col is None:
            model_out['part3']['gender_tokens'] = {"error":"missing_columns"}
        else:
            flags = pd.to_numeric(part3_df[col_gender_mention], errors='coerce').to_numpy(dtype=float)
            genders = part3_df[gender_col].astype(str).values
            xm = int(np.nansum(flags[genders=="male"]==1)) if (genders=="male").sum()>0 else 0
            nm = int((genders=="male").sum())
            xf = int(np.nansum(flags[genders=="female"]==1)) if (genders=="female").sum()>0 else 0
            nf = int((genders=="female").sum())
            z, p_two = two_prop_ztest(xm,nm,xf,nf)
            table = np.array([[nm-xm, xm],[nf-xf, xf]])
            try:
                chi2, chi_p, _, _ = stats.chi2_contingency(table)
            except Exception:
                chi2, chi_p = None, None
            model_out['part3']['gender_tokens'] = {"x_m": xm, "n_m": nm, "p_m": (xm/nm if nm>0 else None),
                                                   "x_f": xf, "n_f": nf, "p_f": (xf/nf if nf>0 else None),
                                                   "z": float(z) if not math.isnan(z) else None, "p_two_sided": float(p_two) if not math.isnan(p_two) else None,
                                                   "chi2": float(chi2) if chi2 is not None else None, "chi2_p": float(chi_p) if chi_p is not None else None}
        if col_race_mention is None or gender_col is None:
            model_out['part3']['race_tokens'] = {"error":"missing_columns"}
        else:
            flags = pd.to_numeric(part3_df[col_race_mention], errors='coerce').to_numpy(dtype=float)
            genders = part3_df[gender_col].astype(str).values
            xm = int(np.nansum(flags[genders=="male"]==1)) if (genders=="male").sum()>0 else 0
            nm = int((genders=="male").sum())
            xf = int(np.nansum(flags[genders=="female"]==1)) if (genders=="female").sum()>0 else 0
            nf = int((genders=="female").sum())
            z, p_two = two_prop_ztest(xm,nm,xf,nf)
            table = np.array([[nm-xm, xm],[nf-xf, xf]])
            try:
                chi2, chi_p, _, _ = stats.chi2_contingency(table)
            except Exception:
                chi2, chi_p = None, None
            model_out['part3']['race_tokens'] = {"x_m": xm, "n_m": nm, "p_m": (xm/nm if nm>0 else None),
                                                 "x_f": xf, "n_f": nf, "p_f": (xf/nf if nf>0 else None),
                                                 "z": float(z) if not math.isnan(z) else None, "p_two_sided": float(p_two) if not math.isnan(p_two) else None,
                                                 "chi2": float(chi2) if chi2 is not None else None, "chi2_p": float(chi_p) if chi_p is not None else None}

        model_out['part3']['skin_pairwise'] = {}
        if col_gender_mention is not None and skin_col is not None:
            flags = pd.to_numeric(part3_df[col_gender_mention], errors='coerce').to_numpy(dtype=float)
            skins = part3_df[skin_col].astype(str).values
            for a,b in [("light","mid"),("light","dark"),("mid","dark")]:
                xa = int(np.nansum(flags[skins==a]==1)) if (skins==a).sum()>0 else 0
                na = int((skins==a).sum())
                xb = int(np.nansum(flags[skins==b]==1)) if (skins==b).sum()>0 else 0
                nb = int((skins==b).sum())
                if na>0 and nb>0:
                    z, p = two_prop_ztest(xa,na,xb,nb)
                else:
                    z,p = float('nan'), float('nan')
                model_out['part3']['skin_pairwise'][f"{a}_vs_{b}_gender_mention"] = {"x_a": xa, "n_a": na, "p_a": (xa/na if na>0 else None),
                                                                                       "x_b": xb, "n_b": nb, "p_b": (xb/nb if nb>0 else None),
                                                                                       "z": None if math.isnan(z) else float(z), "p_two_sided": None if math.isnan(p) else float(p)}
        if col_race_mention is not None and skin_col is not None:
            flags = pd.to_numeric(part3_df[col_race_mention], errors='coerce').to_numpy(dtype=float)
            skins = part3_df[skin_col].astype(str).values
            for a,b in [("light","mid"),("light","dark"),("mid","dark")]:
                xa = int(np.nansum(flags[skins==a]==1)) if (skins==a).sum()>0 else 0
                na = int((skins==a).sum())
                xb = int(np.nansum(flags[skins==b]==1)) if (skins==b).sum()>0 else 0
                nb = int((skins==b).sum())
                if na>0 and nb>0:
                    z, p = two_prop_ztest(xa,na,xb,nb)
                else:
                    z,p = float('nan'), float('nan')
                model_out['part3']['skin_pairwise'][f"{a}_vs_{b}_race_mention"] = {"x_a": xa, "n_a": na, "p_a": (xa/na if na>0 else None),
                                                                                      "x_b": xb, "n_b": nb, "p_b": (xb/nb if nb>0 else None),
                                                                                      "z": None if math.isnan(z) else float(z), "p_two_sided": None if math.isnan(p) else float(p)}

    if part4_df is None:
        model_out['part4'] = {"error":"part4_csv_missing"}
    else:
        col_sent = None
        for cand in ["sentiment_compound","sentiment_compound_"+m]:
            if cand in part4_df.columns:
                col_sent = cand; break
        gender_col = None
        for cand in ["inferred_gender","gender_label","inferred_gender_"+m,"gender"]:
            if cand in part4_df.columns:
                gender_col = cand; break
        skin_col = None
        for cand in ["inferred_skin","skin_group_label","inferred_skin_"+m,"skin_group"]:
            if cand in part4_df.columns:
                skin_col = cand; break
        if col_sent is None:
            model_out['part4'] = {"error":"sentiment_column_missing"}
        else:
            svals = pd.to_numeric(part4_df[col_sent], errors='coerce').to_numpy(dtype=float)
            genders = part4_df[gender_col].astype(str).values if gender_col is not None else np.array(["unknown"]*len(part4_df))
            skins = part4_df[skin_col].astype(str).values if skin_col is not None else np.array(["unknown"]*len(part4_df))
            a = svals[genders=="male"]; b = svals[genders=="female"]
            if len(a[~np.isnan(a)])>=2 and len(b[~np.isnan(b)])>=2:
                tstat, t_p = stats.ttest_ind(a[~np.isnan(a)], b[~np.isnan(b)], equal_var=False, nan_policy='omit')
                mw = stats.mannwhitneyu(a[~np.isnan(a)], b[~np.isnan(b)], alternative='two-sided')
                model_out['part4']['gender'] = {"male_mean": float(np.nanmean(a)), "female_mean": float(np.nanmean(b)),
                                                "t_p": float(t_p), "t_stat": float(tstat), "mw_p": float(mw.pvalue)}
            else:
                model_out['part4']['gender'] = {"error":"insufficient_samples"}
            model_out['part4']['skin_pairwise'] = {}
            for a_s,b_s in [("light","mid"),("light","dark"),("mid","dark")]:
                va = svals[skins==a_s]; vb = svals[skins==b_s]
                if len(va[~np.isnan(va)])>=2 and len(vb[~np.isnan(vb)])>=2:
                    tstat, t_p = stats.ttest_ind(va[~np.isnan(va)], vb[~np.isnan(vb)], equal_var=False, nan_policy='omit')
                    mw = stats.mannwhitneyu(va[~np.isnan(va)], vb[~np.isnan(vb)], alternative='two-sided')
                    model_out['part4']['skin_pairwise'][f"{a_s}_vs_{b_s}"] = {"a_mean": float(np.nanmean(va)), "b_mean": float(np.nanmean(vb)),
                                                                              "t_p": float(t_p), "t_stat": float(tstat), "mw_p": float(mw.pvalue)}
                else:
                    model_out['part4']['skin_pairwise'][f"{a_s}_vs_{b_s}"] = {"error":"insufficient_samples"}

    if part5_df is None:
        model_out['part5'] = {"error":"part5_csv_missing"}
    else:
        col_ttr = None
        for cand in ["ttr","ttr_"+m]:
            if cand in part5_df.columns:
                col_ttr = cand; break
        gender_col = None
        for cand in ["inferred_gender","gender_label","inferred_gender_"+m,"gender"]:
            if cand in part5_df.columns:
                gender_col = cand; break
        skin_col = None
        for cand in ["inferred_skin","skin_group_label","inferred_skin_"+m,"skin_group"]:
            if cand in part5_df.columns:
                skin_col = cand; break
        if col_ttr is None:
            model_out['part5'] = {"error":"ttr_column_missing"}
        else:
            tvals = pd.to_numeric(part5_df[col_ttr], errors='coerce').to_numpy(dtype=float)
            genders = part5_df[gender_col].astype(str).values if gender_col is not None else np.array(["unknown"]*len(part5_df))
            skins = part5_df[skin_col].astype(str).values if skin_col is not None else np.array(["unknown"]*len(part5_df))
            a = tvals[genders=="male"]; b = tvals[genders=="female"]
            if len(a[~np.isnan(a)])>=2 and len(b[~np.isnan(b)])>=2:
                tstat, t_p = stats.ttest_ind(a[~np.isnan(a)], b[~np.isnan(b)], equal_var=False, nan_policy='omit')
                mw = stats.mannwhitneyu(a[~np.isnan(a)], b[~np.isnan(b)], alternative='two-sided')
                model_out['part5']['gender'] = {"male_mean": float(np.nanmean(a)), "female_mean": float(np.nanmean(b)),
                                                "t_p": float(t_p), "t_stat": float(tstat), "mw_p": float(mw.pvalue)}
            else:
                model_out['part5']['gender'] = {"error":"insufficient_samples"}
            model_out['part5']['skin_pairwise'] = {}
            for a_s,b_s in [("light","mid"),("light","dark"),("mid","dark")]:
                va = tvals[skins==a_s]; vb = tvals[skins==b_s]
                if len(va[~np.isnan(va)])>=2 and len(vb[~np.isnan(vb)])>=2:
                    tstat, t_p = stats.ttest_ind(va[~np.isnan(va)], vb[~np.isnan(vb)], equal_var=False, nan_policy='omit')
                    mw = stats.mannwhitneyu(va[~np.isnan(va)], vb[~np.isnan(vb)], alternative='two-sided')
                    model_out['part5']['skin_pairwise'][f"{a_s}_vs_{b_s}"] = {"a_mean": float(np.nanmean(va)), "b_mean": float(np.nanmean(vb)),
                                                                               "t_p": float(t_p), "t_stat": float(tstat), "mw_p": float(mw.pvalue)}
                else:
                    model_out['part5']['skin_pairwise'][f"{a_s}_vs_{b_s}"] = {"error":"insufficient_samples"}

    anova_skin = {}
    per_model_numeric = {
        "adj_proj_avg": ("part2", ["adj_proj_avg","adj_proj"]),
        "sentiment_compound": ("part4", ["sentiment_compound"]),
        "ttr": ("part5", ["ttr"]),
        "sim_to_male": ("part1", ["sim_to_male_centroid","sim_to_male_centroid_"+m]),
        "sim_to_female": ("part1", ["sim_to_female_centroid","sim_to_female_centroid_"+m]),
        "sim_skin_light": ("part1", ["sim_to_skin_light_centroid","sim_to_skin_light_centroid_"+m]),
        "sim_skin_mid": ("part1", ["sim_to_skin_mid_centroid","sim_to_skin_mid_centroid_"+m]),
        "sim_skin_dark": ("part1", ["sim_to_skin_dark_centroid","sim_to_skin_dark_centroid_"+m])
    }

    def find_first_column(df, candidates):
        """Return the column of *df* that holds one of the requested metrics.

        Args:
            df: DataFrame to search, or None.
            candidates: Column names in priority order; non-string entries
                are ignored.

        Returns:
            The first candidate that is an exact column name; otherwise the
            first column whose name contains one of the candidates; otherwise
            None (also when *df* is None).
        """
        if df is None:
            return None
        names = [c for c in candidates if isinstance(c, str)]
        for name in names:
            if name in df.columns:
                return name
        # Fall back to a substring match on the *requested* names only.
        # Matching on a fixed list of generic stems could hand back a column
        # belonging to a different metric than the one asked for.
        for col in df.columns:
            if isinstance(col, str) and any(name in col for name in names):
                return col
        return None

    for metric, (part_key, cand_list) in per_model_numeric.items():
        df = csvs[m].get(part_key)
        if df is None:
            anova_skin[metric] = {"error":"missing_part_csv"}
            continue
        col = find_first_column(df, cand_list)
        if col is None:
            anova_skin[metric] = {"error":"metric_column_missing"}
            continue
        skin_col = None
        for cand in ["inferred_skin","skin_group_label","inferred_skin_"+m,"skin_group"]:
            if cand in df.columns:
                skin_col = cand; break
        if skin_col is None:
            anova_skin[metric] = {"error":"skin_column_missing"}
            continue
        arrs = []
        ns = []
        for s in ["light","mid","dark"]:
            vals = pd.to_numeric(df.loc[df[skin_col].astype(str)==s, col], errors='coerce').to_numpy(dtype=float)
            arrs.append(vals)
            ns.append(int(np.sum(~np.isnan(vals))))
        f, p = compute_anova_across_models(arrs)
        anova_skin[metric] = {"f": f, "p": p, "n_per_group": ns}

    model_out['anova_skin'] = anova_skin

    results['models'][m] = model_out

print("\nComputing cross-model ANOVA and chi2 across models.")

# Metric name -> (part CSV key, candidate column names in priority order).
numeric_metrics = {
    "adj_proj_avg": ("part2", ["adj_proj_avg", "adj_proj"]),
    "sentiment_compound": ("part4", ["sentiment_compound"]),
    "ttr": ("part5", ["ttr"]),
}
# Centroid-similarity metrics accept either the bare column name or any
# model-suffixed variant of it.
numeric_metrics.update({
    metric: ("part1", [stem, *(f"{stem}_{mm}" for mm in MODEL_NAMES)])
    for metric, stem in (
        ("sim_to_male", "sim_to_male_centroid"),
        ("sim_to_female", "sim_to_female_centroid"),
        ("sim_skin_light", "sim_to_skin_light_centroid"),
        ("sim_skin_mid", "sim_to_skin_mid_centroid"),
        ("sim_skin_dark", "sim_to_skin_dark_centroid"),
    )
})

def find_first_column(df, candidates):
    """Return the column of *df* that holds one of the requested metrics.

    Args:
        df: DataFrame to search, or None.
        candidates: Column names in priority order; non-string entries are
            ignored.

    Returns:
        The first candidate that is an exact column name; otherwise the first
        column whose name contains one of the candidates; otherwise None
        (also when *df* is None).
    """
    if df is None:
        return None
    names = [c for c in candidates if isinstance(c, str)]
    for name in names:
        if name in df.columns:
            return name
    # Fall back to a substring match on the *requested* names only. Matching
    # on a fixed list of generic stems could hand back a column belonging to
    # a different metric than the one asked for (e.g. "ttr" when the caller
    # wanted "gender_mention").
    for col in df.columns:
        if isinstance(col, str) and any(name in col for name in names):
            return col
    return None

# Accumulator for the cross-model results, plus the subgroup labels compared.
cross = {"anova": {}, "chi2": {}}
gender_groups = ["male", "female"]
skin_groups = ["light", "mid", "dark"]


def _anova_gender_columns(model_name):
    """Candidate gender-label column names for one model, in priority order."""
    return ["inferred_gender", "gender_label", "gender", "inferred_gender_" + model_name]


def _anova_skin_columns(model_name):
    """Candidate skin-label column names for one model, in priority order."""
    return ["inferred_skin", "skin_group_label", "inferred_skin_" + model_name, "skin_group"]


def _anova_for_label(part_key, cand_list, label, label_columns_for):
    """Compare one metric across models within a single subgroup.

    For every model, the metric values of rows whose label column equals
    *label* are collected (an empty array when the CSV, the metric column or
    the label column is missing) and the per-model arrays are passed to
    ``compute_anova_across_models``.

    Returns:
        Dict with the two values returned by ``compute_anova_across_models``
        under "f" and "p", and the per-model non-NaN counts under
        "n_per_model".
    """
    samples = []
    sizes = []
    for model_name in MODEL_NAMES:
        frame = csvs[model_name].get(part_key)
        value_col = find_first_column(frame, cand_list)
        label_col = None
        if value_col is not None:
            label_col = next(
                (c for c in label_columns_for(model_name) if c in frame.columns),
                None,
            )
        if label_col is None:
            samples.append(np.array([]))
            sizes.append(0)
            continue
        selected = frame.loc[frame[label_col].astype(str) == label, value_col]
        values = pd.to_numeric(selected, errors='coerce').to_numpy(dtype=float)
        samples.append(values)
        sizes.append(int(np.sum(~np.isnan(values))))
    f_stat, p_value = compute_anova_across_models(samples)
    return {"f": f_stat, "p": p_value, "n_per_model": sizes}


for metric, (part_key, cand_list) in numeric_metrics.items():
    cross['anova'][metric] = {
        "gender": {
            g: _anova_for_label(part_key, cand_list, g, _anova_gender_columns)
            for g in gender_groups
        },
        "skin": {
            s: _anova_for_label(part_key, cand_list, s, _anova_skin_columns)
            for s in skin_groups
        },
    }

# Binary (0/1) metric name -> (part CSV key, candidate column names).
binary_metrics = {
    "gender_mention": ("part3", ["gender_mention"]),
    "race_mention": ("part3", ["race_mention"])
}


def _chi2_gender_columns(model_name):
    """Candidate gender-label columns; includes the model-suffixed name."""
    return ["inferred_gender", "gender_label", "gender", "inferred_gender_" + model_name]


def _chi2_skin_columns(model_name):
    """Candidate skin-label columns; includes the model-suffixed name.

    The previous list repeated "inferred_skin" in the slot where the
    model-suffixed column belongs, so CSVs that only carry the suffixed
    label column were treated as having no skin labels.
    """
    return ["inferred_skin", "skin_group_label", "inferred_skin_" + model_name, "skin_group"]


def _chi2_for_label(part_key, cand_list, label, label_columns_for):
    """Compare a binary flag across models within a single subgroup.

    For every model, the flag values of rows whose label column equals
    *label* are collected (an empty array when the CSV, the flag column or
    the label column is missing) and the per-model arrays are passed to
    ``compute_chi2_across_models_binary``.

    Returns:
        Dict with the three values returned by
        ``compute_chi2_across_models_binary`` under "chi2", "chi2_p" and
        "contingency_table", and per-model counts under "counts_per_model"
        (``n`` = non-NaN flags, ``x`` = flags equal to 1).
    """
    arrs = []
    for model_name in MODEL_NAMES:
        df = csvs[model_name].get(part_key)
        col = find_first_column(df, cand_list)
        label_col = None
        if col is not None:
            label_col = next(
                (c for c in label_columns_for(model_name) if c in df.columns),
                None,
            )
        if label_col is None:
            arrs.append(np.array([]))
            continue
        selected = df.loc[df[label_col].astype(str) == label, col]
        arrs.append(pd.to_numeric(selected, errors='coerce').to_numpy(dtype=float))
    chi2, chi_p, table = compute_chi2_across_models_binary(arrs)
    counts = []
    for a in arrs:
        finite = a[~np.isnan(a)]
        counts.append({"n": int(len(finite)), "x": int(np.sum(finite == 1))})
    return {"chi2": chi2, "chi2_p": chi_p, "counts_per_model": counts, "contingency_table": table}


for bin_metric, (part_key, cand_list) in binary_metrics.items():
    cross['chi2'][bin_metric] = {
        "gender": {
            g: _chi2_for_label(part_key, cand_list, g, _chi2_gender_columns)
            for g in gender_groups
        },
        "skin": {
            s: _chi2_for_label(part_key, cand_list, s, _chi2_skin_columns)
            for s in skin_groups
        },
    }

# Persist the per-model and cross-model results as one JSON document.
final_output = {"timestamp": datetime.now(timezone.utc).isoformat(), "models": results['models'], "cross_model": cross}
out_json_path = os.path.join(OUT_DIR, "part7_recomputed_with_anova.json")
with open(out_json_path, "w", encoding="utf-8") as jf:
    json.dump(final_output, jf, indent=2)
print("\nWrote final JSON:", out_json_path)


def _diff_or_none(minuend, subtrahend):
    """Return ``minuend - subtrahend``, or None when either operand is missing.

    Result dicts that only carry an "error" entry have neither value; guarding
    both operands keeps the table export from raising TypeError when only one
    of the two is present.
    """
    if minuend is None or subtrahend is None:
        return None
    return minuend - subtrahend


# Compact one-row-per-(model, metric) table with an effect value and p-value.
rows = []
for m in MODEL_NAMES:
    md = results['models'].get(m, {})
    p1g = md.get('part1', {}).get('gender', {})
    rows.append({"model": m, "metric": "part1_energy_distance_gender",
                 "value": p1g.get('energy_distance'), "p": p1g.get('perm_p')})
    light_dark = md.get('part1', {}).get('skin_pairwise', {}).get('light_vs_dark', {})
    rows.append({"model": m, "metric": "part1_energy_distance_light_vs_dark",
                 "value": light_dark.get('energy_distance'), "p": light_dark.get('perm_p')})
    p2g = md.get('part2', {}).get('gender', {})
    rows.append({"model": m, "metric": "part2_adj_cohens_d",
                 "value": p2g.get('cohens_d'), "p": p2g.get('t_p')})
    # Token-mention rates: male rate minus female rate.
    p3g = md.get('part3', {}).get('gender_tokens', {})
    rows.append({"model": m, "metric": "part3_gender_token_rate_diff",
                 "value": _diff_or_none(p3g.get('p_m'), p3g.get('p_f')), "p": p3g.get('chi2_p')})
    p3r = md.get('part3', {}).get('race_tokens', {})
    rows.append({"model": m, "metric": "part3_race_token_rate_diff",
                 "value": _diff_or_none(p3r.get('p_m'), p3r.get('p_f')), "p": p3r.get('chi2_p')})
    # Mean differences: female mean minus male mean.
    p4g = md.get('part4', {}).get('gender', {})
    rows.append({"model": m, "metric": "part4_sentiment_mean_diff",
                 "value": _diff_or_none(p4g.get('female_mean'), p4g.get('male_mean')), "p": p4g.get('t_p')})
    p5g = md.get('part5', {}).get('gender', {})
    rows.append({"model": m, "metric": "part5_ttr_mean_diff",
                 "value": _diff_or_none(p5g.get('female_mean'), p5g.get('male_mean')), "p": p5g.get('t_p')})

table_df = pd.DataFrame(rows)
csv_out = os.path.join(OUT_DIR, "part7_recomputed_table_for_paper.csv")
table_df.to_csv(csv_out, index=False)
print("Wrote compact CSV for paper:", csv_out)

print("\nDone. Outputs written to:", OUT_DIR)


In [None]:
import os, json, math
import numpy as np
import pandas as pd
from scipy import stats
from scipy.spatial.distance import cdist
from datetime import datetime, timezone

# Input/output folders on the mounted Google Drive; the output folder is
# created on first run.
DRIVE_BASE = "/content/drive/MyDrive/image_captioning_bias"
IN_DIR = os.path.join(DRIVE_BASE, "bias_analysis_outputs")
OUT_DIR = os.path.join(DRIVE_BASE, "bias_analysis_outputs_part7_pvalues")
os.makedirs(OUT_DIR, exist_ok=True)

# Models analysed, number of permutations, and the RNG seed.
MODEL_NAMES = ["chatgpt", "claude", "gemini"]
N_PERMS = 1000
RNG_SEED = 42

# Per-part CSV path templates; "{m}" is filled with a model name via .format().
PART1_CSV, PART2_CSV, PART3_CSV, PART4_CSV, PART5_CSV = (
    os.path.join(IN_DIR, f"part{idx}_bias_metrics_{{m}}.csv") for idx in range(1, 6)
)

# Single seeded generator shared by all permutation tests in this cell.
rng = np.random.default_rng(RNG_SEED)

def safe_read_csv(path):
    """Read a CSV into a DataFrame without raising.

    Returns None when *path* does not exist (silently) or when
    ``pd.read_csv`` fails (after printing a ``[WARN]`` line with the error).
    """
    if os.path.exists(path):
        try:
            return pd.read_csv(path)
        except Exception as err:
            print(f"[WARN] Could not read {path}: {err}")
    return None

def multivariate_energy_distance(X, Y):
    """Return the energy distance between two samples of row vectors.

    Computes ``2*mean|x-y| - mean|x-x'| - mean|y-y'|`` from pairwise
    Euclidean distances, where each within-sample mean divides by the squared
    sample size, and clips the result at 0. One-dimensional inputs are
    treated as samples of scalars. Returns NaN when either input is empty.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    if 0 in (X.size, Y.size):
        return float('nan')
    # A flat vector is a sample of scalars: one observation per row.
    X = X.reshape(-1, 1) if X.ndim == 1 else X
    Y = Y.reshape(-1, 1) if Y.ndim == 1 else Y
    n, m = X.shape[0], Y.shape[0]
    between = (2.0 / (n * m)) * np.sum(cdist(X, Y, metric="euclidean"))
    # A single observation has no within-sample spread.
    within_x = (1.0 / (n * n)) * np.sum(cdist(X, X, metric="euclidean")) if n > 1 else 0.0
    within_y = (1.0 / (m * m)) * np.sum(cdist(Y, Y, metric="euclidean")) if m > 1 else 0.0
    return float(max(between - within_x - within_y, 0.0))

def two_prop_ztest(x1, n1, x2, n2):
    """Two-sided pooled z-test for the difference of two proportions.

    Args:
        x1, n1: Successes and trials in the first group.
        x2, n2: Successes and trials in the second group.

    Returns:
        ``(z, p)`` as floats; ``(nan, 1.0)`` when either group is empty.
    """
    if n1 == 0 or n2 == 0:
        return float('nan'), 1.0
    p1 = x1 / n1
    p2 = x2 / n2
    p_pool = (x1 + x2) / (n1 + n2)
    # Floor the variance so a pooled rate of exactly 0 or 1 yields z = 0
    # instead of dividing by zero.
    denom = math.sqrt(max(p_pool * (1 - p_pool) * (1.0 / n1 + 1.0 / n2), 1e-20))
    z = (p1 - p2) / denom
    # Survival function instead of 1 - cdf: the subtraction cancels to
    # exactly 0.0 once |z| is large, reporting an impossible p-value of 0.
    p = 2.0 * stats.norm.sf(abs(z))
    return float(z), float(p)

def cohens_d_from_arrays(a, b):
    """Return Cohen's d (mean of *a* minus mean of *b*, over the pooled SD).

    NaN entries are dropped first. Returns NaN when either sample has fewer
    than two remaining values or when the pooled standard deviation is zero.
    """
    x = np.asarray(a)
    y = np.asarray(b)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]
    nx, ny = len(x), len(y)
    if min(nx, ny) < 2:
        return float('nan')
    sd_x = np.std(x, ddof=1)
    sd_y = np.std(y, ddof=1)
    pooled = math.sqrt(((nx - 1) * sd_x * sd_x + (ny - 1) * sd_y * sd_y) / (nx + ny - 2))
    if pooled == 0:
        return float('nan')
    return float((np.mean(x) - np.mean(y)) / pooled)

def ensure_p(p):
    """Coerce *p* to a float p-value, substituting 1.0 for anything unusable.

    Returns 1.0 when *p* is None, cannot be converted to float, or converts
    to NaN; otherwise returns ``float(p)``.
    """
    if p is None:
        return 1.0
    try:
        value = float(p)
    except (TypeError, ValueError, OverflowError):
        return 1.0
    # Test for NaN after conversion so NumPy scalars that are not Python
    # floats (e.g. float32) and the string "nan" are caught as well.
    return 1.0 if math.isnan(value) else value

# Part key -> path template, in the order the CSVs are read.
_PART_TEMPLATES = {
    "part1": PART1_CSV,
    "part2": PART2_CSV,
    "part3": PART3_CSV,
    "part4": PART4_CSV,
    "part5": PART5_CSV,
}

# csvs[model][part] is a DataFrame, or None when the file is missing/unreadable.
csvs = {}
for m in MODEL_NAMES:
    csvs[m] = {
        part: safe_read_csv(template.format(m=m))
        for part, template in _PART_TEMPLATES.items()
    }
    sizes = {part: (0 if frame is None else len(frame)) for part, frame in csvs[m].items()}
    print(f"{m}: rows per part ->", sizes)

# Result container filled per model below.
out = {"timestamp": datetime.now(timezone.utc).isoformat(), "models": {}}

# Skin-group pairs compared in every part, in the order their keys are emitted.
_SKIN_PAIRS = [("light", "mid"), ("light", "dark"), ("mid", "dark")]


def _first_present(df, names):
    """Return the first entry of *names* that is a column of *df*, or None."""
    return next((c for c in names if c in df.columns), None)


def _find_col(df, candidates):
    """Locate a column by exact name, else by candidate-as-substring match."""
    exact = _first_present(df, candidates)
    if exact is not None:
        return exact
    for col in df.columns:
        for cand in candidates:
            if isinstance(cand, str) and isinstance(col, str) and cand in col:
                return col
    return None


def _labels(df, col):
    """Return column *col* as an array of strings, or all "unknown" if absent."""
    if col is None:
        return np.full(len(df), "unknown")
    return df[col].astype(str).to_numpy()


def _perm_energy_p(feat, mask_a, mask_b, n_perms, rng):
    """Permutation test of the energy distance between two row groups.

    Group labels are shuffled only among the rows selected by the two masks,
    so rows belonging to neither group cannot enter the null distribution.

    Returns:
        ``(observed_distance, p)`` with the add-one estimate
        ``(hits + 1) / (n_perms + 1)``; ``p`` is 1.0 when the observed
        distance is NaN.
    """
    n_a = int(mask_a.sum())
    pool = np.flatnonzero(mask_a | mask_b)
    observed = multivariate_energy_distance(feat[mask_a], feat[mask_b])
    if math.isnan(observed):
        # Every ">=" comparison against NaN is False, which would otherwise
        # produce the smallest possible p-value.
        return observed, 1.0
    hits = 0
    for _ in range(n_perms):
        shuffled = rng.permutation(pool)
        stat = multivariate_energy_distance(feat[shuffled[:n_a]], feat[shuffled[n_a:]])
        if stat >= observed:
            hits += 1
    return observed, (hits + 1) / (n_perms + 1)


def _part1_pvalues(df1, m, n_perms, rng):
    """Permutation p-values on the centroid-similarity features of part 1.

    Returns a dict with "gender_energy_distance" (None when it could not be
    computed), "gender_perm_p" and "skin_pairwise_perm_p".
    """
    sim_cols = [
        _find_col(df1, [f"sim_to_{group}_centroid", f"sim_to_{group}_centroid_{m}"])
        for group in ("male", "female", "skin_light", "skin_mid", "skin_dark")
    ]
    feature_cols = [c for c in sim_cols if c is not None]
    if feature_cols:
        feat = df1[feature_cols].apply(pd.to_numeric, errors='coerce').to_numpy(dtype=float)
    else:
        feat = np.empty((len(df1), 0))
    # Only rows with every feature present take part: one NaN would turn the
    # whole distance into NaN.
    complete = ~np.isnan(feat).any(axis=1)
    genders = _labels(df1, _first_present(df1, ["gender_label", "inferred_gender", "inferred_gender_" + m]))
    skins = _labels(df1, _first_present(df1, ["skin_group_label", "inferred_skin", "inferred_skin_" + m, "skin_group"]))

    part1 = {}
    mask_m = (genders == "male") & complete
    mask_f = (genders == "female") & complete
    n_m = int(mask_m.sum())
    n_f = int(mask_f.sum())
    if n_m > 0 and n_f > 0 and feat.size > 0:
        ed_obs, p_perm = _perm_energy_p(feat, mask_m, mask_f, n_perms, rng)
    else:
        ed_obs, p_perm = float('nan'), 1.0
        if n_m == 0 or n_f == 0:
            print(f"  WARN part1 gender groups insufficient for {m}: n_m={n_m}, n_f={n_f}")
    part1['gender_energy_distance'] = float(ed_obs) if not math.isnan(ed_obs) else None
    part1['gender_perm_p'] = ensure_p(p_perm)

    part1['skin_pairwise_perm_p'] = {}
    # The pairwise tests run a fifth of the permutations, but at least 200.
    pair_perms = max(200, int(n_perms / 5))
    for a, b in _SKIN_PAIRS:
        mask_a = (skins == a) & complete
        mask_b = (skins == b) & complete
        na = int(mask_a.sum())
        nb = int(mask_b.sum())
        if na > 1 and nb > 1 and feat.size > 0:
            _, p_ab = _perm_energy_p(feat, mask_a, mask_b, pair_perms, rng)
        else:
            p_ab = 1.0
            if na <= 1 or nb <= 1:
                print(f"  WARN part1 skin pair insufficient {a} vs {b} for {m}: na={na}, nb={nb}")
        part1['skin_pairwise_perm_p'][f"{a}_vs_{b}"] = ensure_p(p_ab)
    return part1


def _mention_counts(flags, mask):
    """Return ``(x, n)``: rows in *mask* whose flag equals 1, and rows in *mask*."""
    n = int(mask.sum())
    x = int(np.sum(flags[mask] == 1)) if n > 0 else 0
    return x, n


def _gender_chi2_p(df, flag_col, gender_col):
    """Chi-square p-value of a 0/1 flag against male/female; 1.0 if untestable."""
    if flag_col is None or gender_col is None:
        return 1.0
    flags = pd.to_numeric(df[flag_col], errors='coerce').to_numpy(dtype=float)
    genders = _labels(df, gender_col)
    xm, nm = _mention_counts(flags, genders == "male")
    xf, nf = _mention_counts(flags, genders == "female")
    try:
        _, chi_p, _, _ = stats.chi2_contingency(np.array([[nm - xm, xm], [nf - xf, xf]]))
    except ValueError:
        # Raised when a margin of the table is zero; treated as "no evidence".
        chi_p = None
    return ensure_p(chi_p)


def _skin_pairwise_prop_p(df, flag_col, skin_col):
    """Two-proportion z-test p-values of a 0/1 flag for each skin-group pair."""
    if flag_col is None or skin_col is None:
        return {f"{a}_vs_{b}": 1.0 for a, b in _SKIN_PAIRS}
    flags = pd.to_numeric(df[flag_col], errors='coerce').to_numpy(dtype=float)
    skins = _labels(df, skin_col)
    pvals = {}
    for a, b in _SKIN_PAIRS:
        xa, na = _mention_counts(flags, skins == a)
        xb, nb = _mention_counts(flags, skins == b)
        _, p = two_prop_ztest(xa, na, xb, nb)
        pvals[f"{a}_vs_{b}"] = ensure_p(p)
    return pvals


def _part3_pvalues(df3, m):
    """Gender chi-square and skin pairwise p-values for the mention flags."""
    flag_gender = _first_present(df3, ["gender_mention", f"gender_mention_{m}"])
    flag_race = _first_present(df3, ["race_mention", f"race_mention_{m}"])
    gender_col = _first_present(df3, ["inferred_gender", "gender_label", f"inferred_gender_{m}"])
    skin_col = _first_present(df3, ["inferred_skin", "skin_group_label", f"inferred_skin_{m}", "skin_group"])
    return {
        'gender_tokens_chi2_p': _gender_chi2_p(df3, flag_gender, gender_col),
        'race_tokens_chi2_p': _gender_chi2_p(df3, flag_race, gender_col),
        'skin_pairwise_gender_mention_p': _skin_pairwise_prop_p(df3, flag_gender, skin_col),
        'skin_pairwise_race_mention_p': _skin_pairwise_prop_p(df3, flag_race, skin_col),
    }


def _two_sample_p(x, y):
    """Return ``(welch_t_p, mann_whitney_p)``, or None with <2 finite values."""
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]
    if len(x) < 2 or len(y) < 2:
        return None
    _, t_p = stats.ttest_ind(x, y, equal_var=False, nan_policy='omit')
    mw = stats.mannwhitneyu(x, y, alternative='two-sided')
    return ensure_p(t_p), ensure_p(mw.pvalue)


def _numeric_metric_pvalues(df, value_names, m):
    """Gender and skin-pair p-values for one numeric per-caption metric.

    Every entry starts at 1.0 and is overwritten only when the corresponding
    test can be run, so the returned structure is always complete.
    """
    result = {
        'gender_t_p': 1.0,
        'gender_mw_p': 1.0,
        'skin_pairwise': {f"{a}_vs_{b}": {"t_p": 1.0, "mw_p": 1.0} for a, b in _SKIN_PAIRS},
    }
    if df is None:
        return result
    value_col = _first_present(df, value_names)
    if value_col is None:
        return result
    values = pd.to_numeric(df[value_col], errors='coerce').to_numpy(dtype=float)
    gender_col = _first_present(df, ["inferred_gender", "gender_label", f"inferred_gender_{m}"])
    skin_col = _first_present(df, ["inferred_skin", "skin_group_label", f"inferred_skin_{m}", "skin_group"])
    if gender_col is not None:
        genders = _labels(df, gender_col)
        tested = _two_sample_p(values[genders == "male"], values[genders == "female"])
        if tested is not None:
            result['gender_t_p'], result['gender_mw_p'] = tested
    if skin_col is not None:
        skins = _labels(df, skin_col)
        for a, b in _SKIN_PAIRS:
            tested = _two_sample_p(values[skins == a], values[skins == b])
            if tested is not None:
                result['skin_pairwise'][f"{a}_vs_{b}"] = {"t_p": tested[0], "mw_p": tested[1]}
    return result


# Fill out['models'][m] with a complete set of p-values for every model;
# anything that cannot be tested is reported as p = 1.0.
for m in MODEL_NAMES:
    print(f"\nProcessing model: {m}")
    res = {}

    df1 = csvs[m]["part1"]
    if df1 is None:
        print(f"  WARN part1 CSV missing for {m}; setting p-values to 1.0")
        res['part1'] = {
            "gender_perm_p": 1.0,
            "skin_pairwise_perm_p": {f"{a}_vs_{b}": 1.0 for a, b in _SKIN_PAIRS},
        }
    else:
        res['part1'] = _part1_pvalues(df1, m, N_PERMS, rng)

    df3 = csvs[m]["part3"]
    if df3 is None:
        print(f"  WARN part3 CSV missing for {m}; setting p-values to 1.0")
        res['part3'] = {
            'gender_tokens_chi2_p': 1.0,
            'race_tokens_chi2_p': 1.0,
            'skin_pairwise_gender_mention_p': {f"{a}_vs_{b}": 1.0 for a, b in _SKIN_PAIRS},
            'skin_pairwise_race_mention_p': {f"{a}_vs_{b}": 1.0 for a, b in _SKIN_PAIRS},
        }
    else:
        res['part3'] = _part3_pvalues(df3, m)

    res['part4'] = _numeric_metric_pvalues(csvs[m]["part4"], ["sentiment_compound", f"sentiment_compound_{m}"], m)
    res['part5'] = _numeric_metric_pvalues(csvs[m]["part5"], ["ttr", f"ttr_{m}"], m)

    out['models'][m] = res

# Persist the nested per-model p-values.
out_path = os.path.join(OUT_DIR, "part7_pvalues_filled.json")
with open(out_path, "w", encoding="utf-8") as jf:
    json.dump(out, jf, indent=2)
print("\nWrote JSON:", out_path)

# Flatten the same p-values into one row per (model, metric). Every row
# stores its value under "p"; the part1 gender row used to be written under a
# different key, which left its "p" cell empty in the CSV.
_GRID_PAIRS = ["light_vs_mid", "mid_vs_dark", "light_vs_dark"]
rows = []
for m in MODEL_NAMES:
    md = out['models'][m]
    rows.append({"model": m, "metric": "part1_gender_perm_p", "p": md['part1'].get('gender_perm_p', 1.0)})
    spp = md['part1'].get('skin_pairwise_perm_p', {})
    for pair in _GRID_PAIRS:
        rows.append({"model": m, "metric": f"part1_{pair}_perm_p", "p": spp.get(pair, 1.0)})

    rows.append({"model": m, "metric": "part3_gender_tokens_chi2_p", "p": md['part3'].get('gender_tokens_chi2_p', 1.0)})
    rows.append({"model": m, "metric": "part3_race_tokens_chi2_p", "p": md['part3'].get('race_tokens_chi2_p', 1.0)})
    for pair in _GRID_PAIRS:
        rows.append({"model": m, "metric": f"part3_skin_gender_mention_{pair}_p", "p": md['part3']['skin_pairwise_gender_mention_p'].get(pair, 1.0)})
        rows.append({"model": m, "metric": f"part3_skin_race_mention_{pair}_p", "p": md['part3']['skin_pairwise_race_mention_p'].get(pair, 1.0)})

    # Parts 4 and 5 share one layout: gender t / Mann-Whitney p-values plus
    # the same two p-values for each skin pair.
    for part in ("part4", "part5"):
        rows.append({"model": m, "metric": f"{part}_gender_t_p", "p": md[part].get('gender_t_p', 1.0)})
        rows.append({"model": m, "metric": f"{part}_gender_mw_p", "p": md[part].get('gender_mw_p', 1.0)})
        for pair in _GRID_PAIRS:
            pvals = md[part]['skin_pairwise'].get(pair, {"t_p": 1.0, "mw_p": 1.0})
            rows.append({"model": m, "metric": f"{part}_{pair}_t_p", "p": pvals.get("t_p", 1.0)})
            rows.append({"model": m, "metric": f"{part}_{pair}_mw_p", "p": pvals.get("mw_p", 1.0)})

df_rows = pd.DataFrame(rows)
csv_out = os.path.join(OUT_DIR, "part7_pvalues_grid.csv")
df_rows.to_csv(csv_out, index=False)
print("Wrote CSV:", csv_out)

print("\nDone. All requested p-values are present (non-null). If some tests lacked data, p=1.0 was used and a WARN was printed above.")
