# Alt-Text Data Analysis - Pandas

This notebook compares the content of AI-generated alt text for a set of 1,200 images. The vision-language models (VLMs) used were GPT-4.1, Claude Sonnet 4.5, and Google Gemini 2.5 Pro.

This notebook reads in the caption files as CSVs, splits them according to gender and race demographics, and analyzes trends in the appearance- and identity-based attributes that appear in the captions. It generates some preliminary tables and heatmaps to visualize the results.


In [None]:
# Colab-only setup: mounts Google Drive at /content/drive.
# NOTE(review): presumably the caption CSVs read below live on the mounted drive — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

In [None]:
# Load one caption table per model. The three paths are placeholders, listed in
# the order the tables are unpacked: ChatGPT, Claude, Gemini.
df_chatgpt, df_claude, df_gemini = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/path/to/file",  # ChatGPT captions
        "/path/to/file",  # Claude captions
        "/path/to/file",  # Gemini captions
    )
)
df_chatgpt

In [None]:
# split dataframe by filename syntax: now have gender and race separated

# Function to extract the first two numbers from the filename
def extract_numbers(filename):
  """Parse the two leading underscore-separated integers out of a filename.

  Elsewhere in this notebook the first number is used as the gender code and
  the second as the skin-tone code of the image.

  Args:
    filename: A name such as "1_3_xyz.jpg". Non-string values (for example the
      NaN pandas produces for a blank CSV cell) are tolerated.

  Returns:
    A tuple (first, second) of ints, or (None, None) when the value is not a
    string, has fewer than two underscore-separated parts, or either of the
    first two parts is not an integer.
  """
  # A missing filename arrives as a float NaN, which has no .split().
  if not isinstance(filename, str):
    return None, None

  parts = filename.split('_')
  if len(parts) < 2:
    return None, None

  try:
    return int(parts[0]), int(parts[1])
  except ValueError:
    # parts[0] or parts[1] is not an integer
    return None, None


In [None]:
# split for chatgpt
# Parse the two leading filename numbers into their own columns.
parsed_codes_chatgpt = df_chatgpt['Filename'].apply(lambda name: pd.Series(extract_numbers(name)))
df_chatgpt[['first_num', 'second_num']] = parsed_codes_chatgpt

# One dataframe per (first_num, second_num) pair, keyed 'df_chatgpt_<first>_<second>'.
# Insertion order is first-number-major, matching the original nested loops.
split_dataframes_chatgpt = {
    f'df_chatgpt_{first}_{second}': df_chatgpt[
        (df_chatgpt['first_num'] == first) & (df_chatgpt['second_num'] == second)
    ].copy()
    for first in [1, 2]
    for second in range(1, 7)
}

# Preview each split, flagging the ones that matched no rows.
for split_name, split_df in split_dataframes_chatgpt.items():
    print(f"\nDataframe for {split_name}:")
    if split_df.empty:
        print("DataFrame is empty.")
    else:
        display(split_df.head())

In [None]:
# split for claude
# Parse the two leading filename numbers into their own columns.
parsed_codes_claude = df_claude['Filename'].apply(lambda name: pd.Series(extract_numbers(name)))
df_claude[['first_num', 'second_num']] = parsed_codes_claude

# One dataframe per (first_num, second_num) pair, keyed 'df_claude_<first>_<second>'.
# Insertion order is first-number-major, matching the original nested loops.
split_dataframes_claude = {
    f'df_claude_{first}_{second}': df_claude[
        (df_claude['first_num'] == first) & (df_claude['second_num'] == second)
    ].copy()
    for first in [1, 2]
    for second in range(1, 7)
}

# Preview each split, flagging the ones that matched no rows.
for split_name, split_df in split_dataframes_claude.items():
    print(f"\nDataframe for {split_name}:")
    if split_df.empty:
        print("DataFrame is empty.")
    else:
        display(split_df.head())

In [None]:
# split for gemini
# Parse the two leading filename numbers into their own columns.
parsed_codes_gemini = df_gemini['Filename'].apply(lambda name: pd.Series(extract_numbers(name)))
df_gemini[['first_num', 'second_num']] = parsed_codes_gemini

# One dataframe per (first_num, second_num) pair, keyed 'df_gemini_<first>_<second>'.
# Insertion order is first-number-major, matching the original nested loops.
split_dataframes_gemini = {
    f'df_gemini_{first}_{second}': df_gemini[
        (df_gemini['first_num'] == first) & (df_gemini['second_num'] == second)
    ].copy()
    for first in [1, 2]
    for second in range(1, 7)
}

# Preview each split, flagging the ones that matched no rows.
for split_name, split_df in split_dataframes_gemini.items():
    print(f"\nDataframe for {split_name}:")
    if split_df.empty:
        print("DataFrame is empty.")
    else:
        display(split_df.head())

In [None]:
# Vocabulary searched for in the captions, one list per attribute category.
# The search helper matches whole words and is case-sensitive, so e.g. "Black"
# only matches the capitalised form. Overlapping entries ("brown hair" inside
# "light brown hair") are each searched for independently.

skin_strings = [
    "light skin", "medium skin", "dark skin", "tan skin",
    "olive skin", "fair skin", "brown skin", "warm-toned skin",
    "fair-skinned", "light-skinned", "dark-skinned", "tan-skinned",
    "tanned skin",
]

hair_strings = [
    "brown hair", "black hair", "red hair", "gray hair", "white hair",
    "light brown hair", "dark brown hair", "dreadlocks", "curly hair",
    "natural hair", "textured hair", "short hair", "styled hair",
    "light hair", "dark hair", "blonde", "wavy hair",
    "curly dark hair", "curly black hair", "afro", "braided hair",
]

facial_hair_strings = ["mustache", "beard", "sideburns"]

race_strings = [
    "African American", "Caucasian", "Indian", "Mexican", "Muslim", "Hindu",
    "Hispanic", "Latino", "Latina", "Black",
    "white man", "white male", "white woman", "white female",
    "South Asian", "white flight attendant",
]

gender_strings = [
    "woman", "man", "male", "female", "girl", "boy",
    "he", "she", "his", "her",
]


1. GPT-4.1 Search

In [None]:
import re

# function to find which of the given terms appear in a caption as whole words
def get_found_substrings(caption, substrings):
  """Return the terms from `substrings` that occur in `caption` as whole words.

  Matching is case-sensitive and uses regex word boundaries, so "man" does not
  match inside "woman". Every term is tested independently, which means
  overlapping terms (e.g. "brown hair" and "light brown hair") can both be
  reported for the same caption.

  Args:
    caption: The caption text. Non-string values (such as the NaN pandas uses
      for a missing caption) are treated as containing no terms.
    substrings: Iterable of literal terms to look for.

  Returns:
    A list of the matched terms in the order they appear in `substrings`, or
    None when nothing matched.
  """
  # re.search raises TypeError on NaN/None, so treat missing captions as "no match".
  if not isinstance(caption, str):
    return None

  found = [
      sub for sub in substrings
      # re.escape keeps terms such as "warm-toned skin" literal
      if re.search(r'\b' + re.escape(sub) + r'\b', caption)
  ]
  # Callers rely on None (not an empty list) so that notna()/dropna() work.
  return found or None

In [None]:
# chatgpt - search for strings

# column name -> vocabulary for every category other than skin tone
term_lists = {
    'contains_hair': hair_strings,
    'contains_facial_hair': facial_hair_strings,
    'contains_race': race_strings,
    'contains_gender': gender_strings
}

# Annotate every split in place: skin tone first, then the remaining categories.
# Each new column holds the list of matched terms, or None when nothing matched.
for split_name, split_df in split_dataframes_chatgpt.items():
    split_df['contains_skin'] = split_df['Caption'].apply(
        lambda text: get_found_substrings(text, skin_strings)
    )
    for column, vocabulary in term_lists.items():
        split_df[column] = split_df['Caption'].apply(
            lambda text: get_found_substrings(text, vocabulary)
        )

# display the head of each modified dataframe to verify (optional)
for split_name, split_df in split_dataframes_chatgpt.items():
    print(f"\nDataframe for {split_name} with found skin tone substrings:")
    display(split_df.head())

In [None]:
# For each ChatGPT split, report how many captions did / did not match any term per category.
for split_name, split_df in split_dataframes_chatgpt.items():
    print(f"\nCounts for {split_name}:")
    flag_columns = [name for name in split_df.columns if name.startswith('contains_')]
    for column in flag_columns:
        matched = split_df[column].notna().sum()
        unmatched = split_df[column].isna().sum()
        print(f"  {column}: {matched} non-None, {unmatched} None")

In [None]:
# Per split and per 'contains_' column, count how often each individual term was found (ChatGPT).
# Result shape: {split name: {column name: Series of term -> count}}.
chatgpt_term_frequencies = {}
for split_name, split_df in split_dataframes_chatgpt.items():
    per_column = chatgpt_term_frequencies.setdefault(split_name, {})
    print(f"\nFrequency counts for {split_name}:")
    for column in split_df.columns:
        if not column.startswith('contains_'):
            continue
        if not split_df[column].notna().any():
            print(f"  {column}: No terms found")
            continue
        # Flatten the per-caption lists into one long list of terms.
        flattened = []
        for found in split_df[column].dropna():
            flattened.extend(found)
        counts = pd.Series(flattened).value_counts()
        per_column[column] = counts
        print(f"  {column}:")
        display(counts)

2. Claude Sonnet 4.5 Search (repeat the same process)



In [None]:
# string counting for claude

# column name -> vocabulary for every category other than skin tone
term_lists = {
    'contains_hair': hair_strings,
    'contains_facial_hair': facial_hair_strings,
    'contains_race': race_strings,
    'contains_gender': gender_strings
}

# Annotate every split in place: skin tone first, then the remaining categories.
# Each new column holds the list of matched terms, or None when nothing matched.
for split_name, split_df in split_dataframes_claude.items():
    split_df['contains_skin'] = split_df['Caption'].apply(
        lambda text: get_found_substrings(text, skin_strings)
    )
    for column, vocabulary in term_lists.items():
        split_df[column] = split_df['Caption'].apply(
            lambda text: get_found_substrings(text, vocabulary)
        )

# Preview the annotated splits.
for split_name, split_df in split_dataframes_claude.items():
    print(f"\nDataframe for {split_name} with found skin tone substrings:")
    display(split_df.head())

In [None]:
# For each Claude split, report how many captions did / did not match any term per category.
for split_name, split_df in split_dataframes_claude.items():
    print(f"\nCounts for {split_name}:")
    flag_columns = [name for name in split_df.columns if name.startswith('contains_')]
    for column in flag_columns:
        matched = split_df[column].notna().sum()
        unmatched = split_df[column].isna().sum()
        print(f"  {column}: {matched} non-None, {unmatched} None")

In [None]:
# Per split and per 'contains_' column, count how often each individual term was found (Claude).
# Result shape: {split name: {column name: Series of term -> count}}.
claude_term_frequencies = {}
for split_name, split_df in split_dataframes_claude.items():
    per_column = claude_term_frequencies.setdefault(split_name, {})
    print(f"\nFrequency counts for {split_name}:")
    for column in split_df.columns:
        if not column.startswith('contains_'):
            continue
        if not split_df[column].notna().any():
            print(f"  {column}: No terms found")
            continue
        # Flatten the per-caption lists into one long list of terms.
        flattened = []
        for found in split_df[column].dropna():
            flattened.extend(found)
        counts = pd.Series(flattened).value_counts()
        per_column[column] = counts
        print(f"  {column}:")
        display(counts)

3. Gemini 2.5 Pro Search (repeat again)

In [None]:
# string counts for gemini

# column name -> vocabulary for every category other than skin tone
term_lists = {
    'contains_hair': hair_strings,
    'contains_facial_hair': facial_hair_strings,
    'contains_race': race_strings,
    'contains_gender': gender_strings
}

# Annotate every split in place: skin tone first, then the remaining categories.
# Each new column holds the list of matched terms, or None when nothing matched.
for split_name, split_df in split_dataframes_gemini.items():
    split_df['contains_skin'] = split_df['Caption'].apply(
        lambda text: get_found_substrings(text, skin_strings)
    )
    for column, vocabulary in term_lists.items():
        split_df[column] = split_df['Caption'].apply(
            lambda text: get_found_substrings(text, vocabulary)
        )

# Preview the annotated splits.
for split_name, split_df in split_dataframes_gemini.items():
    print(f"\nDataframe for {split_name} with found skin tone substrings:")
    display(split_df.head())

In [None]:
# For each Gemini split, report how many captions did / did not match any term per category.
for split_name, split_df in split_dataframes_gemini.items():
    print(f"\nCounts for {split_name}:")
    flag_columns = [name for name in split_df.columns if name.startswith('contains_')]
    for column in flag_columns:
        matched = split_df[column].notna().sum()
        unmatched = split_df[column].isna().sum()
        print(f"  {column}: {matched} non-None, {unmatched} None")

In [None]:
# Per split and per 'contains_' column, count how often each individual term was found (Gemini).
# Result shape: {split name: {column name: Series of term -> count}}.
gemini_term_frequencies = {}
for split_name, split_df in split_dataframes_gemini.items():
    per_column = gemini_term_frequencies.setdefault(split_name, {})
    print(f"\nFrequency counts for {split_name}:")
    for column in split_df.columns:
        if not column.startswith('contains_'):
            continue
        if not split_df[column].notna().any():
            print(f"  {column}: No terms found")
            continue
        # Flatten the per-caption lists into one long list of terms.
        flattened = []
        for found in split_df[column].dropna():
            flattened.extend(found)
        counts = pd.Series(flattened).value_counts()
        per_column[column] = counts
        print(f"  {column}:")
        display(counts)

Build the complete table!

In [None]:
import pandas as pd
from collections import Counter

def build_full_demographic_term_table_safe(chatgpt_dict, claude_dict, gemini_dict, list_cols=None):
    """Build one table of term counts per model and skin-tone bin.

    Each argument is a dict of split dataframes whose `list_cols` columns hold,
    per row, either a list of found terms or a missing value (None / NaN).

    The skin-tone code of a split is derived from its POSITION in the dict, not
    from its key: position `idx` is treated as skin tone `(idx % 6) + 1`. The
    dicts must therefore iterate in groups of six skin tones (1..6), as the
    split cells of this notebook create them.

    Skin tones are binned as light = 1-2, medium = 3-4, dark = 5-6, and counts
    are summed over all `list_cols` and over every split in the bin.

    Args:
        chatgpt_dict: Split dataframes for the "gpt" columns.
        claude_dict: Split dataframes for the "claude" columns.
        gemini_dict: Split dataframes for the "gemini" columns.
        list_cols: Names of the list-valued columns to count. Defaults to the
            five `contains_*` columns.

    Returns:
        A DataFrame with a `term` column (all terms seen in any model, sorted)
        and one integer column `<model>_<bin>` per model / tone-bin pair.
    """
    if list_cols is None:
        list_cols = ["contains_skin", "contains_hair", "contains_facial_hair",
                     "contains_race", "contains_gender"]

    models_dict = {
        "gpt": chatgpt_dict,
        "claude": claude_dict,
        "gemini": gemini_dict
    }

    tone_bins = {
        "light": [1, 2],
        "medium": [3, 4],
        "dark": [5, 6]
    }

    def _found_lists(df):
        """Yield every non-empty collection of found terms in df's list_cols."""
        for col in list_cols:
            for items in df[col]:
                # A missing entry may be NaN, which is truthy; test the type
                # instead of truthiness so it is never passed to update().
                if isinstance(items, (list, tuple, set, frozenset)) and items:
                    yield items

    # Step 1: Collect all unique terms across all models
    all_terms_set = set()
    for split_dict in models_dict.values():
        for df in split_dict.values():
            for items in _found_lists(df):
                all_terms_set.update(items)

    all_terms = sorted(all_terms_set)
    output = pd.DataFrame({"term": all_terms})

    # Step 2: Fill counts for each model + tone bin
    for model_name, split_dict in models_dict.items():
        for tone_label, skins in tone_bins.items():
            c = Counter()
            for idx, df in enumerate(split_dict.values()):
                # Position within each group of six encodes the skin tone (1..6).
                skin = (idx % 6) + 1
                if skin not in skins:
                    continue
                for items in _found_lists(df):
                    c.update(items)

            colname = f"{model_name}_{tone_label}"
            # Counter returns 0 for unseen terms, so every term gets a value.
            output[colname] = output['term'].map(c).fillna(0).astype(int)

    return output


In [None]:
# Terms whose count summed over all model / tone columns is below this are
# left out of the filtered view.
MIN_TOTAL_COUNT = 10

final_table = build_full_demographic_term_table_safe(
    split_dataframes_chatgpt,
    split_dataframes_claude,
    split_dataframes_gemini,
    list_cols=["contains_skin","contains_hair","contains_facial_hair","contains_race","contains_gender"]
)

# A bare `final_table` in the middle of a cell renders nothing; display it explicitly.
display(final_table)

# Compute the row-wise total across all columns except 'term'.
# 'total_count' is kept on final_table itself (only the filtered copy drops it).
final_table['total_count'] = final_table.drop(columns='term').sum(axis=1)

# Keep only rows where total_count >= MIN_TOTAL_COUNT, then drop the helper column.
final_table_filtered = (
    final_table[final_table['total_count'] >= MIN_TOTAL_COUNT]
    .drop(columns='total_count')
)

final_table_filtered



In [None]:
# Export the unfiltered table. At this point final_table still carries the
# 'total_count' column; final_table_filtered is not what gets written.
# NOTE(review): 'path/to/file' is a placeholder; to_excel presumably needs a path with an
# Excel extension such as .xlsx to pick a writer engine — confirm before running.
final_table.to_excel('path/to/file')

In [None]:
# Function to display term frequencies per split dataframe for a given model
def display_split_frequencies(term_frequencies_dict, model_name):
  """Print and display every per-column term count of every split.

  Args:
    term_frequencies_dict: {split name: {column name: Series of term counts}}.
    model_name: Label used in the printed heading.
  """
  print(f"\nTerm Frequencies per Split Dataframe for {model_name}:")
  for split_name in term_frequencies_dict:
    print(f"\n  Dataframe: {split_name}")
    per_column = term_frequencies_dict[split_name]
    for column in per_column:
      print(f"    {column}:")
      # Most frequent term first.
      ordered = per_column[column].sort_values(ascending=False)
      display(ordered)

# Function to calculate total frequency of each term across all dataframes for a model
def calculate_total_frequencies(term_frequencies_dict):
  """Sum each term's count over every split and every column.

  Args:
    term_frequencies_dict: {split name: {column name: term -> count mapping}},
      where each mapping supports `.items()` (e.g. a pandas Series).

  Returns:
    A dict of term -> total count, in order of first appearance.
  """
  totals = {}
  for per_column in term_frequencies_dict.values():
    for counts in per_column.values():
      for term, count in counts.items():
        totals[term] = totals.get(term, 0) + count
  return totals


# Calculate term frequencies per split dataframe for each model
# Assuming the cells above already populated split_dataframes_chatgpt, split_dataframes_claude, and split_dataframes_gemini

def _term_frequencies_by_split(split_dataframes):
  """Count individual terms per split and per 'contains_' column.

  Every column whose name starts with 'contains_' is included. Iterating over
  `term_lists` instead would skip 'contains_skin', which is not in that dict,
  and skin-tone terms would silently vanish from every table and heatmap
  built from these frequencies.

  Args:
    split_dataframes: {split name: dataframe with list-valued 'contains_' columns}.

  Returns:
    {split name: {column name: Series of term -> count}}; columns in which no
    caption matched any term are omitted.
  """
  frequencies = {}
  for df_name, dataframe in split_dataframes.items():
    frequencies[df_name] = {}
    for col_name in dataframe.columns:
      if not col_name.startswith('contains_'):
        continue
      found_lists = dataframe[col_name].dropna()
      if found_lists.empty:
        continue
      all_terms = [item for sublist in found_lists for item in sublist]
      frequencies[df_name][col_name] = pd.Series(all_terms).value_counts()
  return frequencies


chatgpt_term_frequencies = _term_frequencies_by_split(split_dataframes_chatgpt)
claude_term_frequencies = _term_frequencies_by_split(split_dataframes_claude)
gemini_term_frequencies = _term_frequencies_by_split(split_dataframes_gemini)


# Calculate total frequencies for each model
total_frequencies_chatgpt = calculate_total_frequencies(chatgpt_term_frequencies)
total_frequencies_claude = calculate_total_frequencies(claude_term_frequencies)
total_frequencies_gemini = calculate_total_frequencies(gemini_term_frequencies)


# Display split frequencies for each model
# display_split_frequencies(chatgpt_term_frequencies, "ChatGPT")
# display_split_frequencies(claude_term_frequencies, "Claude")
# display_split_frequencies(gemini_term_frequencies, "Gemini")

In [None]:
# One single-column frame per model, indexed by term.
df_total_chatgpt = pd.DataFrame.from_dict(
    total_frequencies_chatgpt, orient='index', columns=['ChatGPT Total Count'])
df_total_claude = pd.DataFrame.from_dict(
    total_frequencies_claude, orient='index', columns=['Claude Total Count'])
df_total_gemini = pd.DataFrame.from_dict(
    total_frequencies_gemini, orient='index', columns=['Gemini Total Count'])

# Outer-join on the term index so a term used by only some models is kept;
# its missing counts become 0.
total_comparison_table = (
    df_total_chatgpt
    .merge(df_total_claude, left_index=True, right_index=True, how='outer')
    .merge(df_total_gemini, left_index=True, right_index=True, how='outer')
    .fillna(0)
    .astype(int)
)

# Order rows by their combined count across the three models; the helper
# 'Total' column is only used for sorting and is removed again.
total_comparison_table['Total'] = total_comparison_table.sum(axis=1)
total_comparison_table = (
    total_comparison_table
    .sort_values(by='Total', ascending=False)
    .drop('Total', axis=1)
)

# Display the total comparison table
print("\nTotal Term Frequencies Across All Split Dataframes:")
display(total_comparison_table)

Comparing models' term counts

In [None]:
def compare_split_frequencies_tables(chatgpt_term_frequencies, claude_term_frequencies, gemini_term_frequencies):
    """
    Compares term frequencies across split dataframes for each model and displays them in tables.

    For each of the 12 expected splits (first number 1-2, second number 1-6)
    one table is displayed with a row per term and the columns
    'ChatGPT Count', 'Claude Count', 'Gemini Count' and 'Total', sorted by
    'Total' in descending order. Counts are summed over every column present
    in a split's frequency dict, so no column list has to be kept in sync here.

    Args:
        chatgpt_term_frequencies: {'df_chatgpt_<a>_<b>': {column: Series of term counts}}.
        claude_term_frequencies: Same structure, keyed 'df_claude_<a>_<b>'.
        gemini_term_frequencies: Same structure, keyed 'df_gemini_<a>_<b>'.

    Returns:
        None. The tables are printed / displayed as a side effect.
    """
    print("\nComparing Term Frequencies per Split Dataframe Across Models:")

    count_labels = ['ChatGPT Count', 'Claude Count', 'Gemini Count']

    for first in [1, 2]:
        for second in range(1, 7):
            suffix = f'{first}_{second}'
            df_name_chatgpt = f'df_chatgpt_{suffix}'

            print(f"\n--- Comparison Table for {df_name_chatgpt} ---")

            # Frequencies of this split per model; a missing split counts as empty.
            counts_by_label = {
                'ChatGPT Count': chatgpt_term_frequencies.get(df_name_chatgpt, {}),
                'Claude Count': claude_term_frequencies.get(f'df_claude_{suffix}', {}),
                'Gemini Count': gemini_term_frequencies.get(f'df_gemini_{suffix}', {}),
            }

            combined_split_counts = {}
            for label, counts_split in counts_by_label.items():
                for col_counts in counts_split.values():
                    for term, count in col_counts.items():
                        # Every term starts with a zero for each model.
                        row = combined_split_counts.setdefault(term, dict.fromkeys(count_labels, 0))
                        row[label] += count

            comparison_split_table = pd.DataFrame.from_dict(combined_split_counts, orient='index')
            comparison_split_table = comparison_split_table.fillna(0).astype(int)
            comparison_split_table['Total'] = comparison_split_table.sum(axis=1)
            comparison_split_table = comparison_split_table.sort_values(by='Total', ascending=False)

            display(comparison_split_table)

In [None]:
# Show one ChatGPT / Claude / Gemini comparison table per split.
compare_split_frequencies_tables(
    chatgpt_term_frequencies=chatgpt_term_frequencies,
    claude_term_frequencies=claude_term_frequencies,
    gemini_term_frequencies=gemini_term_frequencies,
)

In [None]:
# sample heatmap

import seaborn as sns
import matplotlib.pyplot as plt

# ChatGPT split names for the six second-number groups under first number 1, then 2.
first_six_dfs, second_six_dfs = (
    [f'df_chatgpt_{first}_{second}' for second in range(1, 7)]
    for first in (1, 2)
)

def prepare_heatmap_data(term_frequencies_dict, df_names, term_types):
    """Build a term x split matrix of counts for a heatmap.

    The frequencies are read from `term_frequencies_dict` itself, so the
    function works for any model as long as `df_names` are keys of that dict.

    Args:
        term_frequencies_dict: {split name: {column name: term -> count mapping}}.
        df_names: Split names to use as columns, in display order.
        term_types: Column names (e.g. 'contains_hair') whose terms to include.

    Returns:
        An integer DataFrame indexed by term with exactly one column per entry
        of `df_names`, in that order. Splits with no terms stay in the table as
        all-zero columns so column positions always line up with `df_names`.
        Counts of a term found under several `term_types` are added together.
    """
    heatmap_data = {}
    for df_name in df_names:
        counts_split = term_frequencies_dict.get(df_name, {})
        for term_type in term_types:
            for term, count in counts_split.get(term_type, {}).items():
                per_split = heatmap_data.setdefault(term, {})
                per_split[df_name] = per_split.get(df_name, 0) + count

    heatmap_df = pd.DataFrame.from_dict(heatmap_data, orient='index')

    # reindex keeps the requested column order and adds any split that had no
    # terms; the resulting NaNs become 0.
    heatmap_df = heatmap_df.reindex(columns=list(df_names)).fillna(0).astype(int)

    return heatmap_df

all_term_types = [
    'contains_skin',
    'contains_hair',
    'contains_facial_hair',
    'contains_race',
    'contains_gender',
]

# Split names for the other two models are derived from the ChatGPT ones.
first_six_claude = [name.replace('chatgpt', 'claude') for name in first_six_dfs]
first_six_gemini = [name.replace('chatgpt', 'gemini') for name in first_six_dfs]
second_six_claude = [name.replace('chatgpt', 'claude') for name in second_six_dfs]
second_six_gemini = [name.replace('chatgpt', 'gemini') for name in second_six_dfs]

# male data
heatmap_first_six_chatgpt = prepare_heatmap_data(chatgpt_term_frequencies, first_six_dfs, all_term_types)
heatmap_first_six_claude = prepare_heatmap_data(claude_term_frequencies, first_six_claude, all_term_types)
heatmap_first_six_gemini = prepare_heatmap_data(gemini_term_frequencies, first_six_gemini, all_term_types)

# female data
heatmap_second_six_chatgpt = prepare_heatmap_data(chatgpt_term_frequencies, second_six_dfs, all_term_types)
heatmap_second_six_claude = prepare_heatmap_data(claude_term_frequencies, second_six_claude, all_term_types)
heatmap_second_six_gemini = prepare_heatmap_data(gemini_term_frequencies, second_six_gemini, all_term_types)


def plot_heatmaps(heatmap_dataframes, title):
    """Draw one annotated heatmap of term counts (rows) per split (columns).

    Args:
        heatmap_dataframes: A single integer DataFrame indexed by term with one
            column per split, e.g. the output of prepare_heatmap_data.
        title: Figure title.

    Returns:
        None. The figure is shown as a side effect; an empty table is reported
        with a message instead of being plotted.
    """
    if heatmap_dataframes.empty:
        # Nothing to draw; a zero-height figure would fail.
        print(f"{title}: no data to plot")
        return

    n_terms, n_splits = heatmap_dataframes.shape

    # Scale the height with the number of terms, with a floor for short tables.
    plt.figure(figsize=(12, max(2, n_terms * 0.5)))
    sns.heatmap(heatmap_dataframes, annot=True, fmt="d", cmap="YlGnBu")
    plt.title(title)
    plt.xlabel("Fitzpatrick Skin Tone")
    plt.ylabel("Terms")

    # Label each column with the trailing number of its split name
    # ('df_chatgpt_1_3' -> '3') so labels follow the columns actually present.
    tick_labels = [str(col).rsplit('_', 1)[-1] for col in heatmap_dataframes.columns]
    # Heatmap cells span [i, i + 1]; i + 0.5 puts the tick at the cell centre.
    plt.xticks(ticks=[i + 0.5 for i in range(n_splits)], labels=tick_labels)
    plt.tight_layout()
    plt.show()

# Plot one heatmap per model: the three men's tables first, then the three
# women's tables. Within each heatmap the columns run over the six skin-tone groups.
heatmaps_to_plot = [
    (heatmap_first_six_chatgpt, "ChatGPT Appearance Term Frequency (Men)"),
    (heatmap_first_six_claude, "Claude Appearance Term Frequency (Men)"),
    (heatmap_first_six_gemini, "Gemini Appearance Term Frequency (Men)"),
    (heatmap_second_six_chatgpt, "ChatGPT Appearance Term Frequency (Women)"),
    (heatmap_second_six_claude, "Claude Appearance Term Frequency (Women)"),
    (heatmap_second_six_gemini, "Gemini Appearance Term Frequency (Women)"),
]

for heatmap_table, heatmap_title in heatmaps_to_plot:
    plot_heatmaps(heatmap_table, heatmap_title)