In [1]:
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm
import json

## Load Perspective API scores and calculate MaxToxicity

In [12]:
def load_perspective_scores(output_dir, dataset_type):
    """Load and normalise the Perspective API score CSVs found in ``output_dir``.

    Every file in ``output_dir`` whose name starts with
    ``perspective_api_all_occupations_all_gens`` and ends with ``.csv`` is read.
    The frames are concatenated in sorted filename order so that the row order
    does not depend on the order in which the OS lists the directory.

    Parameters
    ----------
    output_dir : str or path-like
        Directory that contains the score CSVs.
    dataset_type : str
        Value written to the ``dataset_type`` column when the CSVs do not
        already provide that column.

    Returns
    -------
    pandas.DataFrame
        The concatenated scores with these columns added or rewritten:
        ``model_name`` (last ``/``-separated component, ``-hf`` removed),
        ``prompt_original`` (the untouched ``prompt`` value), ``prompt``
        (everything before the last ``_``) and ``seed`` (the text after the
        last ``_``, kept as a string). The index is the one produced by
        ``pd.concat`` and may therefore repeat across input files.

    Raises
    ------
    ValueError
        If no matching CSV file exists in ``output_dir``.
    """
    prefix = 'perspective_api_all_occupations_all_gens'
    # Sorted so that repeated runs concatenate the files in the same order.
    files = sorted(
        name for name in os.listdir(output_dir)
        if name.startswith(prefix) and name.endswith('.csv')
    )
    print(files)
    if not files:
        raise ValueError(
            f"No '{prefix}*.csv' files found in {output_dir!r}"
        )

    perspective_scores = pd.concat(
        [pd.read_csv(os.path.join(output_dir, name)) for name in files]
    )
    if 'dataset_type' not in perspective_scores.columns:
        perspective_scores = perspective_scores.assign(dataset_type=dataset_type)

    perspective_scores['model_name'] = perspective_scores['model_name'].apply(
        lambda x: x.split('/')[-1].replace('-hf', '')
    )
    # Keep the full prompt id, then split it into "<prompt>_<seed>".
    perspective_scores['prompt_original'] = perspective_scores['prompt']
    perspective_scores['prompt'] = perspective_scores['prompt_original'].apply(
        lambda x: '_'.join(x.split('_')[:-1])
    )
    perspective_scores = perspective_scores.assign(
        seed=perspective_scores['prompt_original'].apply(lambda x: x.split('_')[-1])
    )

    # `dataset_type` is guaranteed to exist here (read from the CSVs or assigned above).
    print(perspective_scores[['model_name', 'dataset_type']].value_counts())

    return perspective_scores


def format_df_agg(df_agg, agg_cols, col, dataset, prompt):
    """Summarise each column of ``df_agg`` and label the result.

    ``DataFrame.describe`` is run with percentiles 5%, 10%, ..., 95%. The
    summary is transposed so that every described column becomes a row, and
    the first of those rows is discarded by position (the callers in this
    notebook pass frames whose first numeric column is ``im_index``).

    Parameters
    ----------
    df_agg : pandas.DataFrame
        Frame whose columns are summarised.
    agg_cols : list of str
        Names of the ``describe`` statistics to keep, e.g. ``'mean'``, ``'90%'``.
    col, dataset, prompt
        Constant labels written to the ``score``, ``dataset`` and ``prompt``
        columns of the result.

    Returns
    -------
    pandas.DataFrame
        One row per remaining described column, with columns
        ``score``, ``dataset``, ``prompt``, ``model`` followed by ``agg_cols``.
    """
    quantiles = [pct / 100 for pct in range(5, 100, 5)]
    summary = df_agg.describe(percentiles=quantiles).T

    # Positional drop of the first described column, then keep the requested stats.
    summary = summary.iloc[1:][agg_cols]

    # The former column labels become a regular column called 'model'.
    summary = summary.rename_axis('model').reset_index()
    summary = summary.assign(score=col, dataset=dataset, prompt=prompt)

    ordered_cols = ['score', 'dataset', 'prompt', 'model'] + agg_cols
    return summary[ordered_cols]


def calc_max_toxicity(perspective_scores, score_cols=None):
    """Aggregate per-group maximum scores for every dataset / prompt / model.

    For each score column, dataset and prompt, the scores are pivoted to one
    column per ``model_name`` and grouped by ``filename_prefix``, ``rank``,
    ``image_set_seed`` and ``prompt_original``. Within each group (i.e. across
    ``im_index``) three things are computed per model: the maximum score, the
    spread ``max - min``, and the ``im_index`` at which the maximum occurs.

    Parameters
    ----------
    perspective_scores : pandas.DataFrame
        Must contain the columns ``prompt``, ``dataset_type``, ``model_name``,
        ``filename_prefix``, ``rank``, ``image_set_seed``, ``im_index``,
        ``prompt_original`` and every column named in ``score_cols``.
    score_cols : str or list of str, optional
        Score column(s) to aggregate. Defaults to ``['TOXICITY']``.

    Returns
    -------
    list of pandas.DataFrame
        ``[max_summary, diff_summary, max_im_index]`` where the first two are
        the ``format_df_agg`` summaries of the per-group maxima and of the
        per-group ``max - min`` spread, and the third holds, per group and
        model, the ``im_index`` of the highest-scoring row (NaN when the model
        has no score in that group).

    Raises
    ------
    ValueError
        From ``pd.concat`` when no (dataset, prompt) combination has any rows.
    """
    if score_cols is None:
        score_cols = ['TOXICITY']
    elif isinstance(score_cols, str):
        score_cols = [score_cols]

    prompts = perspective_scores['prompt'].unique()
    datasets = perspective_scores['dataset_type'].unique()
    agg_cols = ['mean', 'std', '25%', '50%', '75%', '90%', 'max']
    group_keys = ['filename_prefix', 'rank', 'image_set_seed', 'prompt_original']
    pivot_keys = ['filename_prefix', 'rank', 'image_set_seed', 'im_index', 'prompt_original']

    df_list = []
    df_diff_list = []
    df_max_id_list = []
    for col in tqdm(score_cols):
        for dataset in datasets:
            for prompt in prompts:
                subset = perspective_scores[
                    (perspective_scores['prompt'] == prompt)
                    & (perspective_scores['dataset_type'] == dataset)
                ]
                # A prompt may not exist for every dataset; nothing to aggregate then.
                if subset.empty:
                    continue

                # One row per (group keys, im_index), one column per model.
                # reset_index() leaves a 0..n-1 RangeIndex, so idxmax labels
                # below can be used to look rows up in df_agg.
                df_agg = pd.pivot_table(
                    subset, values=col, columns='model_name', index=pivot_keys
                ).reset_index()

                grouped = df_agg.groupby(group_keys)
                df_agg_max = grouped.max()
                df_agg_diff = df_agg_max - grouped.min()

                # idxmax yields the df_agg row label of each model's maximum;
                # translate that label into the im_index stored on that row.
                df_agg_id_max = grouped.idxmax().drop(columns='im_index')
                for model in df_agg_id_max.columns:
                    df_agg_id_max[model] = df_agg_id_max[model].map(df_agg['im_index'])
                df_agg_id_max = df_agg_id_max.reset_index().assign(
                    score=col, dataset=dataset, prompt=prompt
                )

                df_list.append(format_df_agg(df_agg_max, agg_cols, col, dataset, prompt))
                df_diff_list.append(format_df_agg(df_agg_diff, agg_cols, col, dataset, prompt))
                df_max_id_list.append(df_agg_id_max)

    return [pd.concat(df_list), pd.concat(df_diff_list), pd.concat(df_max_id_list)]

In [13]:
# Load every Perspective API score file for the physical_gender run; the same
# label is used for `dataset_type` when the CSVs do not carry that column.
perspective_scores = load_perspective_scores(
    output_dir='outputs_vllm/physical_gender',
    dataset_type='physical_gender',
)

['perspective_api_all_occupations_all_gens_2_prompts_0.csv']
model_name              dataset_type   
InternVL3-2B            physical_gender    4080
Qwen2.5-VL-3B-Instruct  physical_gender    4080
SmolVLM2-2.2B-Instruct  physical_gender    4080
deepseek-vl2-tiny       physical_gender    4066
Name: count, dtype: int64


In [14]:
# Unpack the three frames returned by calc_max_toxicity: the summary of the
# per-group maxima, the summary of the per-group (max - min) spread, and the
# im_index at which each model reaches its per-group maximum.
max_toxicity, max_toxicity_diff, max_toxicity_id = calc_max_toxicity(perspective_scores)

100%|██████████| 1/1 [00:00<00:00,  2.34it/s]


In [15]:
# Display the per-model summary statistics of the per-group maximum scores.
max_toxicity

Unnamed: 0,score,dataset,prompt,model,mean,std,25%,50%,75%,90%,max
0,TOXICITY,physical_gender,keywords_characteristics,InternVL3-2B,0.10892,0.088028,0.038049,0.076481,0.157253,0.253013,0.401485
1,TOXICITY,physical_gender,keywords_characteristics,Qwen2.5-VL-3B-Instruct,0.136902,0.081158,0.065797,0.124336,0.182691,0.241222,0.479008
2,TOXICITY,physical_gender,keywords_characteristics,SmolVLM2-2.2B-Instruct,0.148425,0.107819,0.076877,0.110732,0.177848,0.324406,0.507899
3,TOXICITY,physical_gender,keywords_characteristics,deepseek-vl2-tiny,0.122869,0.095251,0.060187,0.104579,0.151172,0.228699,0.584095
0,TOXICITY,physical_gender,keywords_personality,InternVL3-2B,0.049496,0.043069,0.027413,0.034277,0.048595,0.092493,0.241222
1,TOXICITY,physical_gender,keywords_personality,Qwen2.5-VL-3B-Instruct,0.103557,0.071353,0.038461,0.089143,0.164187,0.193523,0.347574
2,TOXICITY,physical_gender,keywords_personality,SmolVLM2-2.2B-Instruct,0.038136,0.046064,0.021756,0.026087,0.032451,0.055175,0.38115
3,TOXICITY,physical_gender,keywords_personality,deepseek-vl2-tiny,0.026807,0.021143,0.018786,0.02102,0.026882,0.037554,0.233071


## Count frequency of competency words

In [16]:
# Load the raw generations and split the `keywords_characteristics_0` text into
# lower-cased, whitespace-separated tokens with commas and full stops removed.
generations = pd.read_csv('outputs_vllm/physical_gender/generations.csv')
generations = generations.assign(
    text_tokens=(
        generations['keywords_characteristics_0']
        .str.lower()
        # regex=False is spelled out on purpose: on pandas < 2.0 the default
        # was regex=True, where '.' matches (and would delete) every character.
        .str.replace(',', '', regex=False)
        .str.replace('.', '', regex=False)
        .str.split()
    )
)

In [17]:
# Read the competency vocabulary: one entry per line of the file, with the
# newline character removed.
with open('competency_words.txt', 'r') as f:
    competence_tokens = [entry.replace('\n', '') for entry in f]

In [18]:
# Build the lookup set once instead of re-scanning the word list for every row.
competence_vocab = set(competence_tokens)

# Distinct competency words present in each generation. Only the token column
# is needed, so apply over that Series rather than row-wise over the whole
# frame. Rows without a token list (missing text) get an empty set.
generations = generations.assign(
    competence_words=generations['text_tokens'].apply(
        lambda tokens: competence_vocab.intersection(tokens) if isinstance(tokens, list) else set()
    )
)
# Number of distinct competency words matched per generation.
generations = generations.assign(competence_count=generations['competence_words'].str.len())

In [19]:
# Summary statistics of the number of distinct competency words matched per generation row.
generations['competence_count'].describe()

count    202080.000000
mean          0.953350
std           1.273863
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max          15.000000
Name: competence_count, dtype: float64