Adapted from chair.py for thesis experimentation. 

In [1]:
# Load the saved imid_to_objects dictionary (image id -> list of object names)
# from '../data/coco/imid_to_objects.json'.
import json

with open('../data/coco/imid_to_objects.json', 'r') as f:
    imid_to_objects = json.load(f)

# Sanity check: show the first five entries, then the total number of images.
print("Sample data from imid_to_objects:")
for imid in list(imid_to_objects)[:5]:
    objects = imid_to_objects[imid]
    print(f"Image ID: {imid}, Objects: {objects}")
print(f"Total number of images: {len(imid_to_objects)}")

Sample data from imid_to_objects:
Image ID: 558840, Objects: ['car', 'dining table', 'hot dog', 'cup', 'spoon', 'bottle', 'person']
Image ID: 200365, Objects: ['car', 'dining table', 'hot dog', 'cup', 'bicycle']
Image ID: 495357, Objects: ['dog', 'person', 'motorcycle', 'handbag']
Image ID: 116061, Objects: ['car', 'dog', 'handbag', 'bottle', 'person', 'motorcycle', 'bus']
Image ID: 16164, Objects: ['dog', 'toilet']
Total number of images: 40504


In [2]:
# load generated captions from the tab-separated file '../data/smolvlm_m2_vti.tsv'
import pandas as pd
caption_df = pd.read_csv('../data/smolvlm_m2_vti.tsv', sep='\t')
# print the first few rows of the captions DataFrame
print("Sample data from captions DataFrame:")
display(caption_df.head())

# the captions scored later in this notebook are the values of the 'prompt1' column
caps = caption_df['prompt1'].tolist()

print(f"Total number of captions: {len(caps)}")
print(f"Sample captions: {caps[:5]}")


Sample data from captions DataFrame:


Unnamed: 0,index,prompt1,prompt2,prompt3,prompt4
0,0,In this image we can see a person holding an u...,Yes.,Male.,Male.
1,1,A kitchen with a white door and a stove.,No.,Male.,Female.
2,2,A girl is holding a cat in her arms.,Yes.,GIRL.,GIRL.
3,3,"In this picture we can see a toilet, bottle, r...",No.,Male.,Female.
4,4,In this image we can see a washroom. There are...,No.,Male.,Male.


Total number of captions: 5000
Sample captions: ['In this image we can see a person holding an umbrella and holding a rope. There are animals. There are people. There are trees. There are houses. There are clouds in the sky.', 'A kitchen with a white door and a stove.', 'A girl is holding a cat in her arms.', 'In this picture we can see a toilet, bottle, roll of paper, and a wall.', 'In this image we can see a washroom. There are two sinks, a fire extinguisher, a mirror, a pipe, a light, a switch board, a pipe, a wall, a door, a floor, a wall, a pipe']


In [3]:
# Image ids of the validation split of the Hugging Face dataset 'yerevann/coco-karpathy'.
# The commented-out snippet below shows how the cached JSON file can be regenerated:
# from datasets import load_dataset
# dataset = load_dataset('yerevann/coco-karpathy', split='validation')
# eval_imids = list(dataset['cocoid'])

# # write eval_imids to the file '../data/coco/karpathy_eval_imids.json'
# with open('../data/coco/karpathy_eval_imids.json', 'w') as f:
#     json.dump(eval_imids, f)

# read eval_imids from the file '../data/coco/karpathy_eval_imids.json'
with open('../data/coco/karpathy_eval_imids.json', 'r') as f:
    eval_imids = json.load(f)

# quick look at the first ids and the total count
print(eval_imids[:5])
print(f"Total number of eval image ids: {len(eval_imids)}")

[184613, 403013, 562150, 360772, 340559]
Total number of eval image ids: 5000


In [4]:
# Read synonyms.txt: every non-empty line is a ', '-separated group of terms,
# and the first term of a line is used as the canonical name for the group.
with open('synonyms.txt', 'r') as f:
    synonyms_txt = f.readlines()

print("Synonyms Loaded Successfully!")

synonyms = []
for line in synonyms_txt:
    stripped = line.strip()
    if stripped:
        synonyms.append(stripped.split(', '))

# mscoco objects and *all* synonyms, flattened in file order
mscoco_objects = [term for group in synonyms for term in group]
# maps every term (the canonical name included) to the first term of its group;
# if a term appears in several groups, the last group wins
inverse_synonym_dict = {term: group[0] for group in synonyms for term in group}

#Hard-coded rules for implementing CHAIR metrics on MSCOCO

#common 'double words' in MSCOCO that should be treated as a single word
coco_double_words = [
    'motor bike', 'motor cycle', 'air plane', 'traffic light', 'street light',
    'traffic signal', 'stop light', 'fire hydrant', 'stop sign', 'parking meter',
    'suit case', 'sports ball', 'baseball bat', 'baseball glove', 'tennis racket',
    'wine glass', 'hot dog', 'cell phone', 'mobile phone', 'teddy bear',
    'hair drier', 'potted plant', 'bow tie', 'laptop computer', 'stove top oven',
    'hot dog', 'teddy bear', 'home plate', 'train track',
]

#Special cases in MSCOCO
#qualifiers like 'baby' or 'adult' animal would otherwise fire for the MSCOCO object 'person'.  'baby bird' --> 'bird'.
animal_words = ['bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'animal', 'cub']
#qualifiers like 'passenger' vehicle would otherwise fire for the MSCOCO object 'person'.  'passenger jet' --> 'jet'.
vehicle_words = ['jet', 'train']

#double_word_dict maps each two-word phrase to the word it is treated as in the analysis

#by default a known double word stands for itself
double_word_dict = {phrase: phrase for phrase in coco_double_words}
#'baby X' / 'adult X' collapse to the animal X
for animal in animal_words:
    for qualifier in ('baby', 'adult'):
        double_word_dict[f'{qualifier} {animal}'] = animal
#'passenger X' collapses to the vehicle X
double_word_dict.update({f'passenger {vehicle}': vehicle for vehicle in vehicle_words})
#explicit overrides; the key 'wine glas' is spelled that way on purpose --
#presumably to catch a lemmatizer that strips the final 's' of 'glass' (TODO confirm)
double_word_dict.update({
    'bow tie': 'tie',
    'toilet seat': 'toilet',
    'wine glas': 'wine glass',
})

Synonyms Loaded Successfully!


In [5]:
import nltk

# 'wordnet' backs the WordNetLemmatizer below. The Punkt models back
# nltk.word_tokenize, which caption_to_words calls; without them a fresh
# environment raises LookupError. Newer NLTK releases look for 'punkt_tab',
# older ones for 'punkt', so both are requested.
for nltk_resource in ('wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def caption_to_words(caption):
    '''
    Extract the MSCOCO object words mentioned in a caption.

    Input: caption (str)
    Output: tuple (words, node_words, idxs, double_words)
        words        -- caption words that appear in mscoco_objects
        node_words   -- inverse_synonym_dict[word] for every entry of words
        idxs         -- for every entry of words, its position in the
                        lemmatized token list (before double words are merged)
        double_words -- the full word list after double-word merging
                        (not filtered to MSCOCO words, 'seat' not removed)

    Relies on the notebook-level names nltk, lemmatizer, double_word_dict,
    mscoco_objects and inverse_synonym_dict.
    '''

    #standard preprocessing: lowercase, tokenize, lemmatize every token as a noun
    tokens = nltk.word_tokenize(caption.lower())
    tokens = [lemmatizer.lemmatize(token, pos='n') for token in tokens]

    #replace double words, remembering the token position where each merged word starts
    double_words = []
    start_idxs = []
    i = 0
    while i < len(tokens):
        start_idxs.append(i)
        double_word = ' '.join(tokens[i:i+2])
        if double_word in double_word_dict:
            double_words.append(double_word_dict[double_word])
            i += 2
        else:
            double_words.append(tokens[i])
            i += 1

    #keep each word paired with its position so the filters below cannot misalign them
    positioned_words = list(zip(start_idxs, double_words))

    #toilet seat is not chair (sentences like "the seat of the toilet" will fire for "chair" if we do not include this line)
    if 'toilet' in double_words and 'seat' in double_words:
        positioned_words = [(idx, word) for idx, word in positioned_words if word != 'seat']

    #keep only MSCOCO objects/synonyms; build the lookup set once instead of once per word
    coco_vocabulary = set(mscoco_objects)
    positioned_words = [(idx, word) for idx, word in positioned_words if word in coco_vocabulary]
    idxs = [idx for idx, _ in positioned_words]
    words = [word for _, word in positioned_words]

    #map every kept word to the canonical object of its synonym group
    node_words = [inverse_synonym_dict[word] for word in words]

    #return all the MSCOCO objects in the caption
    return words, node_words, idxs, double_words

In [6]:
def compute_chair():
    '''
    Compute CHAIR hallucination metrics for the notebook-level captions.

    Reads the notebook globals
        caps            -- list of caption strings
        eval_imids      -- list of image ids; caps[i] is scored against eval_imids[i]
        imid_to_objects -- mapping from str(image id) to ground-truth object names
    and calls caption_to_words on every scored caption. A caption whose image
    id is missing from imid_to_objects is reported with a message and skipped.

    Returns a dict with
        'sentences'       -- one dict per scored caption (image id, caption,
                             hallucinated words and their idxs, ground-truth
                             and generated objects, per-caption metrics)
        'overall_metrics' -- {'CHAIRs': hallucinated captions / scored captions,
                              'CHAIRi': hallucinated object mentions / object mentions,
                              'Recall': ground-truth objects mentioned / ground-truth objects}
    A ratio whose denominator is zero is reported as 0.
    '''
    num_caps = 0.
    num_hallucinated_caps = 0.
    hallucinated_word_count = 0.
    coco_word_count = 0.

    num_recall_gt_objects = 0.
    num_gt_objects = 0.

    output = {'sentences': []}

    for i in range(len(caps)):
        cap: str = caps[i]
        imid: int = eval_imids[i]

        # imid_to_objects is looked up by the string form of the image id
        imid_key = str(imid)
        if imid_key not in imid_to_objects:
            print(f"Image ID {imid} not found in imid_to_objects. Skipping caption.")
            continue
        gt_objects = imid_to_objects[imid_key]

        # only parse captions that are actually scored
        words, node_words, idxs, raw_words = caption_to_words(cap)

        cap_dict = {'image_id': imid,
                    'caption': cap,
                    'mscoco_hallucinated_words': [],
                    'mscoco_gt_words': list(gt_objects),
                    'mscoco_generated_words': list(node_words),
                    'hallucination_idxs': [],
                    'words': raw_words
                    }

        #count hallucinated words
        coco_word_count += len(node_words)
        hallucinated = False

        # distinct ground-truth objects that the caption mentions
        recall_gt_objects = set()
        for word, node_word, idx in zip(words, node_words, idxs):
            if node_word not in gt_objects:
                hallucinated_word_count += 1
                cap_dict['mscoco_hallucinated_words'].append((word, node_word))
                cap_dict['hallucination_idxs'].append(idx)
                hallucinated = True
            else:
                recall_gt_objects.add(node_word)

        #count hallucinated caps
        num_caps += 1
        if hallucinated:
            num_hallucinated_caps += 1

        num_gt_objects += len(gt_objects)
        num_recall_gt_objects += len(recall_gt_objects)

        # per-caption metrics; a ratio stays 0. when its denominator is zero
        caption_chair_i = 0.
        if len(words) > 0:
            caption_chair_i = len(cap_dict['mscoco_hallucinated_words'])/float(len(words))
        caption_recall = 0.
        if len(gt_objects) > 0:
            caption_recall = len(recall_gt_objects) / len(gt_objects)
        cap_dict['metrics'] = {'CHAIRs': int(hallucinated),
                               'CHAIRi': caption_chair_i,
                               'Recall': caption_recall}

        output['sentences'].append(cap_dict)

    # corpus-level ratios; guard against empty denominators (e.g. every caption
    # skipped, or no MSCOCO word found in any caption)
    chair_s = num_hallucinated_caps / num_caps if num_caps else 0.
    chair_i = hallucinated_word_count / coco_word_count if coco_word_count else 0.
    recall = num_recall_gt_objects / num_gt_objects if num_gt_objects else 0.

    output['overall_metrics'] = {'CHAIRs': chair_s,
                                 'CHAIRi': chair_i,
                                 'Recall': recall}

    return output

In [7]:
# Score the loaded captions; compute_chair reads caps, eval_imids and
# imid_to_objects from the notebook namespace.
output = compute_chair()

In [8]:
# Display the corpus-level CHAIRs, CHAIRi and Recall values.
output['overall_metrics']

{'CHAIRs': 0.1664, 'CHAIRi': 0.09850846886323418, 'Recall': 0.5406761588157812}