In [1]:
import base64
import json
import requests
import pandas as pd
import os
from dotenv import load_dotenv
from collections import defaultdict
import ast
load_dotenv()

# OpenAI API key, looked up in the process environment (None when the variable is not set)
openai_api_key = os.environ.get('OPENAI_API_KEY')

<h2>BLIP-2 Analysis</h2>

In [2]:
# Load the BLIP-2 answers and keep the rows whose mapped prediction differs from the
# ground truth (case-insensitive comparison), then export them for manual inspection
blip2_answers = pd.read_csv("blip2_answers.csv")
blip2_is_wrong = blip2_answers["correct_answer"].str.lower().ne(blip2_answers["mapped_predictions"].str.lower())
wrong_blip2_preds = blip2_answers[blip2_is_wrong]
wrong_blip2_preds.to_csv("wrong_blip2_preds.csv", index=False)

In [3]:
# Count BLIP-2's wrong predictions per lesson to see which categories it struggles with the most
wrong_blip2_preds["lesson_name"].value_counts()

introduction to plants                    537
flow of energy                            444
insects and other arthropods              310
evolution and classification of plants    116
echinoderms and invertebrate chordates     42
climate and its causes                     31
Name: lesson_name, dtype: int64

<h3>Examine Image Complexity</h3>

In [5]:
def return_val_counts(df, has_labels):
    """Count how many rows of ``df`` reference each image, for one label setting.

    Only rows whose ``image_has_labels_to_guess`` column equals ``has_labels`` are kept
    (the notebook passes "Yes" for images whose labels are undefined and "No" for images
    whose labels are provided). The result is the ``value_counts()`` of the ``image_path``
    column of those rows, which helps to see what kind of images the VLMs fail on.
    """
    matches_setting = df["image_has_labels_to_guess"] == has_labels
    return df.loc[matches_setting, "image_path"].value_counts()

In [6]:
# Images with undefined labels ("Yes") whose questions BLIP-2 answered incorrectly; show the top 60
val_counts = return_val_counts(df=wrong_blip2_preds, has_labels="Yes")
val_counts.head(60)

../Dataset/test/abc_question_images/parts_plant_11160.png            5
../Dataset/test/abc_question_images/life_cycles_16223.png            5
../Dataset/test/abc_question_images/parts_plant_13579.png            5
../Dataset/test/abc_question_images/types_leaves_14425.png           5
../Dataset/test/abc_question_images/parts_flower_13366.png           5
../Dataset/test/abc_question_images/parts_flower_13364.png           5
../Dataset/test/abc_question_images/life_cycles_10035.png            5
../Dataset/test/abc_question_images/life_cycles_16216.png            5
../Dataset/test/abc_question_images/parts_leaf_13106.png             5
../Dataset/test/abc_question_images/parts_leaf_16266.png             4
../Dataset/test/abc_question_images/parts_leaf_16262.png             4
../Dataset/test/abc_question_images/types_leaves_14711.png           4
../Dataset/test/abc_question_images/rain_shadow_17525.png            4
../Dataset/test/abc_question_images/parts_plant_11145.png            4
../Dat

In [7]:
# Get all images that BLIP-2 correctly guessed answers for and where the image labels are not defined.
# The match is case-insensitive, exactly like the wrong-prediction split (which lower-cases both
# columns); a case-sensitive match here would leave rows that differ only by letter case in
# neither the "wrong" nor the "correct" set.
correct_blip2_preds = blip2_answers[
    blip2_answers["correct_answer"].str.lower() == blip2_answers["mapped_predictions"].str.lower()
]
correct_blip2_preds.to_csv("correct_blip2_preds.csv", index=False)
val_counts = return_val_counts(correct_blip2_preds, "Yes")
val_counts[0:60]

../Dataset/test/abc_question_images/parts_leaf_10558.png             4
../Dataset/test/abc_question_images/rain_shadow_17535.png            4
../Dataset/test/abc_question_images/life_cycles_10650.png            3
../Dataset/test/abc_question_images/parts_flower_11137.png           3
../Dataset/test/abc_question_images/life_cycles_10580.png            3
../Dataset/test/abc_question_images/life_cycles_12290.png            3
../Dataset/test/abc_question_images/types_leaves_11021.png           2
../Dataset/test/abc_question_images/life_cycles_16211.png            2
../Dataset/test/abc_question_images/parts_plant_16274.png            2
../Dataset/test/abc_question_images/parts_seed_10003.png             2
../Dataset/test/abc_question_images/rain_shadow_17524.png            2
../Dataset/test/abc_question_images/life_cycles_12249.png            2
../Dataset/test/abc_question_images/types_leaves_11029.png           2
../Dataset/test/abc_question_images/parts_plant_13195.png            2
../Dat

In [8]:
# Images with provided labels ("No") whose questions BLIP-2 answered incorrectly (computed, not displayed)
val_counts = return_val_counts(df=wrong_blip2_preds, has_labels="No")

In [9]:
# Images with provided labels ("No") whose questions BLIP-2 answered correctly (computed, not displayed)
val_counts = return_val_counts(df=correct_blip2_preds, has_labels="No")

<h2> LLaVA Analysis </h2>

In [10]:
# Load the LLaVA-7B answers and split them into correct / wrong predictions
# (case-insensitive match against the ground truth); both subsets are exported to CSV
all_llava_preds = pd.read_csv("llava_7B_answers.csv")
llava_is_correct = all_llava_preds["correct_answer"].str.lower().eq(all_llava_preds["llava_generated_answers"].str.lower())

correct_llava_preds = all_llava_preds[llava_is_correct]
correct_llava_preds.to_csv("correct_llava_preds.csv", index=False)

wrong_llava_preds = all_llava_preds[~llava_is_correct]
wrong_llava_preds.to_csv("wrong_llava_preds.csv", index=False)

<h3>Examine Image Complexity</h3>

In [11]:
# Images with undefined labels ("Yes") whose questions LLaVA answered incorrectly; show the top 60
val_counts = return_val_counts(df=wrong_llava_preds, has_labels="Yes")
val_counts.head(60)

../Dataset/test/abc_question_images/types_leaves_14425.png           5
../Dataset/test/abc_question_images/parts_plant_13217.png            5
../Dataset/test/abc_question_images/parts_leaf_16264.png             5
../Dataset/test/abc_question_images/life_cycles_10580.png            5
../Dataset/test/abc_question_images/life_cycles_16210.png            5
../Dataset/test/abc_question_images/parts_plant_11160.png            5
../Dataset/test/abc_question_images/parts_plant_13195.png            5
../Dataset/test/abc_question_images/parts_plant_13579.png            5
../Dataset/test/abc_question_images/parts_flower_13366.png           5
../Dataset/test/abc_question_images/parts_flower_13425.png           4
../Dataset/test/abc_question_images/parts_seed_17249.png             4
../Dataset/test/abc_question_images/life_cycles_12400.png            4
../Dataset/test/abc_question_images/life_cycles_12290.png            4
../Dataset/test/abc_question_images/parts_flower_13417.png           4
../Dat

In [12]:
# Images with undefined labels ("Yes") whose questions LLaVA answered correctly; show the top 60
val_counts = return_val_counts(df=correct_llava_preds, has_labels="Yes")
val_counts.head(60)

../Dataset/test/abc_question_images/rain_shadow_17535.png            5
../Dataset/test/abc_question_images/rain_shadow_17534.png            3
../Dataset/test/abc_question_images/parts_plant_13164.png            3
../Dataset/test/abc_question_images/life_cycles_10035.png            3
../Dataset/test/abc_question_images/life_cycles_16226.png            3
../Dataset/test/abc_question_images/parts_leaf_13149.png             3
../Dataset/test/abc_question_images/parts_plant_11145.png            3
../Dataset/test/abc_question_images/parts_flower_11137.png           3
../Dataset/test/abc_question_images/parts_seed_10003.png             2
../Dataset/test/abc_question_images/parts_plant_13577.png            2
../Dataset/test/abc_question_images/life_cycles_12427.png            2
../Dataset/test/abc_question_images/life_cycles_12404.png            2
../Dataset/test/abc_question_images/types_leaves_10988.png           2
../Dataset/test/abc_question_images/life_cycles_12400.png            2
../Dat

In [13]:
# Images with provided labels ("No") whose questions LLaVA answered incorrectly; show the top 60
val_counts = return_val_counts(df=wrong_llava_preds, has_labels="No")
val_counts.head(60)

../Dataset/test/question_images/food_chains_webs_435.png        8
../Dataset/test/question_images/food_chains_webs_6061.png       8
../Dataset/test/question_images/food_chains_webs_805.png        8
../Dataset/test/question_images/food_chains_webs_823.png        7
../Dataset/test/question_images/life_cycles_580.png             7
../Dataset/test/question_images/food_chains_webs_821.png        7
../Dataset/test/question_images/food_chains_webs_293.png        7
../Dataset/test/question_images/food_chains_webs_799.png        7
../Dataset/test/question_images/parts_leaf_6265.png             6
../Dataset/test/question_images/food_chains_webs_250.png        6
../Dataset/test/question_images/food_chains_webs_311.png        6
../Dataset/test/question_images/food_chains_webs_2111.png       6
../Dataset/test/question_images/parts_flower_3798.png           6
../Dataset/test/question_images/food_chains_webs_6058.png       6
../Dataset/test/question_images/food_chains_webs_6057.png       6
../Dataset

In [14]:
# Images with provided labels ("No") whose questions LLaVA answered correctly; show the top 60
val_counts = return_val_counts(df=correct_llava_preds, has_labels="No")
val_counts.head(60)

../Dataset/test/question_images/food_chains_webs_6047.png    7
../Dataset/test/question_images/food_chains_webs_6054.png    7
../Dataset/test/question_images/food_chains_webs_897.png     7
../Dataset/test/question_images/food_chains_webs_475.png     7
../Dataset/test/question_images/food_chains_webs_6033.png    7
../Dataset/test/question_images/parts_plant_3579.png         6
../Dataset/test/question_images/food_chains_webs_2124.png    6
../Dataset/test/question_images/life_cycles_43.png           6
../Dataset/test/question_images/food_chains_webs_800.png     6
../Dataset/test/question_images/food_chains_webs_258.png     6
../Dataset/test/question_images/parts_flower_3426.png        6
../Dataset/test/question_images/food_chains_webs_808.png     6
../Dataset/test/question_images/life_cycles_2454.png         6
../Dataset/test/question_images/life_cycles_2267.png         6
../Dataset/test/question_images/food_chains_webs_867.png     6
../Dataset/test/question_images/food_chains_webs_881.pn

In [15]:
# Number of questions per value of image_has_labels_to_guess across all LLaVA predictions
all_llava_preds["image_has_labels_to_guess"].value_counts()

No     2831
Yes     454
Name: image_has_labels_to_guess, dtype: int64

<h2> GPT-4 Analysis </h2>

We perform manual image checks to understand the complexity of the image

In [16]:
# Load GPT-4's answers and split them into wrong / correct predictions
# (case-insensitive match against the ground truth)
gpt4_preds = pd.read_csv("../GPT4/GPT_4_preds.csv")
gpt4_is_correct = gpt4_preds["correct_answer"].str.lower().eq(gpt4_preds["gpt4_generated_answers"].str.lower())
wrong_gpt4_preds = gpt4_preds[~gpt4_is_correct]
correct_gpt4_preds = gpt4_preds[gpt4_is_correct]

# NOTE: these two exports are written with the DataFrame index as their first column
wrong_gpt4_preds.to_csv("wrong_gpt4_preds.csv")
correct_gpt4_preds.to_csv("correct_gpt4_preds.csv")

In [17]:
# Images with undefined labels ("Yes") whose questions GPT-4 answered incorrectly; show the top 60
val_counts = return_val_counts(df=wrong_gpt4_preds, has_labels="Yes")
val_counts.head(60)

../Dataset/test/abc_question_images/parts_leaf_16264.png             4
../Dataset/test/abc_question_images/parts_flower_11014.png           4
../Dataset/test/abc_question_images/parts_flower_11137.png           4
../Dataset/test/abc_question_images/parts_leaf_16266.png             4
../Dataset/test/abc_question_images/life_cycles_10580.png            4
../Dataset/test/abc_question_images/parts_flower_13366.png           4
../Dataset/test/abc_question_images/types_leaves_14759.png           3
../Dataset/test/abc_question_images/parts_flower_13424.png           3
../Dataset/test/abc_question_images/parts_flower_13425.png           3
../Dataset/test/abc_question_images/parts_plant_13217.png            3
../Dataset/test/abc_question_images/parts_flower_13417.png           3
../Dataset/test/abc_question_images/parts_plant_16272.png            3
../Dataset/test/abc_question_images/parts_leaf_13853.png             3
../Dataset/test/abc_question_images/types_leaves_10979.png           3
../Dat

In [18]:
# Images with undefined labels ("Yes") whose questions GPT-4 answered correctly; show the top 60
val_counts = return_val_counts(df=correct_gpt4_preds, has_labels="Yes")
val_counts.head(60)

../Dataset/test/abc_question_images/life_cycles_16216.png            5
../Dataset/test/abc_question_images/life_cycles_16223.png            5
../Dataset/test/abc_question_images/types_leaves_14425.png           5
../Dataset/test/abc_question_images/life_cycles_12290.png            5
../Dataset/test/abc_question_images/rain_shadow_17535.png            5
../Dataset/test/abc_question_images/life_cycles_10595.png            4
../Dataset/test/abc_question_images/parts_leaf_10558.png             4
../Dataset/test/abc_question_images/parts_seed_17249.png             4
../Dataset/test/abc_question_images/life_cycles_10650.png            4
../Dataset/test/abc_question_images/parts_plant_11160.png            4
../Dataset/test/abc_question_images/parts_plant_13195.png            4
../Dataset/test/abc_question_images/types_leaves_14728.png           4
../Dataset/test/abc_question_images/life_cycles_10035.png            4
../Dataset/test/abc_question_images/life_cycles_16210.png            4
../Dat

In [19]:
# Breakdown by image_has_labels_to_guess of the questions GPT-4 answered incorrectly
wrong_gpt4_preds["image_has_labels_to_guess"].value_counts()

No     555
Yes    210
Name: image_has_labels_to_guess, dtype: int64

In [20]:
# Breakdown by image_has_labels_to_guess of the questions GPT-4 answered correctly
correct_gpt4_preds["image_has_labels_to_guess"].value_counts()

No     2276
Yes     244
Name: image_has_labels_to_guess, dtype: int64

In [21]:
# Breakdown by image_has_labels_to_guess over all GPT-4 predictions (baseline for the two splits)
gpt4_preds["image_has_labels_to_guess"].value_counts()

No     2831
Yes     454
Name: image_has_labels_to_guess, dtype: int64

In [22]:
# Images with provided labels ("No") whose questions GPT-4 answered incorrectly; show the top 60
val_counts = return_val_counts(df=wrong_gpt4_preds, has_labels="No")
val_counts.head(60)

../Dataset/test/question_images/food_chains_webs_293.png     6
../Dataset/test/question_images/parts_flower_3798.png        5
../Dataset/test/question_images/food_chains_webs_6044.png    5
../Dataset/test/question_images/food_chains_webs_2104.png    5
../Dataset/test/question_images/food_chains_webs_1929.png    5
../Dataset/test/question_images/types_leaves_4743.png        4
../Dataset/test/question_images/food_chains_webs_273.png     4
../Dataset/test/question_images/food_chains_webs_287.png     4
../Dataset/test/question_images/food_chains_webs_6059.png    4
../Dataset/test/question_images/food_chains_webs_6051.png    4
../Dataset/test/question_images/types_leaves_4393.png        4
../Dataset/test/question_images/parts_leaf_6270.png          4
../Dataset/test/question_images/food_chains_webs_802.png     4
../Dataset/test/question_images/food_chains_webs_804.png     4
../Dataset/test/question_images/food_chains_webs_444.png     4
../Dataset/test/question_images/food_chains_webs_281.pn

In [23]:
# Images with provided labels ("No") whose questions GPT-4 answered correctly; show the top 60
val_counts = return_val_counts(df=correct_gpt4_preds, has_labels="No")
val_counts.head(60)

../Dataset/test/question_images/food_chains_webs_471.png     8
../Dataset/test/question_images/parts_plant_3579.png         8
../Dataset/test/question_images/food_chains_webs_426.png     8
../Dataset/test/question_images/food_chains_webs_6054.png    8
../Dataset/test/question_images/parts_plant_3164.png         8
../Dataset/test/question_images/food_chains_webs_303.png     8
../Dataset/test/question_images/food_chains_webs_2124.png    8
../Dataset/test/question_images/parts_flower_3426.png        8
../Dataset/test/question_images/life_cycles_342.png          8
../Dataset/test/question_images/parts_plant_6276.png         8
../Dataset/test/question_images/food_chains_webs_6058.png    8
../Dataset/test/question_images/parts_leaf_3860.png          7
../Dataset/test/question_images/parts_flower_3362.png        7
../Dataset/test/question_images/food_chains_webs_801.png     7
../Dataset/test/question_images/parts_leaf_3853.png          7
../Dataset/test/question_images/food_chains_webs_6061.p

<h2> Quick Assessment of True/False questions </h2>

We manually check the created CSV files below to understand if a failure pattern exists or not.

In [5]:
# Load the non-diagram multiple-choice and true/false question sets from the test split.
# The next cell reads the "correct_answer" and "gpt4_generated_answers" columns of both frames.
non_diag_mcq_df = pd.read_csv("../Dataset/test/NonDiagram_MCQ_QuestionsData.csv")
true_false_mcq_df = pd.read_csv("../Dataset/test/NonDiagram_True_False_QuestionsData.csv")

In [7]:
# Multiple-choice questions GPT-4 got wrong (case-insensitive comparison)
mcq_is_wrong = non_diag_mcq_df["correct_answer"].str.lower().ne(non_diag_mcq_df["gpt4_generated_answers"].str.lower())
wrong_gpt4_mcq_preds = non_diag_mcq_df[mcq_is_wrong]
wrong_gpt4_mcq_preds.to_csv("wrong_gpt4_mcq_preds.csv")

# True/false questions GPT-4 got wrong
# NOTE(review): this comparison is exact (no lower-casing), unlike the MCQ one above - confirm intended
tf_is_wrong = true_false_mcq_df["correct_answer"].ne(true_false_mcq_df["gpt4_generated_answers"])
wrong_gpt4_tf_preds = true_false_mcq_df[tf_is_wrong]
wrong_gpt4_tf_preds.to_csv("wrong_gpt4_tf_preds.csv")

<h2> Find common incorrect predictions between BLIP-2 and LLaVA </h2>

In [184]:
# Questions that both BLIP-2 and LLaVA got wrong (default merge: inner join on the shared columns)
common_blip_llava_preds = wrong_blip2_preds.merge(wrong_llava_preds)

# Re-load GPT-4's answers and keep the ones that match the ground truth (case-insensitive)
gpt4_preds = pd.read_csv("../GPT4/GPT_4_preds.csv")
gpt4_answer_matches = gpt4_preds["correct_answer"].str.lower().eq(gpt4_preds["gpt4_generated_answers"].str.lower())
all_gpt4_correct_preds = gpt4_preds[gpt4_answer_matches]

In [186]:
# Each image has multiple questions associated with it. We only want GPT-4 to describe images it
# understood well, so we keep the images for which GPT-4 answered at least MIN_GPT4_SUCCESS_RATE
# (80%, inclusive) of that image's questions correctly.
MIN_GPT4_SUCCESS_RATE = 0.8

# Correctly answered questions per image, and total questions per image
preds_val_counts = all_gpt4_correct_preds["image_path"].value_counts()
all_val_counts = gpt4_preds["image_path"].value_counts()

# Per-image success rate; the totals are looked up in the order of preds_val_counts so the
# division is element-for-element (a missing image raises KeyError, as a per-item lookup would)
success_rate = preds_val_counts / all_val_counts.loc[preds_val_counts.index]
image_paths = success_rate[success_rate >= MIN_GPT4_SUCCESS_RATE].index.tolist()

# Note: filtered_df holds only the *correctly answered* questions of the selected images
filtered_df = all_gpt4_correct_preds[all_gpt4_correct_preds["image_path"].isin(image_paths)]

In [187]:
# Restrict the shared BLIP-2 / LLaVA failures to the questions kept by the GPT-4 filter
all_common_preds = common_blip_llava_preds.merge(filtered_df)
all_common_preds.head()

Unnamed: 0.1,Unnamed: 0,lesson_name,question_name,answer_choice_1,answer_choice_2,answer_choice_3,answer_choice_4,correct_answer,image_path,image_has_labels_to_guess,caption,blip_2_generated_answers,mapped_predictions,llava_generated_answers,gpt4_generated_answers
0,19,climate and its causes,Moist air comes from what area in the diagram?,Atacama desert,The Andes slopes,Clouds,Forest,Forest,../Dataset/test/question_images/rain_shadow_75...,No,a diagram of the water cycle,c,Clouds,Atacama desert,Forest
1,20,climate and its causes,How many arrows are in the diagram?,3,4,2,6,6,../Dataset/test/question_images/rain_shadow_75...,No,a diagram of the water cycle,a,3,3,6
2,32,climate and its causes,"In the diagram, how many slopes does the mount...",3,4,2,1,2,../Dataset/test/question_images/rain_shadow_75...,No,a diagram of a mountain with a few different t...,a,3,3,2
3,47,climate and its causes,Where does the rain shadow occur?,Wet Leeward Side,Dry Leeward Side,Dry Windward Slope,Rainy Windward SLope,Dry Leeward Side,../Dataset/test/question_images/rain_shadow_75...,No,a diagram of a mountain with a few different t...,c,Dry Windward Slope,Dry Windward Slope,Dry Leeward Side
4,52,climate and its causes,What happens after dry air is warmed?,windward slide,leeward slide,zone of precipitation,rain shadow desert,rain shadow desert,../Dataset/test/question_images/rain_shadow_81...,No,a diagram of a mountain with a few different t...,c,zone of precipitation,Leeward slide,rain shadow desert


In [188]:
# (rows, columns) of the merged frame built in the previous cell
all_common_preds.shape

(403, 15)

Query GPT-4 to create image descriptions of all images in the above dataframe. The goal is to integrate these GPT-4 generated image descriptions into our BLIP-2 and LLaVA prompts, to judge whether the added context helps them correctly answer questions they previously struggled with.

In [2]:
def encode_image(image_path):
    """Read the file at ``image_path`` in binary mode and return its base64 encoding as a str."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# Prompt GPT for VQA task
def prompt_gpt(text, base64_image, model="gpt-4-vision-preview", max_tokens=300,
               mime_type="image/jpeg", timeout=120):
    """Send one text + image prompt to the OpenAI chat-completions endpoint.

    Parameters
    ----------
    text : str
        The textual part of the prompt.
    base64_image : str
        Base64-encoded image bytes (e.g. the output of ``encode_image``).
    model : str, optional
        Model name placed in the request payload.
    max_tokens : int, optional
        Upper bound on the length of the generated answer.
    mime_type : str, optional
        MIME type declared in the image data URL. The default keeps the historical
        "image/jpeg" value.
        NOTE(review): the images used in this notebook are .png files - consider
        passing ``mime_type="image/png"``.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.post`` so a stalled connection
        cannot block the notebook forever.

    Returns
    -------
    dict
        The decoded JSON body of the response. Callers in this notebook check it for
        an "error" key before reading ``["choices"][0]["message"]["content"]``.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }

    payload = {
        "model": model,
        "messages": [
          {
            "role": "user",
            "content": [
              {
                "type": "text",
                "text": text
              },
              {
                "type": "image_url",
                "image_url": {
                  # The image travels inline as a data URL
                  "url": f"data:{mime_type};base64,{base64_image}"
                }
              }
            ]
          }
        ],
        "max_tokens": max_tokens
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    return response.json()

In [190]:
# Prepare the frame that will receive one GPT-4 image description per row and persist it.
# Written so that the cell can be re-run: the description column is only created when it is
# missing (descriptions already held in memory are kept), and columns that were already
# dropped are ignored instead of raising KeyError.
if "image_description" not in all_common_preds.columns:
    all_common_preds["image_description"] = None

all_common_preds = all_common_preds.drop(
    columns=["Unnamed: 0", "blip_2_generated_answers", "mapped_predictions", "llava_generated_answers", "gpt4_generated_answers"],
    errors="ignore",
)

# NOTE: this overwrites the CSV that the description loop uses as its checkpoint
all_common_preds.to_csv("common_blip2_llava_gpt4_preds.csv", index=False)

In [5]:
# Read the checkpoint CSV and rebuild the image -> description cache from the rows that were
# already described in an earlier run, so that the same image is not sent to GPT-4 again
# after a kernel restart.
all_common_preds = pd.read_csv("common_blip2_llava_gpt4_preds.csv")
image_description_map = {}
if "image_description" in all_common_preds.columns:
    already_described = all_common_preds.dropna(subset=["image_description"])
    image_description_map = dict(
        zip(already_described["image_path"], already_described["image_description"])
    )

In [208]:
# Get image descriptions from GPT-4
DESCRIPTION_PROMPT = "Give a brief 3-4 line description of this image."

# A column that is still entirely empty is read back from CSV as float NaN; make it
# object-typed so that text descriptions can be stored in it.
all_common_preds["image_description"] = all_common_preds["image_description"].astype(object)

for idx, image_path in all_common_preds["image_path"].items():
    # Rows that already carry a description (from an earlier run) are left alone
    if not pd.isna(all_common_preds.loc[idx, "image_description"]):
        continue

    # Only query the GPT-4 API for images that have not been described yet
    if image_path not in image_description_map:
        gpt_json_result = prompt_gpt(DESCRIPTION_PROMPT, encode_image(image_path))

        if "error" in gpt_json_result:
            print(image_path)
            print(gpt_json_result)
            # Rate limits can be reached, hence this cell may have to be re-run manually several
            # times to cover the whole dataset. Any other API error ends the loop the same way;
            # the payload printed above shows the actual cause.
            print("Rate limit exceed. Exiting loop...")
            break
        image_description_map[image_path] = gpt_json_result["choices"][0]["message"]["content"]

    all_common_preds.loc[idx, "image_description"] = image_description_map[image_path]

    # Keep writing intermediate results to the CSV file. index=False prevents an extra
    # "Unnamed: N" column from being added every time the checkpoint is read back.
    all_common_preds.to_csv("common_blip2_llava_gpt4_preds.csv", index=False)

In [212]:
# Persist the final frame without its index column.
# The commented-out drop below is kept for reference; it presumably targets the extra
# "Unnamed: N" columns that appear when a CSV written with its index is read back - TODO confirm.
#all_common_preds.drop(columns=["Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0.2", "Unnamed: 0.3", "Unnamed: 0.4"], inplace=True)
all_common_preds.to_csv("common_blip2_llava_gpt4_preds.csv", index=False)

<h2> Analyze predictions with image descriptions </h2>

In [3]:
def map_predicted_to_actual(row):
    """Translate BLIP-2's option letter into the text of the chosen answer option.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``)
        Must provide ``blip_2_generated_answers``; the matching ``answer_choice_N``
        entry is read only when the letter is recognised.

    Returns
    -------
    The value of ``answer_choice_1`` .. ``answer_choice_4`` for the letters a .. d, or
    ``None`` when the generated answer is not a recognisable option letter (this includes
    missing / non-string values).

    Every spelling accepted before ('a', 'a.', 'b', 'b.', 'c', 'c.', 'd', 'd.', '(d)')
    still maps to the same option. In addition the letter is matched case-insensitively,
    surrounding whitespace is ignored, and the '(x)' / 'x.' decorations are accepted for
    every letter rather than only for some of them.
    """
    choice_column_by_letter = {
        'a': 'answer_choice_1',
        'b': 'answer_choice_2',
        'c': 'answer_choice_3',
        'd': 'answer_choice_4',
    }

    generated = row['blip_2_generated_answers']
    if not isinstance(generated, str):
        # NaN / None / numbers cannot be an option letter
        return None

    # Strip whitespace, then surrounding parentheses / dots, then normalise the case
    letter = generated.strip().strip('().').strip().lower()
    choice_column = choice_column_by_letter.get(letter)
    if choice_column is None:
        return None
    return row[choice_column]

In [32]:
# Load the predictions made with image descriptions and add the answer text that
# corresponds to BLIP-2's option letter
common_preds = pd.read_csv("common_blip2_llava_gpt4_preds_img_desc.csv")
common_preds = common_preds.assign(
    mapped_blip2_predictions=common_preds.apply(map_predicted_to_actual, axis=1)
)

In [33]:
# Focusing on LLaVA predictions first: split into wrong / correct (case-insensitive match)
llava_hit = common_preds["correct_answer"].str.lower().eq(common_preds["llava_generated_answers"].str.lower())

common_llava_wrong_preds = common_preds[~llava_hit]
common_llava_wrong_preds.to_csv("common_llava_wrong_preds.csv", index=False)

common_llava_correct_preds = common_preds[llava_hit]
common_llava_correct_preds.to_csv("common_llava_correct_preds.csv", index=False)

In [36]:
# Images with undefined labels ("Yes") whose questions LLaVA still answered incorrectly
val_counts = return_val_counts(df=common_llava_wrong_preds, has_labels="Yes")
val_counts

../Dataset/test/abc_question_images/parts_plant_13579.png     4
../Dataset/test/abc_question_images/types_leaves_14425.png    4
../Dataset/test/abc_question_images/life_cycles_16210.png     4
../Dataset/test/abc_question_images/life_cycles_16216.png     4
../Dataset/test/abc_question_images/types_leaves_14728.png    3
../Dataset/test/abc_question_images/parts_seed_17249.png      3
../Dataset/test/abc_question_images/life_cycles_16223.png     3
../Dataset/test/abc_question_images/parts_leaf_11090.png      2
../Dataset/test/abc_question_images/types_leaves_16313.png    2
../Dataset/test/abc_question_images/life_cycles_10831.png     2
../Dataset/test/abc_question_images/parts_leaf_10558.png      1
../Dataset/test/abc_question_images/parts_plant_11155.png     1
../Dataset/test/abc_question_images/parts_plant_16276.png     1
../Dataset/test/abc_question_images/types_leaves_11017.png    1
../Dataset/test/abc_question_images/life_cycles_10035.png     1
../Dataset/test/abc_question_images/life

In [37]:
# Images with undefined labels ("Yes") whose questions LLaVA answered correctly
val_counts = return_val_counts(df=common_llava_correct_preds, has_labels="Yes")
val_counts

../Dataset/test/abc_question_images/life_cycles_10595.png     2
../Dataset/test/abc_question_images/life_cycles_16226.png     2
../Dataset/test/abc_question_images/parts_plant_13221.png     1
../Dataset/test/abc_question_images/types_leaves_14425.png    1
../Dataset/test/abc_question_images/types_leaves_14498.png    1
../Dataset/test/abc_question_images/parts_seed_13817.png      1
../Dataset/test/abc_question_images/life_cycles_10035.png     1
../Dataset/test/abc_question_images/life_cycles_10044.png     1
../Dataset/test/abc_question_images/life_cycles_10650.png     1
../Dataset/test/abc_question_images/life_cycles_12290.png     1
../Dataset/test/abc_question_images/life_cycles_12300.png     1
../Dataset/test/abc_question_images/life_cycles_12331.png     1
../Dataset/test/abc_question_images/life_cycles_16223.png     1
../Dataset/test/abc_question_images/life_cycles_16225.png     1
Name: image_path, dtype: int64

In [43]:
# Images with provided labels ("No") whose questions LLaVA answered incorrectly; show the top 60
val_counts = return_val_counts(df=common_llava_wrong_preds, has_labels="No")
val_counts.head(60)

../Dataset/test/question_images/life_cycles_592.png             3
../Dataset/test/question_images/parts_plant_3791.png            3
../Dataset/test/question_images/food_chains_webs_705.png        3
../Dataset/test/question_images/parts_flower_3371.png           3
../Dataset/test/question_images/food_chains_webs_810.png        3
../Dataset/test/question_images/types_leaves_4495.png           3
../Dataset/test/question_images/parts_plant_3788.png            3
../Dataset/test/question_images/food_chains_webs_2105.png       3
../Dataset/test/question_images/life_cycles_2443.png            3
../Dataset/test/question_images/life_cycles_619.png             3
../Dataset/test/question_images/types_leaves_4387.png           2
../Dataset/test/question_images/food_chains_webs_24.png         2
../Dataset/test/question_images/parts_flower_1137.png           2
../Dataset/test/question_images/types_leaves_1093.png           2
../Dataset/test/question_images/food_chains_webs_250.png        2
../Dataset

In [44]:
# Images with provided labels ("No") whose questions LLaVA answered correctly; show the top 60
val_counts = return_val_counts(df=common_llava_correct_preds, has_labels="No")
val_counts.head(60)

../Dataset/test/question_images/life_cycles_353.png             2
../Dataset/test/question_images/food_chains_webs_6057.png       2
../Dataset/test/question_images/types_leaves_4408.png           2
../Dataset/test/question_images/parts_seed_3817.png             2
../Dataset/test/question_images/life_cycles_6226.png            2
../Dataset/test/question_images/life_cycles_889.png             2
../Dataset/test/question_images/life_cycles_595.png             2
../Dataset/test/question_images/life_cycles_6207.png            1
../Dataset/test/question_images/life_cycles_885.png             1
../Dataset/test/question_images/life_cycles_884.png             1
../Dataset/test/question_images/life_cycles_575.png             1
../Dataset/test/question_images/life_cycles_849.png             1
../Dataset/test/question_images/life_cycles_840.png             1
../Dataset/test/question_images/life_cycles_792.png             1
../Dataset/test/question_images/life_cycles_608.png             1
../Dataset

In [45]:
# Focusing on BLIP-2 predictions: split into wrong / correct (case-insensitive match on the
# answer text mapped from BLIP-2's option letter)
blip2_hit = common_preds["correct_answer"].str.lower().eq(common_preds["mapped_blip2_predictions"].str.lower())

common_blip2_wrong_preds = common_preds[~blip2_hit]
common_blip2_wrong_preds.to_csv("common_blip2_wrong_preds.csv", index=False)

common_blip2_correct_preds = common_preds[blip2_hit]
common_blip2_correct_preds.to_csv("common_blip2_correct_preds.csv", index=False)

In [46]:
# Images with undefined labels ("Yes") whose questions BLIP-2 answered incorrectly
val_counts = return_val_counts(df=common_blip2_wrong_preds, has_labels="Yes")
val_counts

../Dataset/test/abc_question_images/types_leaves_14425.png    5
../Dataset/test/abc_question_images/life_cycles_16223.png     4
../Dataset/test/abc_question_images/life_cycles_16216.png     4
../Dataset/test/abc_question_images/life_cycles_16210.png     4
../Dataset/test/abc_question_images/parts_plant_13579.png     4
../Dataset/test/abc_question_images/parts_seed_17249.png      3
../Dataset/test/abc_question_images/types_leaves_14728.png    3
../Dataset/test/abc_question_images/parts_leaf_11090.png      2
../Dataset/test/abc_question_images/types_leaves_16313.png    2
../Dataset/test/abc_question_images/life_cycles_10831.png     2
../Dataset/test/abc_question_images/types_leaves_11017.png    1
../Dataset/test/abc_question_images/types_leaves_14498.png    1
../Dataset/test/abc_question_images/parts_seed_13817.png      1
../Dataset/test/abc_question_images/parts_plant_16276.png     1
../Dataset/test/abc_question_images/life_cycles_11446.png     1
../Dataset/test/abc_question_images/part

In [47]:
# Images with undefined labels ("Yes") whose questions BLIP-2 answered correctly
val_counts = return_val_counts(df=common_blip2_correct_preds, has_labels="Yes")
val_counts

../Dataset/test/abc_question_images/life_cycles_10035.png    2
../Dataset/test/abc_question_images/life_cycles_10595.png    2
../Dataset/test/abc_question_images/life_cycles_16226.png    2
../Dataset/test/abc_question_images/life_cycles_10044.png    1
../Dataset/test/abc_question_images/life_cycles_10650.png    1
../Dataset/test/abc_question_images/life_cycles_12290.png    1
../Dataset/test/abc_question_images/life_cycles_12300.png    1
../Dataset/test/abc_question_images/life_cycles_12331.png    1
../Dataset/test/abc_question_images/life_cycles_16225.png    1
Name: image_path, dtype: int64

In [55]:
# Images with provided labels ("No") whose questions BLIP-2 answered incorrectly; show the top 60
val_counts = return_val_counts(df=common_blip2_wrong_preds, has_labels="No")
val_counts.head(60)

../Dataset/test/question_images/life_cycles_884.png             3
../Dataset/test/question_images/types_leaves_4495.png           3
../Dataset/test/question_images/rain_shadow_7525.png            2
../Dataset/test/question_images/food_chains_webs_2069.png       2
../Dataset/test/question_images/food_chains_webs_6032.png       2
../Dataset/test/question_images/food_chains_webs_435.png        2
../Dataset/test/question_images/food_chains_webs_324.png        2
../Dataset/test/question_images/food_chains_webs_303.png        2
../Dataset/test/question_images/types_leaves_1028.png           2
../Dataset/test/question_images/food_chains_webs_24.png         2
../Dataset/test/question_images/food_chains_webs_2099.png       2
../Dataset/test/question_images/types_leaves_6312.png           2
../Dataset/test/question_images/parts_plant_3579.png            2
../Dataset/test/question_images/food_chains_webs_1965.png       2
../Dataset/test/question_images/parts_flower_1137.png           2
../Dataset

In [56]:
# Images with provided labels ("No") whose questions BLIP-2 answered correctly; show the top 60
val_counts = return_val_counts(df=common_blip2_correct_preds, has_labels="No")
val_counts.head(60)

../Dataset/test/question_images/life_cycles_2175.png            3
../Dataset/test/question_images/food_chains_webs_2105.png       3
../Dataset/test/question_images/life_cycles_2443.png            3
../Dataset/test/question_images/parts_flower_3371.png           3
../Dataset/test/question_images/life_cycles_2459.png            2
../Dataset/test/question_images/types_leaves_4387.png           2
../Dataset/test/question_images/types_leaves_4408.png           2
../Dataset/test/question_images/life_cycles_595.png             2
../Dataset/test/question_images/food_chains_webs_1775.png       2
../Dataset/test/question_images/parts_chordate_body_8145.png    2
../Dataset/test/question_images/life_cycles_889.png             2
../Dataset/test/question_images/parts_flower_1144.png           2
../Dataset/test/question_images/parts_seed_3817.png             2
../Dataset/test/question_images/life_cycles_6227.png            2
../Dataset/test/question_images/life_cycles_6226.png            2
../Dataset