# Model Responses

In [None]:
from pycocoevalcap.eval import Bleu, Cider, COCOEvalCap, Meteor, Rouge, Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocotools.coco import COCO
from loguru import logger as eval_logger

def dota_captioning_sample_score(result, metric):
    """
    Calculate the score for a single caption sample.

    Args:
        result (dict): Must have keys:
            - "pred": predicted caption (str)
            - "answer": reference caption(s), either a single str or a
              list of str
        metric (str): One of ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "ROUGE_L"].
            CIDEr is not available per sample; it is computed over a whole
            result list by ``dota_captioning_aggregation_result``.

    Returns:
        float: score for this sample

    Raises:
        ValueError: if ``metric`` is not one of the supported names.
    """
    bleu_orders = {"Bleu_1": 1, "Bleu_2": 2, "Bleu_3": 3, "Bleu_4": 4}
    bleu_order = bleu_orders.get(metric)

    # Validate the metric first and build only the scorer that is actually
    # used, instead of instantiating every scorer on every call.
    if bleu_order is not None:
        scorer = Bleu(bleu_order)
    elif metric == "ROUGE_L":
        scorer = Rouge()
    else:
        raise ValueError(f"Unknown metric: {metric}")

    # A bare string is treated as a single reference caption.
    answer = result["answer"]
    answers = [answer] if isinstance(answer, str) else answer

    # Both sides are keyed by a dummy image id (0) and hold lists of
    # {"caption": ...} dicts, the shape passed to PTBTokenizer.tokenize.
    gts = {0: [{"caption": ans} for ans in answers]}
    res = {0: [{"caption": result["pred"]}]}

    # Tokenize like COCOEvalCap does
    tokenizer = PTBTokenizer()
    gts_tok = tokenizer.tokenize(gts)
    res_tok = tokenizer.tokenize(res)

    score, _ = scorer.compute_score(gts_tok, res_tok)

    # BLEU returns a list of scores (orders 1..n), pick the requested n
    if isinstance(score, list):
        score = score[bleu_order - 1]

    return score


In [23]:
# Toy sample for the per-sample scorer: the reference ("answer") is given as a
# bare string rather than a list.
result = dict(
    answer="Dog sitting on floor.",
    pred="Dog is sitting on the floor.",
)

In [28]:
# Print the ROUGE_L score of the toy `result` sample.
print(dota_captioning_sample_score(result, "ROUGE_L"))

0.8299319727891156


In [29]:
{"doc_id": 0, "target": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "filtered_resps": ["This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. 
The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic."], "doc_hash": "74234e98afe7498fb5daf1f36ac2d78acc339464f950703b8c019892f982b90b", "dota_Bleu_4": {"anno_id": "10", "pred": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "answer": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. 
There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "score": 0.9999999999852116}, "dota_Bleu_3": {"anno_id": "10", "pred": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "answer": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. 
There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "score": 0.9999999999852394}, "dota_Bleu_2": {"anno_id": "10", "pred": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "answer": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. 
There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "score": 0.9999999999852669}, "dota_Bleu_1": {"anno_id": "10", "pred": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "answer": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. 
There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "score": 0.9999999999852941}, "dota_ROUGE_L": {"anno_id": "10", "pred": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "answer": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. 
There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "score": 1.0}, "input": "Describe this image."}


{'doc_id': 0,
 'target': 'This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.',
 'filtered_resps': 

In [30]:
{"doc_id": 1, "target": "The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.", "filtered_resps": ["The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body."], "doc_hash": "74234e98afe7498fb5daf1f36ac2d78acc339464f950703b8c019892f982b90b", "dota_Bleu_4": {"anno_id": "11", "pred": "The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.", "answer": "The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. 
There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.", "score": 0.9999999999756793}, "dota_Bleu_3": {"anno_id": "11", "pred": "The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.", "answer": "The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.", "score": 0.9999999999757553}, "dota_Bleu_2": {"anno_id": "11", "pred": "The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. 
The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.", "answer": "The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.", "score": 0.9999999999758299}, "dota_Bleu_1": {"anno_id": "11", "pred": "The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.", "answer": "The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.", "score": 0.9999999999759034}, "dota_ROUGE_L": {"anno_id": "11", "pred": "The image is an aerial view of a lake surrounded by residential properties. 
The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.", "answer": "The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.", "score": 1.0}, "input": "Describe this image."}


{'doc_id': 1,
 'target': 'The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gardens. The overall scene suggests a peaceful, suburban neighborhood located near a natural water body.',
 'filtered_resps': ['The image is an aerial view of a lake surrounded by residential properties. The lake has a dark, reflective surface, and the surrounding area is densely forested with green trees. There are several docks and piers extending into the water from the shore, indicating that the lake is likely used for recreational purposes such as boating or fishing. The houses and properties are well-maintained, with manicured lawns and gard

In [32]:
def dota_captioning_aggregation_result(results):
    """
    Aggregate the results of the dota evaluation task into one CIDEr score.

    Args:
    - results (list of dict): List of result dictionaries. Each one must have
      a "pred" key (predicted caption, str) and an "answer" key (reference
      caption(s): a single str or a list of str).

    Returns:
    - The first element of the ``(score, scores)`` pair returned by
      ``Cider().compute_score`` for the whole result list.

    Raises:
    - ValueError: if ``results`` is empty.
    """
    metric = "CIDEr"

    # Fail early with a clear message instead of an obscure error from the
    # COCO result loader further down.
    if not results:
        raise ValueError("results must contain at least one sample")

    stored_results = []
    # In order to make the coco eval tools to successfully create index
    # We need at least two dict in the dataset
    # 'annotation' and 'images'
    # 'annotation' exactly reproduce the original annotation
    # 'images' however only need the image id which is contained in the file name
    dataset = {
        "info": {},
        "annotations": [],
        "images": []
    }
    ann_id = 0
    for idx, result in enumerate(results):
        stored_results.append({"image_id": idx, "caption": result["pred"]})

        # A bare string is one reference caption; iterating it directly would
        # register every single character as a separate reference.
        answers = result["answer"]
        if isinstance(answers, str):
            answers = [answers]

        for caption in answers:
            dataset["annotations"].append({"image_id": idx, "caption": caption, "id": ann_id})
            ann_id += 1

        dataset["images"].append({"id": idx})

    coco = COCO()
    # Manually create index here
    coco.dataset = dataset
    coco.createIndex()

    coco_result = coco.loadRes(stored_results)
    coco_eval = COCOEvalCap(coco, coco_result)

    # Collect references and predictions per image id
    gts = {}
    res = {}
    for img_id in coco_eval.params["image_id"]:
        gts[img_id] = coco_eval.coco.imgToAnns[img_id]
        res[img_id] = coco_eval.cocoRes.imgToAnns[img_id]

    eval_logger.info("tokenization...")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    eval_logger.info(f"Computing {metric} scores...")

    # The per-image scores (second element) are not used here.
    score, _ = Cider().compute_score(gts, res)

    return score



In [34]:
# Toy result list for the aggregation function: two samples, each with one
# prediction and two reference captions.
results = [
    dict(
        pred="A dog is playing with a ball.",
        answer=["A dog plays with a ball.", "A dog is playing with a toy."],
    ),
    dict(
        pred="A cat is sitting on a windowsill.",
        answer=["A cat sits on a windowsill.", "A cat is on the windowsill."],
    ),
]

In [35]:
# Run the aggregation on the toy `results` list; as the last expression of the
# cell, its return value is what the notebook displays.
dota_captioning_aggregation_result(results)

[32m2025-08-13 16:38:32.905[0m | [1mINFO    [0m | [36m__main__[0m:[36mdota_captioning_aggregation_result[0m:[36m55[0m - [1mtokenization...[0m


creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!


[32m2025-08-13 16:38:34.528[0m | [1mINFO    [0m | [36m__main__[0m:[36mdota_captioning_aggregation_result[0m:[36m60[0m - [1mComputing CIDEr scores...[0m


4.658946801950781

In [36]:
# Cross-check: feed the same toy captions straight into the Cider scorer,
# without building a COCO index or running the PTB tokenizer first.
from pycocoevalcap.cider.cider import Cider

# image id -> list of reference captions (raw, untokenized strings)
references = {
            'img1': ["A dog plays with a ball.", "A dog is playing with a toy."],
            'img2':  ["A cat sits on a windowsill.", "A cat is on the windowsill."]
}

# image id -> single-element list holding the predicted caption
candidates = {
    'img1': ['A dog is playing with a ball.'], 
    'img2': ['A cat is sitting on a windowsill.']
}


cider_scorer = Cider()

# compute_score returns a pair; `score` is printed below as the average and
# `scores` holds the per-image values (see the commented-out print).
score, scores = cider_scorer.compute_score(references, candidates)

print(f"Average CIDEr score: {score:.4f}")
# print("CIDEr scores per image:")
# print(scores)

Average CIDEr score: 4.6589


In [80]:
def dota_captioning_aggregation_result(results, metric):
    """
    Aggregate the results of the dota evaluation task using the specified metric.

    Args:
    - results (list of dict): List of result dictionaries. Each one must have
      a "pred" key (predicted caption, str) and an "answer" key (reference
      caption(s): a single str or a list of str).
    - metric (str): Metric to use for aggregation. One of "Bleu_1", "Bleu_2",
      "Bleu_3", "Bleu_4", "METEOR", "ROUGE_L", "CIDEr".

    Returns:
    - The aggregated score for the specified metric: the first element of the
      scorer's ``(score, scores)`` pair. For the Bleu metrics that element is
      a list of per-order scores, and only the requested order is returned.

    Raises:
    - KeyError: if ``metric`` is not one of the supported names.
    - ValueError: if ``results`` is empty.
    """
    bleu_orders = {"Bleu_1": 1, "Bleu_2": 2, "Bleu_3": 3, "Bleu_4": 4}
    supported_metrics = (*bleu_orders, "METEOR", "ROUGE_L", "CIDEr")

    # Reject an unknown metric before any indexing/tokenization work is done.
    # KeyError is kept because that is what the scorer-table lookup raised.
    if metric not in supported_metrics:
        raise KeyError(metric)

    # Fail early with a clear message instead of an obscure error from the
    # COCO result loader further down.
    if not results:
        raise ValueError("results must contain at least one sample")

    stored_results = []
    # In order to make the coco eval tools to successfully create index
    # We need at least two dict in the dataset
    # 'annotation' and 'images'
    # 'annotation' exactly reproduce the original annotation
    # 'images' however only need the image id which is contained in the file name
    dataset = {
        "info": {},
        "annotations": [],
        "images": []
    }
    ann_id = 0
    for idx, result in enumerate(results):
        stored_results.append({"image_id": idx, "caption": result["pred"]})

        # A bare string is one reference caption; iterating it directly would
        # register every single character as a separate reference.
        answers = result["answer"]
        if isinstance(answers, str):
            answers = [answers]

        for caption in answers:
            dataset["annotations"].append({"image_id": idx, "caption": caption, "id": ann_id})
            ann_id += 1

        dataset["images"].append({"id": idx})

    coco = COCO()
    # Manually create index here
    coco.dataset = dataset
    coco.createIndex()

    coco_result = coco.loadRes(stored_results)
    coco_eval = COCOEvalCap(coco, coco_result)

    # Collect references and predictions per image id
    gts = {}
    res = {}
    for img_id in coco_eval.params["image_id"]:
        gts[img_id] = coco_eval.coco.imgToAnns[img_id]
        res[img_id] = coco_eval.cocoRes.imgToAnns[img_id]

    eval_logger.info("tokenization...")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    eval_logger.info(f"Computing {metric} scores...")

    # Build only the requested scorer, and only now that the inputs are
    # ready, rather than instantiating all seven on every call.
    # NOTE(review): Meteor appears to start an external Java process when
    # constructed, so eager construction is costly - confirm against the
    # installed pycocoevalcap version.
    if metric in bleu_orders:
        scorer = Bleu(bleu_orders[metric])
    elif metric == "METEOR":
        scorer = Meteor()
    elif metric == "ROUGE_L":
        scorer = Rouge()
    else:
        scorer = Cider()

    score, _ = scorer.compute_score(gts, res)

    # BLEU returns a list of scores (orders 1..n), pick the requested n so the
    # function returns a single number for every metric.
    if isinstance(score, list):
        score = score[bleu_orders[metric] - 1]

    return score



In [5]:
a = {"doc_id": 0, "target": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "filtered_resps": ["This image is an aerial view of an airport terminal and surrounding area. The terminal"], "doc_hash": "74234e98afe7498fb5daf1f36ac2d78acc339464f950703b8c019892f982b90b", "dota_Bleu_4": {"anno_id": "10", "pred": "This image is an aerial view of an airport terminal and surrounding area. The terminal", "answer": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. 
There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "score": 0.0003138276315755334}, "dota_Bleu_3": {"anno_id": "10", "pred": "This image is an aerial view of an airport terminal and surrounding area. The terminal", "answer": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "score": 0.00031382763157644823}, "dota_Bleu_2": {"anno_id": "10", "pred": "This image is an aerial view of an airport terminal and surrounding area. The terminal", "answer": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. 
Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "score": 0.00031382763157727206}, "dota_Bleu_1": {"anno_id": "10", "pred": "This image is an aerial view of an airport terminal and surrounding area. The terminal", "answer": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. 
The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "score": 0.0003138276315780193}, "dota_ROUGE_L": {"anno_id": "10", "pred": "This image is an aerial view of an airport terminal and surrounding area. The terminal", "answer": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "score": 0.17359134889015365}, "dota_CIDEr": {"anno_id": "10", "pred": "This image is an aerial view of an airport terminal and surrounding area. The terminal", "answer": "This image is an aerial view of an airport terminal and surrounding area. The terminal is designed with multiple concourses and gates, each equipped with jet bridges for boarding and disembarking passengers. Several airplanes are parked at the gates, indicating active operations.\n\nThe terminal building is centrally located, with various runways and taxiways extending outward. The airport is surrounded by greenery, including trees and grassy areas, which provide a contrast to the man-made structures. 
There are also several buildings and facilities visible, likely related to airport operations such as maintenance, security, and support services.\n\nThe image shows a well-organized layout typical of major international airports, with clear demarcations for different functions and areas. The presence of multiple aircraft and the extensive infrastructure suggests that this is a significant airport handling a high volume of passenger traffic.", "score": 0}, "input": "Describe this image."}
b = {"doc_id": 1, "target": "The image shows a lake with residential properties and a lot of green trees.", "filtered_resps": ["The image is an aerial view of a lake surrounded by residential properties. The lake"], "doc_hash": "74234e98afe7498fb5daf1f36ac2d78acc339464f950703b8c019892f982b90b", "dota_Bleu_4": {"anno_id": "11", "pred": "The image is an aerial view of a lake surrounded by residential properties. The lake", "answer": "The image shows a lake with residential properties and a lot of green trees.", "score": 5.03174762615483e-09}, "dota_Bleu_3": {"anno_id": "11", "pred": "The image is an aerial view of a lake surrounded by residential properties. The lake", "answer": "The image shows a lake with residential properties and a lot of green trees.", "score": 1.9740230336070894e-06}, "dota_Bleu_2": {"anno_id": "11", "pred": "The image is an aerial view of a lake surrounded by residential properties. The lake", "answer": "The image shows a lake with residential properties and a lot of green trees.", "score": 0.31622776599500324}, "dota_Bleu_1": {"anno_id": "11", "pred": "The image is an aerial view of a lake surrounded by residential properties. The lake", "answer": "The image shows a lake with residential properties and a lot of green trees.", "score": 0.4666666666355556}, "dota_ROUGE_L": {"anno_id": "11", "pred": "The image is an aerial view of a lake surrounded by residential properties. The lake", "answer": "The image shows a lake with residential properties and a lot of green trees.", "score": 0.416382252559727}, "dota_CIDEr": {"anno_id": "11", "pred": "The image is an aerial view of a lake surrounded by residential properties. The lake", "answer": "The image shows a lake with residential properties and a lot of green trees.", "score": 0}, "input": "Describe this image."}
c = {"doc_id": 2, "target": "The image is an aerial view of a park or recreational area featuring several sports fields and facilities. Here are the key elements visible in the image:\n\n1. **Sports Fields**:\n   - There are multiple grassy fields, likely used for various sports such as soccer, football, and possibly baseball.\n   - The fields are surrounded by pathways and are bordered by trees and greenery.\n\n2. **Parking Area**:\n   - A large parking lot is situated adjacent to the sports fields. It appears to be quite spacious, with rows of parking spaces.\n\n3. **Lakes and Water Bodies**:\n   - There is a large body of water, possibly a lake or pond, located near the top right corner of the image. This water body is surrounded by trees and greenery.\n\n4. **Pathways and Trails**:\n   - Various paths and trails are visible throughout the area, connecting different parts of the park and providing access to the sports fields and the lake.\n\n5. **Trees and Vegetation**:\n   - The park is lush with trees and vegetation, indicating a well-maintained natural environment.\n\n6. **Buildings and Structures**:\n   - There are some buildings or structures visible near the parking lot, which might serve as restrooms, administrative", "filtered_resps": ["The image is an aerial view of a park or recreational area featuring several sports fields"], "doc_hash": "74234e98afe7498fb5daf1f36ac2d78acc339464f950703b8c019892f982b90b", "dota_Bleu_4": {"anno_id": "12", "pred": "The image is an aerial view of a park or recreational area featuring several sports fields", "answer": "The image is an aerial view of a park or recreational area featuring several sports fields and facilities. Here are the key elements visible in the image:\n\n1. **Sports Fields**:\n   - There are multiple grassy fields, likely used for various sports such as soccer, football, and possibly baseball.\n   - The fields are surrounded by pathways and are bordered by trees and greenery.\n\n2. 
**Parking Area**:\n   - A large parking lot is situated adjacent to the sports fields. It appears to be quite spacious, with rows of parking spaces.\n\n3. **Lakes and Water Bodies**:\n   - There is a large body of water, possibly a lake or pond, located near the top right corner of the image. This water body is surrounded by trees and greenery.\n\n4. **Pathways and Trails**:\n   - Various paths and trails are visible throughout the area, connecting different parts of the park and providing access to the sports fields and the lake.\n\n5. **Trees and Vegetation**:\n   - The park is lush with trees and vegetation, indicating a well-maintained natural environment.\n\n6. **Buildings and Structures**:\n   - There are some buildings or structures visible near the parking lot, which might serve as restrooms, administrative", "score": 1.0783428502639563e-05}, "dota_Bleu_3": {"anno_id": "12", "pred": "The image is an aerial view of a park or recreational area featuring several sports fields", "answer": "The image is an aerial view of a park or recreational area featuring several sports fields and facilities. Here are the key elements visible in the image:\n\n1. **Sports Fields**:\n   - There are multiple grassy fields, likely used for various sports such as soccer, football, and possibly baseball.\n   - The fields are surrounded by pathways and are bordered by trees and greenery.\n\n2. **Parking Area**:\n   - A large parking lot is situated adjacent to the sports fields. It appears to be quite spacious, with rows of parking spaces.\n\n3. **Lakes and Water Bodies**:\n   - There is a large body of water, possibly a lake or pond, located near the top right corner of the image. This water body is surrounded by trees and greenery.\n\n4. **Pathways and Trails**:\n   - Various paths and trails are visible throughout the area, connecting different parts of the park and providing access to the sports fields and the lake.\n\n5. 
**Trees and Vegetation**:\n   - The park is lush with trees and vegetation, indicating a well-maintained natural environment.\n\n6. **Buildings and Structures**:\n   - There are some buildings or structures visible near the parking lot, which might serve as restrooms, administrative", "score": 1.0783428502666679e-05}, "dota_Bleu_2": {"anno_id": "12", "pred": "The image is an aerial view of a park or recreational area featuring several sports fields", "answer": "The image is an aerial view of a park or recreational area featuring several sports fields and facilities. Here are the key elements visible in the image:\n\n1. **Sports Fields**:\n   - There are multiple grassy fields, likely used for various sports such as soccer, football, and possibly baseball.\n   - The fields are surrounded by pathways and are bordered by trees and greenery.\n\n2. **Parking Area**:\n   - A large parking lot is situated adjacent to the sports fields. It appears to be quite spacious, with rows of parking spaces.\n\n3. **Lakes and Water Bodies**:\n   - There is a large body of water, possibly a lake or pond, located near the top right corner of the image. This water body is surrounded by trees and greenery.\n\n4. **Pathways and Trails**:\n   - Various paths and trails are visible throughout the area, connecting different parts of the park and providing access to the sports fields and the lake.\n\n5. **Trees and Vegetation**:\n   - The park is lush with trees and vegetation, indicating a well-maintained natural environment.\n\n6. **Buildings and Structures**:\n   - There are some buildings or structures visible near the parking lot, which might serve as restrooms, administrative", "score": 1.0783428502691282e-05}, "dota_Bleu_1": {"anno_id": "12", "pred": "The image is an aerial view of a park or recreational area featuring several sports fields", "answer": "The image is an aerial view of a park or recreational area featuring several sports fields and facilities. 
Here are the key elements visible in the image:\n\n1. **Sports Fields**:\n   - There are multiple grassy fields, likely used for various sports such as soccer, football, and possibly baseball.\n   - The fields are surrounded by pathways and are bordered by trees and greenery.\n\n2. **Parking Area**:\n   - A large parking lot is situated adjacent to the sports fields. It appears to be quite spacious, with rows of parking spaces.\n\n3. **Lakes and Water Bodies**:\n   - There is a large body of water, possibly a lake or pond, located near the top right corner of the image. This water body is surrounded by trees and greenery.\n\n4. **Pathways and Trails**:\n   - Various paths and trails are visible throughout the area, connecting different parts of the park and providing access to the sports fields and the lake.\n\n5. **Trees and Vegetation**:\n   - The park is lush with trees and vegetation, indicating a well-maintained natural environment.\n\n6. **Buildings and Structures**:\n   - There are some buildings or structures visible near the parking lot, which might serve as restrooms, administrative", "score": 1.0783428502713748e-05}, "dota_ROUGE_L": {"anno_id": "12", "pred": "The image is an aerial view of a park or recreational area featuring several sports fields", "answer": "The image is an aerial view of a park or recreational area featuring several sports fields and facilities. Here are the key elements visible in the image:\n\n1. **Sports Fields**:\n   - There are multiple grassy fields, likely used for various sports such as soccer, football, and possibly baseball.\n   - The fields are surrounded by pathways and are bordered by trees and greenery.\n\n2. **Parking Area**:\n   - A large parking lot is situated adjacent to the sports fields. It appears to be quite spacious, with rows of parking spaces.\n\n3. **Lakes and Water Bodies**:\n   - There is a large body of water, possibly a lake or pond, located near the top right corner of the image. 
This water body is surrounded by trees and greenery.\n\n4. **Pathways and Trails**:\n   - Various paths and trails are visible throughout the area, connecting different parts of the park and providing access to the sports fields and the lake.\n\n5. **Trees and Vegetation**:\n   - The park is lush with trees and vegetation, indicating a well-maintained natural environment.\n\n6. **Buildings and Structures**:\n   - There are some buildings or structures visible near the parking lot, which might serve as restrooms, administrative", "score": 0.12903225806451613}, "dota_CIDEr": {"anno_id": "12", "pred": "The image is an aerial view of a park or recreational area featuring several sports fields", "answer": "The image is an aerial view of a park or recreational area featuring several sports fields and facilities. Here are the key elements visible in the image:\n\n1. **Sports Fields**:\n   - There are multiple grassy fields, likely used for various sports such as soccer, football, and possibly baseball.\n   - The fields are surrounded by pathways and are bordered by trees and greenery.\n\n2. **Parking Area**:\n   - A large parking lot is situated adjacent to the sports fields. It appears to be quite spacious, with rows of parking spaces.\n\n3. **Lakes and Water Bodies**:\n   - There is a large body of water, possibly a lake or pond, located near the top right corner of the image. This water body is surrounded by trees and greenery.\n\n4. **Pathways and Trails**:\n   - Various paths and trails are visible throughout the area, connecting different parts of the park and providing access to the sports fields and the lake.\n\n5. **Trees and Vegetation**:\n   - The park is lush with trees and vegetation, indicating a well-maintained natural environment.\n\n6. **Buildings and Structures**:\n   - There are some buildings or structures visible near the parking lot, which might serve as restrooms, administrative", "score": 0}, "input": "Describe this image."}


In [6]:
from pycocoevalcap.cider.cider import Cider

# Map each image id to its evaluation record (the dicts a, b, c defined above),
# then derive both CIDEr inputs from that single mapping.
records = {'img1': a, 'img2': b, 'img3': c}

# One reference caption per image: the record's "target" text wrapped in a list.
references = {image_id: [record["target"]] for image_id, record in records.items()}

# Candidate captions: the record's "filtered_resps" list, passed through unchanged.
candidates = {image_id: record["filtered_resps"] for image_id, record in records.items()}

cider_scorer = Cider()

# compute_score returns the corpus-level average and the per-image scores.
score, scores = cider_scorer.compute_score(references, candidates)

print(f"Average CIDEr score: {score:.4f}")
# print("CIDEr scores per image:")
# print(scores)

Average CIDEr score: 0.2564


# Grounding Eval Code
- return [0,0,0,0] if no objects detected
- supports prediction of multiple bounding boxes

In [9]:
# Sample model responses used to exercise the bounding-box parser below.
a = "There are none."  # text without any bracketed box
b = "[174, 108, 439, 560]; [262, 277, 512, 643]; [291, 1271, 430, 1404]"  # three boxes
c = "[174, 108, 439, 560]"  # a single box
d = ""  # empty response

In [2]:
import re

def parse_float_sequence_within(input_str):
    """
    Extract every bracketed group of four numbers (a bounding box) from a string.

    A box is written as ``[x1, y1, x2, y2]`` where each value is an optionally
    negative integer or decimal (e.g. ``12`` or ``-3.5``). Text surrounding the
    brackets is ignored.

    Args:
    input_str (str): Text that may contain zero or more bracketed boxes.
        Non-string input (e.g. ``None``) is treated as containing no boxes.

    Returns:
    list: When at least one box is found, a list of bounding boxes, each a list
          of four floats. When no box is found, the flat placeholder box
          ``[0, 0, 0, 0]`` is returned -- note this is a single box, not an
          empty list and not a list of boxes, so callers must handle both shapes.
    """
    # A missing / non-text response cannot contain a box; return the same
    # placeholder as for text without boxes instead of failing inside re.findall.
    if not isinstance(input_str, str):
        return [0, 0, 0, 0]

    # One captured number: optional minus sign, digits, optional decimal part.
    number = r"(-?\d+(?:\.\d+)?)"
    # Four comma-separated numbers inside square brackets; whitespace is allowed
    # after the opening bracket, after each comma and before the closing bracket.
    pattern = r"\[\s*" + r",\s*".join([number] * 4) + r"\s*\]"

    # With four capture groups, findall yields one 4-tuple of strings per box.
    matches = re.findall(pattern, input_str)
    if matches:
        return [[float(value) for value in box] for box in matches]

    # No valid box found: return the flat placeholder box (fresh list per call).
    return [0, 0, 0, 0]


In [3]:
parse_float_sequence_within(a)

[0, 0, 0, 0]

In [4]:
parse_float_sequence_within(b)

[[174.0, 108.0, 439.0, 560.0],
 [262.0, 277.0, 512.0, 643.0],
 [291.0, 1271.0, 430.0, 1404.0]]

In [8]:
parse_float_sequence_within("0")

[0, 0, 0, 0]

In [4]:
def compute_iou(box1, box2):
    """
    Compute the Intersection over Union (IoU) of two bounding boxes.

    Parameters:
    - box1 (list of float): Bounding box [x_min, y_min, x_max, y_max].
    - box2 (list of float): Bounding box [x_min, y_min, x_max, y_max].

    Returns:
    - float: IoU of box1 and box2. Returns 0.0 when the union has no area
      (e.g. both boxes are degenerate, such as [0, 0, 0, 0]) instead of
      dividing by zero.
    """
    # Determine the coordinates of the intersection rectangle
    x_left = max(box1[0], box2[0])
    y_top = max(box1[1], box2[1])
    x_right = min(box1[2], box2[2])
    y_bottom = min(box1[3], box2[3])

    # Clamp each side at 0 so non-overlapping boxes give an empty intersection
    intersection_area = max(0, x_right - x_left) * max(0, y_bottom - y_top)

    # Compute the area of both bounding boxes
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])

    # Compute the area of the union
    union_area = box1_area + box2_area - intersection_area

    # A non-positive union means there is nothing to overlap with; the
    # intersection is necessarily 0 in that case, so the IoU is defined as 0.
    if union_area <= 0:
        return 0.0

    return intersection_area / union_area

def compute_accuracy(box1, box2, threshold=0.5):
    """Return whether the IoU of ``box1`` and ``box2`` is at least ``threshold``."""
    overlap = compute_iou(box1, box2)
    return overlap >= threshold

def compute_center_accuracy(box1, box2):
    """
    Check whether the center point of ``box2`` lies inside ``box1``.

    Both boxes are indexed as [x_min, y_min, x_max, y_max]; points on the
    border of ``box1`` count as inside.
    """
    center_x = (box2[0] + box2[2]) / 2
    center_y = (box2[1] + box2[3]) / 2

    # Horizontal containment first; bail out early when it already fails.
    if not (box1[0] <= center_x <= box1[2]):
        return False
    return box1[1] <= center_y <= box1[3]

def _as_box_list(boxes):
    """Wrap a single flat [x_min, y_min, x_max, y_max] box into a one-element list of boxes."""
    if boxes and isinstance(boxes[0], (int, float)):
        return [boxes]
    return boxes


def match_bboxes(gt_bboxes, pred_bboxes, scorer):
    """
    Match GT bboxes to pred bboxes using greedy matching.

    GT boxes are visited in order; each one claims the not-yet-claimed
    prediction with the highest strictly positive score, so a prediction is
    used for at most one GT box.

    Args:
    - gt_bboxes (list): Ground-truth boxes, each [x_min, y_min, x_max, y_max].
    - pred_bboxes (list): Predicted boxes in the same format. A single flat
      box such as the [0, 0, 0, 0] placeholder produced by
      parse_float_sequence_within is accepted and treated as one box.
    - scorer (callable): Called as scorer(gt_box, pred_box); returns a number.

    Returns:
    - list of float: The best score for each GT box (0.0 if nothing matched).
      When there are no GT boxes, one 0.0 per predicted box is returned
      instead; when there are no predictions, one 0.0 per GT box.
    """
    # Accept a lone flat box on either side so scorer always receives whole boxes
    # rather than individual coordinates.
    gt_bboxes = _as_box_list(gt_bboxes)
    pred_bboxes = _as_box_list(pred_bboxes)

    if not gt_bboxes:
        return [0.0] * len(pred_bboxes)
    if not pred_bboxes:
        return [0.0] * len(gt_bboxes)

    matched = set()  # indices of predictions already claimed by a GT box
    scores = []
    for gt in gt_bboxes:
        best_score = 0.0
        best_pred = None
        for i, pred in enumerate(pred_bboxes):
            if i in matched:
                continue
            score = scorer(gt, pred)
            # Strict comparison: zero-score predictions are never claimed.
            if score > best_score:
                best_score = score
                best_pred = i
        if best_pred is not None:
            matched.add(best_pred)
        scores.append(best_score)
    return scores

def dota_bbox_rec_aggregation_result(results, metric):
    """
    Aggregate the results of the dota evaluation task using the specified metric.

    For every result the GT boxes are matched against the predicted boxes with
    match_bboxes, the per-box scores are averaged into one per-sample score,
    and the per-sample scores are averaged into the final score. The list of
    per-sample scores and the final score are printed.

    Args:
    - results (list of dict): List of result dictionaries, each with a "bbox"
      entry (ground-truth boxes) and a "pred" entry (predicted boxes).
    - metric (str): Metric to use for aggregation. One of "IoU", "ACC@0.1",
      "ACC@0.3", "ACC@0.5", "ACC@0.7", "ACC@0.9", "Center_ACC".

    Returns:
    - float: Mean of the per-sample scores; 0.0 when ``results`` is empty.

    Raises:
    - KeyError: If ``metric`` is not one of the supported names.
    """

    scorers = {
        "IoU": compute_iou,
        "ACC@0.1": lambda x, y: float(compute_accuracy(x, y, 0.1)),
        "ACC@0.3": lambda x, y: float(compute_accuracy(x, y, 0.3)),
        "ACC@0.5": lambda x, y: float(compute_accuracy(x, y, 0.5)),
        "ACC@0.7": lambda x, y: float(compute_accuracy(x, y, 0.7)),
        "ACC@0.9": lambda x, y: float(compute_accuracy(x, y, 0.9)),
        "Center_ACC": lambda x, y: float(compute_center_accuracy(x, y)),
    }

    scorer = scorers[metric]
    scores_all = []
    for result in results:
        scores = match_bboxes(result["bbox"], result["pred"], scorer)
        # A sample with nothing to score (no GT and no predictions) counts as 0.0.
        scores_all.append(sum(scores) / len(scores) if scores else 0.0)
    print(scores_all)
    # Guard the final average so an empty result list yields 0.0 instead of
    # raising ZeroDivisionError.
    final_score = sum(scores_all) / len(scores_all) if scores_all else 0.0
    print(f"Aggregated {metric} score: {final_score}")
    return final_score


In [5]:
# Four hand-made samples; "bbox" holds ground-truth boxes, "pred" holds predicted boxes.
results = [
    # Two GT boxes, each with a close prediction, plus one unmatched false positive.
    {
        "bbox": [[10, 10, 50, 50], [100, 100, 150, 150]],
        "pred": [[12, 12, 48, 48], [102, 102, 148, 148], [200, 200, 220, 220]],
    },
    # One GT box with a single, smaller prediction inside it.
    {"bbox": [[30, 30, 60, 60]], "pred": [[35, 35, 55, 55]]},
    # No objects in GT, but one box was predicted.
    {"bbox": [], "pred": [[10, 10, 30, 30]]},
    # Two GT boxes and no predictions at all (both missed).
    {"bbox": [[10, 10, 30, 30], [40, 40, 60, 60]], "pred": []},
]


In [9]:
# Alternative minimal input (disabled): one GT box and one predicted box per sample.
# results = [
#     {"bbox": [[10, 10, 50, 50]], "pred": [[12, 12, 48, 48]]},
#     {"bbox": [[100, 100, 150, 150]], "pred": [[95, 95, 145, 145]]},
# ]

# Run the aggregation over `results` with the "ACC@0.9" metric
# (accuracy at an IoU threshold of 0.9).
dota_bbox_rec_aggregation_result(results, "ACC@0.9")

[0.0, 0.0, 0.0, 0.0]
Aggregated ACC@0.9 score: 0.0


0.0

# Ignore Punctuation for ChartQA

In [10]:
import string

def remove_punctuation(s):
    """Return ``s`` with every character listed in ``string.punctuation`` removed."""
    # Translation table whose third argument lists the characters to delete.
    deletion_table = str.maketrans("", "", string.punctuation)
    stripped = s.translate(deletion_table)
    return stripped

In [11]:
remove_punctuation("hi!")

'hi'

# Visualize for Best Data-Mix

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Demo data: one row of per-model scores for each of the five conditions.
model_names = ['Model A', 'Model B', 'Model C', 'Model D', 'Model E', 'Model F', 'Model G']
performance_sets = [
    [0.85, 0.78, 0.92, 0.88, 0.81, 0.75, 0.89],  # Set 1
    [0.83, 0.76, 0.90, 0.86, 0.79, 0.72, 0.87],  # Set 2
    [0.80, 0.74, 0.91, 0.85, 0.77, 0.71, 0.86],  # Set 3
    [0.82, 0.75, 0.89, 0.84, 0.78, 0.73, 0.88],  # Set 4
    [0.84, 0.77, 0.93, 0.87, 0.80, 0.74, 0.90],  # Set 5
]
labels = ['Condition 1', 'Condition 2', 'Condition 3', 'Condition 4', 'Condition 5']
colors = ['steelblue', 'darkorange', 'forestgreen', 'mediumpurple', 'crimson']

# Grouped-bar geometry: one tick per model, one bar per set around each tick.
x = np.arange(len(model_names))
total_sets = len(performance_sets)
width = 0.15  # narrow bars so all sets fit side by side

# Single figure with one Axes to draw on.
fig, ax = plt.subplots(figsize=(12, 6))

for i, scores in enumerate(performance_sets):
    # Shift each set so that the whole group is centered on the model's tick.
    offset = (i - total_sets / 2) * width + width / 2
    bars = ax.bar(x + offset, scores, width, label=labels[i], color=colors[i])

    # Value label just above every bar (optional, remove if cluttered).
    for bar in bars:
        height = bar.get_height()
        center = bar.get_x() + bar.get_width() / 2
        ax.text(center, height + 0.005, f'{height:.2f}', ha='center', va='bottom', fontsize=7)

# Titles, ticks and axis limits.
ax.set_title('Model Performance Across 5 Conditions')
ax.set_xlabel('Model')
ax.set_ylabel('Performance Metric')
ax.set_xticks(x)
ax.set_xticklabels(model_names)
# Leave headroom above the tallest bar for its value label.
ax.set_ylim(0, max(map(max, performance_sets)) + 0.1)
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.legend()
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Models and scores: one row of per-model scores for each of the four metrics.
model_names = ['Model A', 'Model B', 'Model C', 'Model D', 'Model E', 'Model F', 'Model G']
performance_sets = [
    [0.85, 0.78, 0.92, 0.88, 0.81, 0.75, 0.89],  # Metric 1
    [0.83, 0.76, 0.90, 0.86, 0.79, 0.72, 0.87],  # Metric 2
    [0.80, 0.74, 0.91, 0.85, 0.77, 0.71, 0.86],  # Metric 3
    [0.82, 0.75, 0.89, 0.84, 0.78, 0.73, 0.88],  # Metric 4
]
large_scale_set = [2000, 2500, 1800, 2200, 2100, 2600, 2300]  # e.g., model size in MB

# Settings
labels = ['Metric 1', 'Metric 2', 'Metric 3', 'Metric 4']
colors = ['steelblue', 'darkorange', 'forestgreen', 'mediumpurple']
x = np.arange(len(model_names))
width = 0.15
total_sets = len(performance_sets)

fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot the 0-1 scaled performance sets on the primary y-axis
for i, (scores, label, color) in enumerate(zip(performance_sets, labels, colors)):
    # Bars are center-aligned on their x position, so spreading the bar centers
    # symmetrically around 0 centers the whole group on the model's tick
    # (a fixed "i - 2" would leave a group of four half a bar-width to the left).
    offset = (i - (total_sets - 1) / 2) * width
    ax1.bar(x + offset, scores, width, label=label, color=color)

ax1.set_ylabel("Performance Metric (0–1)")
ax1.set_ylim(0, 1.05)
ax1.set_xticks(x)
ax1.set_xticklabels(model_names)
ax1.grid(axis='y', linestyle='--', alpha=0.6)

# Secondary y-axis (sharing the x-axis) for the large-scale metric
ax2 = ax1.twinx()
ax2.plot(x, large_scale_set, 'o--', color='crimson', label='Model Size (MB)', linewidth=2, markersize=6)
ax2.set_ylabel("Model Size (MB)")
ax2.set_ylim(0, max(large_scale_set) * 1.1)

# Legends (combine handles from both axes into a single legend)
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

plt.title("Model Performance Metrics + Model Size")
plt.tight_layout()
plt.show()
