In [2]:
from datasets import load_dataset
from transformers import BlipProcessor, BlipForConditionalGeneration
from rouge import Rouge
from bert_score import score
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score
from pycocoevalcap.cider.cider import Cider

In [3]:
# Load the NoCaps dataset by its repository id.
nocaps_repo_id = "lmms-lab/NoCaps"
ds_nocaps = load_dataset(nocaps_repo_id)

# Display the loaded splits and their columns.
ds_nocaps

DatasetDict({
    validation: Dataset({
        features: ['image', 'image_coco_url', 'image_date_captured', 'image_file_name', 'image_height', 'image_width', 'image_id', 'image_license', 'image_open_images_id', 'annotations_ids', 'annotations_captions'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['image', 'image_coco_url', 'image_date_captured', 'image_file_name', 'image_height', 'image_width', 'image_id', 'image_license', 'image_open_images_id', 'annotations_ids', 'annotations_captions'],
        num_rows: 10600
    })
})

In [4]:
def check_for_children(caption_list, keywords=("child", "girl", "boy", "baby")):
    """Return True if any caption contains one of the child-related keywords.

    Matching is a case-insensitive *substring* test, so "child" also matches
    "children" and "boy" also matches "cowboy".

    Args:
        caption_list: Iterable of caption strings. ``None`` yields False, and
            empty or ``None`` entries inside the iterable are skipped.
        keywords: Substrings to search for. Empty keywords are ignored.

    Returns:
        bool: True as soon as one caption contains one keyword, else False.
    """
    if caption_list is None:
        return False

    # Normalise the keywords once instead of once per caption.
    needles = tuple(keyword.lower() for keyword in keywords if keyword)

    for caption in caption_list:
        if not caption:
            continue
        lowered = caption.lower()  # lowercase each caption a single time
        if any(needle in lowered for needle in needles):
            return True
    return False

In [5]:
# Keep only the validation examples whose reference captions mention a child.
# The predicate receives just the captions column, so no temporary flag column
# is added to ds_nocaps["validation"]; the cell therefore leaves the source
# split untouched and can be re-run without a duplicate-column error.
filter_ds_nocaps = ds_nocaps["validation"].filter(
    check_for_children, input_columns=["annotations_captions"]
)

# Drop the metadata columns that the evaluation below does not use.
filter_ds_nocaps = filter_ds_nocaps.remove_columns(
    [
        "annotations_ids",
        "image_open_images_id",
        "image_license",
        "image_width",
        "image_height",
        "image_date_captured",
        "image_coco_url",
    ]
)

# Shorter column names for the remaining fields.
filter_ds_nocaps = filter_ds_nocaps.rename_columns(
    {
        "annotations_captions": "caption",
        "image_file_name": "filename",
        "image_id": "img_id",
    }
)

filter_ds_nocaps

Dataset({
    features: ['image', 'filename', 'img_id', 'caption'],
    num_rows: 667
})

In [6]:
# Model 1
checkpoint = "abhijit2111/Pic2Story"
# The processor and the captioning model are loaded from the same checkpoint.
processor = BlipProcessor.from_pretrained(checkpoint)
model = BlipForConditionalGeneration.from_pretrained(checkpoint)

In [7]:
predicted_captions = []
reference_captions = []

# Caption the first five filtered examples with the currently loaded model.
for i in range(5):
    # Read the row once and reuse it for both the image and its references,
    # instead of indexing the dataset twice per iteration.
    example = filter_ds_nocaps[i]
    inputs = processor(example["image"], return_tensors="pt")
    out = model.generate(**inputs)
    predicted_captions.append(processor.decode(out[0], skip_special_tokens=True))
    reference_captions.append(example["caption"])

In [8]:
# Both dicts are keyed by sample position: references map to the list of
# reference captions, predictions to a one-element list with the generated text.
reference_dict = dict(enumerate(reference_captions))
predicted_dict = {idx: [text] for idx, text in enumerate(predicted_captions)}

cider_scorer = Cider()

# compute_score returns a pair; only its first element is reported.
cider_score, _ = cider_scorer.compute_score(reference_dict, predicted_dict)

print(f"CIDEr Score: {cider_score:.3f}")

CIDEr Score: 0.386


In [9]:
# BERTScore of every prediction against its reference captions.
bertscore_options = {"lang": "en", "verbose": False}
P, R, F1 = score(predicted_captions, reference_captions, **bertscore_options)

# Reduce each result to a single float before formatting.
mean_precision = P.mean().item()
mean_recall = R.mean().item()
mean_f1 = F1.mean().item()

print(f"BERTScore Precision: {mean_precision:.3f}")
print(f"BERTScore Recall: {mean_recall:.3f}")
print(f"BERTScore F1: {mean_f1:.3f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.900
BERTScore Recall: 0.939
BERTScore F1: 0.916


In [10]:
# Tokenize the generated captions and every reference caption.
predicted_captions_tokenized = list(map(word_tokenize, predicted_captions))
reference_captions_tokenized = [
    list(map(word_tokenize, refs)) for refs in reference_captions
]

# One METEOR value per sample: meteor_score(references, hypothesis).
scores = list(
    map(meteor_score, reference_captions_tokenized, predicted_captions_tokenized)
)
average_score = sum(scores) / len(scores)

print(f"METEOR Score: {average_score:.3f}")

METEOR Score: 0.504


In [11]:
# Each sample's reference captions are joined into one ". "-separated string
# and scored as a single reference.
# NOTE(review): recall is therefore measured against all references combined.
ref_captions = list(map(". ".join, reference_captions))

rouge = Rouge()

# avg=True returns one averaged {'r', 'p', 'f'} entry per ROUGE variant.
scores = rouge.get_scores(predicted_captions, ref_captions, avg=True)

for label, metric_key in (
    ("ROUGE-1 Scores:", "rouge-1"),
    ("ROUGE-2 Scores:", "rouge-2"),
    ("ROUGE-l Scores:", "rouge-l"),
):
    print(label, scores[metric_key])

ROUGE-1 Scores: {'r': 0.22548692185107386, 'p': 0.7040170940170939, 'f': 0.33676125066644025}
ROUGE-2 Scores: {'r': 0.07954457867616299, 'p': 0.40660130718954246, 'f': 0.1323900260738019}
ROUGE-l Scores: {'r': 0.20980386412604224, 'p': 0.6496367521367521, 'f': 0.3129299195012005}


In [12]:
# Model 2
checkpoint = "Salesforce/blip-image-captioning-base"
# The processor and the captioning model are loaded from the same checkpoint.
processor = BlipProcessor.from_pretrained(checkpoint)
model = BlipForConditionalGeneration.from_pretrained(checkpoint)

In [13]:
predicted_captions = []
reference_captions = []

# Caption the first five filtered examples with the currently loaded model.
for i in range(5):
    # Read the row once and reuse it for both the image and its references,
    # instead of indexing the dataset twice per iteration.
    example = filter_ds_nocaps[i]
    inputs = processor(example["image"], return_tensors="pt")
    out = model.generate(**inputs)
    predicted_captions.append(processor.decode(out[0], skip_special_tokens=True))
    reference_captions.append(example["caption"])



In [14]:
# Both dicts are keyed by sample position: references map to the list of
# reference captions, predictions to a one-element list with the generated text.
reference_dict = dict(enumerate(reference_captions))
predicted_dict = {idx: [text] for idx, text in enumerate(predicted_captions)}

cider_scorer = Cider()

# compute_score returns a pair; only its first element is reported.
cider_score, _ = cider_scorer.compute_score(reference_dict, predicted_dict)

print(f"CIDEr Score: {cider_score:.3f}")

CIDEr Score: 0.787


In [15]:
# BERTScore of every prediction against its reference captions.
bertscore_options = {"lang": "en", "verbose": False}
P, R, F1 = score(predicted_captions, reference_captions, **bertscore_options)

# Reduce each result to a single float before formatting.
mean_precision = P.mean().item()
mean_recall = R.mean().item()
mean_f1 = F1.mean().item()

print(f"BERTScore Precision: {mean_precision:.3f}")
print(f"BERTScore Recall: {mean_recall:.3f}")
print(f"BERTScore F1: {mean_f1:.3f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.950
BERTScore Recall: 0.950
BERTScore F1: 0.948


In [16]:
# Tokenize the generated captions and every reference caption.
predicted_captions_tokenized = list(map(word_tokenize, predicted_captions))
reference_captions_tokenized = [
    list(map(word_tokenize, refs)) for refs in reference_captions
]

# One METEOR value per sample: meteor_score(references, hypothesis).
scores = list(
    map(meteor_score, reference_captions_tokenized, predicted_captions_tokenized)
)
average_score = sum(scores) / len(scores)

print(f"METEOR Score: {average_score:.3f}")

METEOR Score: 0.573


In [17]:
# Each sample's reference captions are joined into one ". "-separated string
# and scored as a single reference.
# NOTE(review): recall is therefore measured against all references combined.
ref_captions = list(map(". ".join, reference_captions))

rouge = Rouge()

# avg=True returns one averaged {'r', 'p', 'f'} entry per ROUGE variant.
scores = rouge.get_scores(predicted_captions, ref_captions, avg=True)

for label, metric_key in (
    ("ROUGE-1 Scores:", "rouge-1"),
    ("ROUGE-2 Scores:", "rouge-2"),
    ("ROUGE-l Scores:", "rouge-l"),
):
    print(label, scores[metric_key])

ROUGE-1 Scores: {'r': 0.15737122523282476, 'p': 0.9, 'f': 0.26503357260843846}
ROUGE-2 Scores: {'r': 0.06074103427391693, 'p': 0.6452020202020202, 'f': 0.11015277620490949}
ROUGE-l Scores: {'r': 0.15737122523282476, 'p': 0.9, 'f': 0.26503357260843846}


In [18]:
# Model 3
checkpoint = "Salesforce/blip-image-captioning-large"
# The processor and the captioning model are loaded from the same checkpoint.
processor = BlipProcessor.from_pretrained(checkpoint)
model = BlipForConditionalGeneration.from_pretrained(checkpoint)

In [19]:
predicted_captions = []
reference_captions = []

# Caption the first five filtered examples with the currently loaded model.
for i in range(5):
    # Read the row once and reuse it for both the image and its references,
    # instead of indexing the dataset twice per iteration.
    example = filter_ds_nocaps[i]
    inputs = processor(example["image"], return_tensors="pt")
    out = model.generate(**inputs)
    predicted_captions.append(processor.decode(out[0], skip_special_tokens=True))
    reference_captions.append(example["caption"])

In [20]:
# Both dicts are keyed by sample position: references map to the list of
# reference captions, predictions to a one-element list with the generated text.
reference_dict = dict(enumerate(reference_captions))
predicted_dict = {idx: [text] for idx, text in enumerate(predicted_captions)}

cider_scorer = Cider()

# compute_score returns a pair; only its first element is reported.
cider_score, _ = cider_scorer.compute_score(reference_dict, predicted_dict)

print(f"CIDEr Score: {cider_score:.3f}")

CIDEr Score: 0.847


In [21]:
# BERTScore of every prediction against its reference captions.
bertscore_options = {"lang": "en", "verbose": False}
P, R, F1 = score(predicted_captions, reference_captions, **bertscore_options)

# Reduce each result to a single float before formatting.
mean_precision = P.mean().item()
mean_recall = R.mean().item()
mean_f1 = F1.mean().item()

print(f"BERTScore Precision: {mean_precision:.3f}")
print(f"BERTScore Recall: {mean_recall:.3f}")
print(f"BERTScore F1: {mean_f1:.3f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.930
BERTScore Recall: 0.941
BERTScore F1: 0.934


In [22]:
# Tokenize the generated captions and every reference caption.
predicted_captions_tokenized = list(map(word_tokenize, predicted_captions))
reference_captions_tokenized = [
    list(map(word_tokenize, refs)) for refs in reference_captions
]

# One METEOR value per sample: meteor_score(references, hypothesis).
scores = list(
    map(meteor_score, reference_captions_tokenized, predicted_captions_tokenized)
)
average_score = sum(scores) / len(scores)

print(f"METEOR Score: {average_score:.3f}")

METEOR Score: 0.573


In [23]:
# Each sample's reference captions are joined into one ". "-separated string
# and scored as a single reference.
# NOTE(review): recall is therefore measured against all references combined.
ref_captions = list(map(". ".join, reference_captions))

rouge = Rouge()

# avg=True returns one averaged {'r', 'p', 'f'} entry per ROUGE variant.
scores = rouge.get_scores(predicted_captions, ref_captions, avg=True)

for label, metric_key in (
    ("ROUGE-1 Scores:", "rouge-1"),
    ("ROUGE-2 Scores:", "rouge-2"),
    ("ROUGE-l Scores:", "rouge-l"),
):
    print(label, scores[metric_key])

ROUGE-1 Scores: {'r': 0.17474507184376725, 'p': 0.8122222222222224, 'f': 0.28590673369714664}
ROUGE-2 Scores: {'r': 0.06471296960726579, 'p': 0.5334188034188034, 'f': 0.11521377127903625}
ROUGE-l Scores: {'r': 0.16191205120586855, 'p': 0.758888888888889, 'f': 0.2654790607411718}
