In [2]:
from datasets import load_dataset
from transformers import BlipProcessor, BlipForConditionalGeneration
from rouge import Rouge
from bert_score import score
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score
from pycocoevalcap.cider.cider import Cider

In [4]:
ds_flickr30k = load_dataset("lmms-lab/flickr30k")

ds_flickr30k

DatasetDict({
    test: Dataset({
        features: ['image', 'caption', 'sentids', 'img_id', 'filename'],
        num_rows: 31783
    })
})

In [5]:
def check_for_children(caption_list):
    keywords = ["child", "girl", "boy", "baby"]
    return any(
        keyword in caption.lower() for keyword in keywords for caption in caption_list
    )

In [6]:
ds_flickr30k["test"] = ds_flickr30k["test"].add_column(
    "has_children",
    [check_for_children(example["caption"]) for example in ds_flickr30k["test"]],
)

filter_ds_flickr30k = ds_flickr30k["test"].filter(
    lambda example: example["has_children"]
)
filter_ds_flickr30k = filter_ds_flickr30k.remove_columns(["sentids", "has_children"])

filter_ds_flickr30k

Dataset({
    features: ['image', 'caption', 'img_id', 'filename'],
    num_rows: 10361
})

In [7]:
# Модель №1
processor = BlipProcessor.from_pretrained("abhijit2111/Pic2Story")
model = BlipForConditionalGeneration.from_pretrained("abhijit2111/Pic2Story")

In [19]:
predicted_captions = []
reference_captions = []

for i in range(5):
    inputs = processor(filter_ds_flickr30k[i]["image"], return_tensors="pt")
    out = model.generate(**inputs)
    predicted_captions.append(processor.decode(out[0], skip_special_tokens=True))
    reference_captions.append(filter_ds_flickr30k[i]["caption"])

In [21]:
predicted_dict = {i: [caption] for i, caption in enumerate(predicted_captions)}
reference_dict = {i: captions for i, captions in enumerate(reference_captions)}

cider_scorer = Cider()

cider_score, _ = cider_scorer.compute_score(reference_dict, predicted_dict)

print(f"CIDEr Score: {cider_score:.3f}")

CIDEr Score: 0.209


In [22]:
P, R, F1 = score(predicted_captions, reference_captions, lang="en", verbose=False)

print(f"BERTScore Precision: {P.mean().item():.3f}")
print(f"BERTScore Recall: {R.mean().item():.3f}")
print(f"BERTScore F1: {F1.mean().item():.3f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.892
BERTScore Recall: 0.918
BERTScore F1: 0.901


In [23]:
predicted_captions_tokenized = [
    word_tokenize(caption) for caption in predicted_captions
]
reference_captions_tokenized = [
    [word_tokenize(caption) for caption in ref] for ref in reference_captions
]

scores = [
    meteor_score(ref, gen)
    for ref, gen in zip(reference_captions_tokenized, predicted_captions_tokenized)
]
average_score = sum(scores) / len(scores)

print(f"METEOR Score: {average_score:.3f}")

METEOR Score: 0.366


In [24]:
ref_captions = [". ".join(ref) for ref in reference_captions]

rouge = Rouge()

scores = rouge.get_scores(predicted_captions, ref_captions, avg=True)

print("ROUGE-1 Scores:", scores["rouge-1"])
print("ROUGE-2 Scores:", scores["rouge-2"])
print("ROUGE-l Scores:", scores["rouge-l"])

ROUGE-1 Scores: {'r': 0.2724145557162635, 'p': 0.507704678362573, 'f': 0.35370769543976177}
ROUGE-2 Scores: {'r': 0.04073425603819765, 'p': 0.09726084373143198, 'f': 0.057134230101694584}
ROUGE-l Scores: {'r': 0.2659629428130377, 'p': 0.49717836257309944, 'f': 0.34570769543976176}


In [25]:
# Модель №2
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)

In [26]:
predicted_captions = []
reference_captions = []

for i in range(5):
    inputs = processor(filter_ds_flickr30k[i]["image"], return_tensors="pt")
    out = model.generate(**inputs)
    predicted_captions.append(processor.decode(out[0], skip_special_tokens=True))
    reference_captions.append(filter_ds_flickr30k[i]["caption"])



In [27]:
predicted_dict = {i: [caption] for i, caption in enumerate(predicted_captions)}
reference_dict = {i: captions for i, captions in enumerate(reference_captions)}

cider_scorer = Cider()

cider_score, _ = cider_scorer.compute_score(reference_dict, predicted_dict)

print(f"CIDEr Score: {cider_score:.3f}")

CIDEr Score: 0.346


In [28]:
P, R, F1 = score(predicted_captions, reference_captions, lang="en", verbose=False)

print(f"BERTScore Precision: {P.mean().item():.3f}")
print(f"BERTScore Recall: {R.mean().item():.3f}")
print(f"BERTScore F1: {F1.mean().item():.3f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.928
BERTScore Recall: 0.893
BERTScore F1: 0.907


In [29]:
predicted_captions_tokenized = [
    word_tokenize(caption) for caption in predicted_captions
]
reference_captions_tokenized = [
    [word_tokenize(caption) for caption in ref] for ref in reference_captions
]

scores = [
    meteor_score(ref, gen)
    for ref, gen in zip(reference_captions_tokenized, predicted_captions_tokenized)
]
average_score = sum(scores) / len(scores)

print(f"METEOR Score: {average_score:.3f}")

METEOR Score: 0.192


In [30]:
ref_captions = [". ".join(ref) for ref in reference_captions]

rouge = Rouge()

scores = rouge.get_scores(predicted_captions, ref_captions, avg=True)

print("ROUGE-1 Scores:", scores["rouge-1"])
print("ROUGE-2 Scores:", scores["rouge-2"])
print("ROUGE-l Scores:", scores["rouge-l"])

ROUGE-1 Scores: {'r': 0.10647240172856871, 'p': 0.6366666666666667, 'f': 0.18059141901856915}
ROUGE-2 Scores: {'r': 0.02631578947368421, 'p': 0.16666666666666669, 'f': 0.045454544983471075}
ROUGE-l Scores: {'r': 0.10059004878739224, 'p': 0.5866666666666667, 'f': 0.17006510322909546}


In [31]:
# Модель №3
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)

In [32]:
predicted_captions = []
reference_captions = []

for i in range(5):
    inputs = processor(filter_ds_flickr30k[i]["image"], return_tensors="pt")
    out = model.generate(**inputs)
    predicted_captions.append(processor.decode(out[0], skip_special_tokens=True))
    reference_captions.append(filter_ds_flickr30k[i]["caption"])

In [33]:
predicted_dict = {i: [caption] for i, caption in enumerate(predicted_captions)}
reference_dict = {i: captions for i, captions in enumerate(reference_captions)}

cider_scorer = Cider()

cider_score, _ = cider_scorer.compute_score(reference_dict, predicted_dict)

print(f"CIDEr Score: {cider_score:.3f}")

CIDEr Score: 0.383


In [34]:
P, R, F1 = score(predicted_captions, reference_captions, lang="en", verbose=False)

print(f"BERTScore Precision: {P.mean().item():.3f}")
print(f"BERTScore Recall: {R.mean().item():.3f}")
print(f"BERTScore F1: {F1.mean().item():.3f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.904
BERTScore Recall: 0.915
BERTScore F1: 0.908


In [35]:
predicted_captions_tokenized = [
    word_tokenize(caption) for caption in predicted_captions
]
reference_captions_tokenized = [
    [word_tokenize(caption) for caption in ref] for ref in reference_captions
]

scores = [
    meteor_score(ref, gen)
    for ref, gen in zip(reference_captions_tokenized, predicted_captions_tokenized)
]
average_score = sum(scores) / len(scores)

print(f"METEOR Score: {average_score:.3f}")

METEOR Score: 0.350


In [36]:
ref_captions = [". ".join(ref) for ref in reference_captions]

rouge = Rouge()

scores = rouge.get_scores(predicted_captions, ref_captions, avg=True)

print("ROUGE-1 Scores:", scores["rouge-1"])
print("ROUGE-2 Scores:", scores["rouge-2"])
print("ROUGE-l Scores:", scores["rouge-l"])

ROUGE-1 Scores: {'r': 0.18589854168601797, 'p': 0.53, 'f': 0.27445342968105413}
ROUGE-2 Scores: {'r': 0.022641206675224644, 'p': 0.08818181818181818, 'f': 0.036006400594121316}
ROUGE-l Scores: {'r': 0.1800161887448415, 'p': 0.51, 'f': 0.26536252059014503}
