In [43]:
from PIL import Image
from rouge import Rouge
from bert_score import score
from torch.utils.data import Dataset
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score
from transformers import BlipProcessor, BlipForConditionalGeneration
from pycocoevalcap.cider.cider import Cider

In [44]:
class MultiCaptionImageDataset(Dataset):
    def __init__(self, image_dir, caption_file, transform=None):
        self.image_dir = image_dir
        self.image_ids, self.captions = self.load_captions(caption_file)
        self.transform = transform

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]
        image = Image.open(f"{self.image_dir}/{image_id}")

        if self.transform:
            image = self.transform(image)

        captions = self.captions[idx]
        processed_captions = [self.preprocess_caption(caption) for caption in captions]

        return image, processed_captions

    def load_captions(self, caption_file):
        image_ids = []
        captions = []
        with open(caption_file, "r") as file:
            for line in file:
                parts = line.split(";")
                image_id = parts[0]
                caption = parts[1]
                if image_id not in image_ids:
                    image_ids.append(image_id)
                    captions.append([])
                captions[image_ids.index(image_id)].append(caption)
        return image_ids, captions

    def preprocess_caption(self, caption):
        tokens = word_tokenize(caption.lower())
        processed_caption = " ".join(tokens)
        return processed_caption

In [45]:
sin_dir = "datasets/sin_dataset_img"

image_dir = f"{sin_dir}/images"
caption_file = f"{sin_dir}/captions.txt"

dataset = MultiCaptionImageDataset(image_dir, caption_file)

In [46]:
# Модель №1
processor = BlipProcessor.from_pretrained("abhijit2111/Pic2Story")
model = BlipForConditionalGeneration.from_pretrained("abhijit2111/Pic2Story")

In [37]:
# Модель №2
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)

In [38]:
# Модель №3
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)

In [47]:
generated_captions = []
reference_captions = []

for image, captions in dataset:
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs, max_length=200)
    generated_captions.append(processor.decode(out[0], skip_special_tokens=True))
    reference_captions.append(captions)

In [49]:
references = {i: caption for i, caption in enumerate(reference_captions)}
candidates = {i: [caption] for i, caption in enumerate(generated_captions)}

cider_scorer = Cider()

cider_score, _ = cider_scorer.compute_score(references, candidates)

print(f"CIDEr Score: {cider_score:.3f}")

CIDEr Score: 8.200


In [50]:
reference_captions = [". ".join(ref) for ref in reference_captions]

rouge = Rouge()

scores = rouge.get_scores(generated_captions, reference_captions, avg=True)

print("ROUGE-1 Scores:", scores["rouge-1"])
print("ROUGE-2 Scores:", scores["rouge-2"])
print("ROUGE-l Scores:", scores["rouge-l"])

ROUGE-1 Scores: {'r': 1.0, 'p': 0.8888888888888888, 'f': 0.9333333284000002}
ROUGE-2 Scores: {'r': 0.9722222222222222, 'p': 0.8703703703703703, 'f': 0.9111111061777778}
ROUGE-l Scores: {'r': 1.0, 'p': 0.8888888888888888, 'f': 0.9333333284000002}


In [51]:
generated_captions_tokenized = [
    word_tokenize(caption) for caption in generated_captions
]
reference_captions_tokenized = [
    [word_tokenize(caption) for caption in ref] for ref in reference_captions
]

scores = [
    meteor_score(ref, gen)
    for ref, gen in zip(reference_captions_tokenized, generated_captions_tokenized)
]
average_score = sum(scores) / len(scores)

print(f"METEOR Score: {average_score:.3f}")

METEOR Score: 0.167


In [52]:
P, R, F1 = score(generated_captions, reference_captions, lang="en", verbose=False)

print(f"BERTScore Precision: {P.mean().item():.3f}")
print(f"BERTScore Recall: {R.mean().item():.3f}")
print(f"BERTScore F1: {F1.mean().item():.3f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.981
BERTScore Recall: 0.996
BERTScore F1: 0.988
