In [2]:
import os
import wandb
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset
from transformers import BlipProcessor, BlipForConditionalGeneration
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score
from pycocoevalcap.cider.cider import Cider
from collections import Counter

In [3]:
# Кастомный класс создания синтетического датасета
class CustomDataset(Dataset):
    def __init__(self, data_dir, data, transform=None):
        self.data_dir = data_dir
        self.data = data
        self.transform = transform
        self.images = [os.path.join(data_dir, img) for img in os.listdir(data_dir)]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image_path = self.images[idx]
        image, caption = loader(image_path, self.data)
        if self.transform:
            image = self.transform(image)
        return image, caption


def loader(path, data):
    image = Image.open(path)
    caption = data.loc[data["image"] == os.path.basename(path), "caption"].values[0]
    return image, caption

In [4]:
data_dir = "datasets/sin_dataset_img"

data = pd.read_csv(f"{data_dir}/captions.csv")

ds_sin = CustomDataset(f"{data_dir}/images", data)

In [5]:
# Функция расчета метрики CIDEr
def metric_cider(predicted_captions, reference_captions):
    predicted_dict = {i: [caption] for i, caption in enumerate(predicted_captions)}
    reference_dict = {i: captions for i, captions in enumerate(reference_captions)}

    cider_scorer = Cider()

    cider_score, _ = cider_scorer.compute_score(reference_dict, predicted_dict)

    return cider_score

In [7]:
# Функция расчета метрики METEOR
def metric_meteor(predicted_captions, reference_captions):
    predicted_captions_tokenized = [
        word_tokenize(caption) for caption in predicted_captions
    ]
    reference_captions_tokenized = [
        [word_tokenize(caption) for caption in ref] for ref in reference_captions
    ]

    scores = [
        meteor_score(ref, gen)
        for ref, gen in zip(reference_captions_tokenized, predicted_captions_tokenized)
    ]
    average_meteor = sum(scores) / len(scores)

    return average_meteor

In [8]:
# Функция расчета метрики SPICE
def metric_spice(predicted_captions, reference_captions):
    spice_scores = []

    for gen_caption, ref_captions in zip(predicted_captions, reference_captions):
        gen_tokens = word_tokenize(gen_caption.lower())

        ref_tokens = [word_tokenize(ref.lower()) for ref in ref_captions]

        gen_counter = Counter(gen_tokens)
        ref_counters = [Counter(ref) for ref in ref_tokens]

        precisions = []
        recalls = []
        for ref_counter in ref_counters:
            common = gen_counter & ref_counter
            precisions.append(sum(common.values()) / len(gen_tokens))
            recalls.append(sum(common.values()) / len(ref_counter))

        precision = sum(precisions) / len(ref_counters)
        recall = sum(recalls) / len(ref_counters)
        if precision + recall > 0:
            spice_score = (precision * recall) / (precision + recall)
        else:
            spice_score = 0.0

        spice_scores.append(spice_score)

    average_spice_score = sum(spice_scores) / len(spice_scores)

    return average_spice_score

In [9]:
# Список моледей от простой к сложной
models = {
    "blip_base": "Salesforce/blip-image-captioning-base",
    "blip_large": "Salesforce/blip-image-captioning-large",
    "pic2story": "abhijit2111/Pic2Story",
}

In [10]:
# Цикл предсказания каждой модели с логированием метрик в wandb
for name, model_name in models.items():
    wandb.init(project="child_diary", group=name, job_type="base")

    # Загрузка модели генерации описаний изображений
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)

    predicted_captions = []
    reference_captions = []

    # Выполнение предсказания модели
    for image, captions in ds_sin:
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs)
        predicted_captions.append(processor.decode(out[0], skip_special_tokens=True))
        reference_captions.append([captions])

    wandb.log(
        {
            "CIDEr": metric_cider(predicted_captions, reference_captions),
            "METEOR": metric_meteor(predicted_captions, reference_captions),
            "SPICE": metric_spice(predicted_captions, reference_captions),
        }
    )

    wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mn-hilkovich[0m. Use [1m`wandb login --relogin`[0m to force relogin


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
BERTScore_F1,▁
CIDEr,▁
METEOR,▁
SPICE,▁

0,1
BERTScore_F1,0.91119
CIDEr,0.40045
METEOR,0.24829
SPICE,0.21557


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0111142495888891, max=1.0))…

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
BERTScore_F1,▁
CIDEr,▁
METEOR,▁
SPICE,▁

0,1
BERTScore_F1,0.90736
CIDEr,0.654
METEOR,0.30281
SPICE,0.24125


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011114127522222992, max=1.0…

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
BERTScore_F1,▁
CIDEr,▁
METEOR,▁
SPICE,▁

0,1
BERTScore_F1,0.90835
CIDEr,1.02592
METEOR,0.38718
SPICE,0.26395
